From 5e548acd2f0134699fc54ad0a31978fe49b27b86 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Mon, 16 May 2022 15:37:44 -0700 Subject: [PATCH 01/11] Add parsimony script --- scripts/parsimony.py | 82 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 scripts/parsimony.py diff --git a/scripts/parsimony.py b/scripts/parsimony.py new file mode 100644 index 00000000..cc6a0c16 --- /dev/null +++ b/scripts/parsimony.py @@ -0,0 +1,82 @@ +import gctree +import gctree.phylip_parse as pp +import click +from collections import Counter +import ete3 + +@click.group(context_settings={"help_option_names": ["-h", "--help"]}) +def _cli(): + """ + A collection of tools for calculating parsimony scores of newick trees, and + using them to create a history DAG + """ + pass + +def load_fasta(fastapath): + fasta_map = {} + with open(fastapath, 'r') as fh: + seqid = None + for line in fh: + if line[0] == '>': + seqid = line[1:].strip() + if seqid in fasta_map: + raise ValueError("Duplicate records with matching identifier in fasta file") + else: + fasta_map[seqid] = "" + else: + if seqid is None and line.strip(): + raise ValueError("First non-blank line in fasta does not contain identifier") + else: + fasta_map[seqid] += line.strip() + return fasta_map + +def build_tree(newickstring, fasta_map, newickformat=1, reference_id=None, reference_sequence=None, ignore_internal_sequences=False): + tree = ete3.Tree(newickstring, format=newickformat) + # all fasta entries should be same length + seq_len = len(next(iter(fasta_map.values()))) + ambig_seq = 'N' * seq_len + for node in tree.traverse(): + if node.is_root() and reference_sequence is not None: + node.add_feature("sequence", reference_sequence) + elif node.is_root() and reference_id is not None: + node.add_feature("sequence", fasta_map[reference_id]) + elif (not node.is_leaf()) and ignore_internal_sequences: + node.add_feature("sequence", ambig_seq) + elif node.name in fasta_map: + node.add_feature("sequence", fasta_map[node.name]) + else: + node.add_feature("sequence", ambig_seq) + return tree + +def build_tree_from_files(newickfile, fastafile, **kwargs): + with open(newickfile, 'r') as fh: + newick = fh.read() + return build_tree(newick, load_fasta(fastafile), **kwargs) + +def parsimony_score(tree): + return sum(gctree.utils.hamming_distance(node.up.sequence, node.sequence) + for node in tree.iter_descendants()) + +def parsimony_score_from_files(*args, **kwargs): + tree = build_tree_from_files(*args, **kwargs) + tree = pp.disambiguate(tree) + return parsimony_score(tree) + +@_cli.command("parsimony_scores") +@click.argument("treefiles", nargs=-1) +@click.option("-f", "--fasta-file", required=True, help="Filename of a fasta file containing sequences appearing on nodes of newick tree") +@click.option("-r", "--root-id", default=None, help="The fasta identifier of the fixed root of provided trees. May be omitted if there is no fixed root sequence.") +@click.option("-F", "--newick-format", default=1, help="Newick format of the provided newick file. See http://etetoolkit.org/docs/latest/reference/reference_tree.html#ete3.TreeNode") +@click.option("-i", "--include-internal-sequences", is_flag=True, help=" non-leaf node labels, and associated sequences in the fasta file.") +def _cli_parsimony_score_from_files(treefiles, fasta_file, root_id, newick_format, include_internal_sequences): + """Print the parsimony score of one or more newick files""" + fasta_map = load_fasta(fasta_file) + parsimony_counter = Counter() + for treepath in treefiles: + with open(treepath, 'r') as fh: + tree = build_tree(fh.read(), fasta_map, newickformat=newick_format, reference_id=root_id, ignore_internal_sequences=(not include_internal_sequences)) + print(treepath) + print(parsimony_score(pp.disambiguate(tree))) + +if __name__ == "__main__": + _cli() From 777fba647d57b53aac8039f834e8968ea025c7be Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Mon, 16 May 2022 15:38:36 -0700 Subject: [PATCH 02/11] format --- scripts/parsimony.py | 83 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 66 insertions(+), 17 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index cc6a0c16..0021f59c 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -4,6 +4,7 @@ from collections import Counter import ete3 + @click.group(context_settings={"help_option_names": ["-h", "--help"]}) def _cli(): """ @@ -12,34 +13,47 @@ def _cli(): """ pass + def load_fasta(fastapath): fasta_map = {} - with open(fastapath, 'r') as fh: + with open(fastapath, "r") as fh: seqid = None for line in fh: - if line[0] == '>': + if line[0] == ">": seqid = line[1:].strip() if seqid in fasta_map: - raise ValueError("Duplicate records with matching identifier in fasta file") + raise ValueError( + "Duplicate records with matching identifier in fasta file" + ) else: fasta_map[seqid] = "" else: if seqid is None and line.strip(): - raise ValueError("First non-blank line in fasta does not contain identifier") + raise ValueError( + "First non-blank line in fasta does not contain identifier" + ) else: fasta_map[seqid] += line.strip() return fasta_map -def build_tree(newickstring, fasta_map, newickformat=1, reference_id=None, reference_sequence=None, ignore_internal_sequences=False): + +def build_tree( + newickstring, + fasta_map, + newickformat=1, + reference_id=None, + reference_sequence=None, + ignore_internal_sequences=False, +): tree = ete3.Tree(newickstring, format=newickformat) # all fasta entries should be same length seq_len = len(next(iter(fasta_map.values()))) - ambig_seq = 'N' * seq_len + ambig_seq = "N" * seq_len for node in tree.traverse(): if node.is_root() and reference_sequence is not None: node.add_feature("sequence", reference_sequence) elif node.is_root() and reference_id is not None: - node.add_feature("sequence", fasta_map[reference_id]) + node.add_feature("sequence", fasta_map[reference_id]) elif (not node.is_leaf()) and ignore_internal_sequences: node.add_feature("sequence", ambig_seq) elif node.name in fasta_map: @@ -48,35 +62,70 @@ def build_tree(newickstring, fasta_map, newickformat=1, reference_id=None, refer node.add_feature("sequence", ambig_seq) return tree + def build_tree_from_files(newickfile, fastafile, **kwargs): - with open(newickfile, 'r') as fh: + with open(newickfile, "r") as fh: newick = fh.read() return build_tree(newick, load_fasta(fastafile), **kwargs) + def parsimony_score(tree): - return sum(gctree.utils.hamming_distance(node.up.sequence, node.sequence) - for node in tree.iter_descendants()) + return sum( + gctree.utils.hamming_distance(node.up.sequence, node.sequence) + for node in tree.iter_descendants() + ) + def parsimony_score_from_files(*args, **kwargs): tree = build_tree_from_files(*args, **kwargs) tree = pp.disambiguate(tree) return parsimony_score(tree) + @_cli.command("parsimony_scores") @click.argument("treefiles", nargs=-1) -@click.option("-f", "--fasta-file", required=True, help="Filename of a fasta file containing sequences appearing on nodes of newick tree") -@click.option("-r", "--root-id", default=None, help="The fasta identifier of the fixed root of provided trees. May be omitted if there is no fixed root sequence.") -@click.option("-F", "--newick-format", default=1, help="Newick format of the provided newick file. See http://etetoolkit.org/docs/latest/reference/reference_tree.html#ete3.TreeNode") -@click.option("-i", "--include-internal-sequences", is_flag=True, help=" non-leaf node labels, and associated sequences in the fasta file.") -def _cli_parsimony_score_from_files(treefiles, fasta_file, root_id, newick_format, include_internal_sequences): +@click.option( + "-f", + "--fasta-file", + required=True, + help="Filename of a fasta file containing sequences appearing on nodes of newick tree", +) +@click.option( + "-r", + "--root-id", + default=None, + help="The fasta identifier of the fixed root of provided trees. May be omitted if there is no fixed root sequence.", +) +@click.option( + "-F", + "--newick-format", + default=1, + help="Newick format of the provided newick file. See http://etetoolkit.org/docs/latest/reference/reference_tree.html#ete3.TreeNode", +) +@click.option( + "-i", + "--include-internal-sequences", + is_flag=True, + help=" non-leaf node labels, and associated sequences in the fasta file.", +) +def _cli_parsimony_score_from_files( + treefiles, fasta_file, root_id, newick_format, include_internal_sequences +): """Print the parsimony score of one or more newick files""" fasta_map = load_fasta(fasta_file) parsimony_counter = Counter() for treepath in treefiles: - with open(treepath, 'r') as fh: - tree = build_tree(fh.read(), fasta_map, newickformat=newick_format, reference_id=root_id, ignore_internal_sequences=(not include_internal_sequences)) + with open(treepath, "r") as fh: + tree = build_tree( + fh.read(), + fasta_map, + newickformat=newick_format, + reference_id=root_id, + ignore_internal_sequences=(not include_internal_sequences), + ) print(treepath) print(parsimony_score(pp.disambiguate(tree))) + if __name__ == "__main__": _cli() From 23c4eca90e91258931f64d17adccb864706bcd94 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Tue, 17 May 2022 10:11:38 -0700 Subject: [PATCH 03/11] docstrings and option to write to dag --- scripts/parsimony.py | 113 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 19 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index 0021f59c..356cbf3e 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -3,6 +3,8 @@ import click from collections import Counter import ete3 +import historydag as hdag +import pickle @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @@ -15,6 +17,7 @@ def _cli(): def load_fasta(fastapath): + """Load a fasta file as a dictionary, with sequence ids as keys and sequences as values.""" fasta_map = {} with open(fastapath, "r") as fh: seqid = None @@ -45,6 +48,18 @@ def build_tree( reference_sequence=None, ignore_internal_sequences=False, ): + """Load an ete tree from a newick string, and add 'sequence' attributes from fasta. + + If internal node sequences aren't specified in the newick string and fasta data, + internal node sequences will be fully ambiguous (contain repeated N's). + + Arguments: + newickstring: a newick string + fasta_map: a dictionary with sequence id keys matching node names in `newickstring`, and sequence values. + newickformat: the ete format identifier for the passed newick string. See ete docs + reference_id: key in `fasta_map` corresponding to the root sequence of the tree, if root sequence is fixed. + reference_sequence: fixed root sequence of the tree, if root sequence is fixed + ignore_internal_sequences: if True, sequences at non-leaf nodes specified in fasta_map and newickstring will be ignored, and all internal sequences will be fully ambiguous.""" tree = ete3.Tree(newickstring, format=newickformat) # all fasta entries should be same length seq_len = len(next(iter(fasta_map.values()))) @@ -63,23 +78,48 @@ def build_tree( return tree -def build_tree_from_files(newickfile, fastafile, **kwargs): - with open(newickfile, "r") as fh: - newick = fh.read() - return build_tree(newick, load_fasta(fastafile), **kwargs) +def build_trees_from_files(newickfiles, fastafile, **kwargs): + """Same as `build_tree`, but takes a list of filenames containing newick strings, and a filename for a fasta file, and returns a generator on trees""" + fasta_map = load_fasta(fastafile) + trees = [] + for newick in newickfiles: + with open(newick, "r") as fh: + newick = fh.read() + yield build_tree(newick, fasta_map, **kwargs) def parsimony_score(tree): + """returns the parsimony score of a (disambiguated) ete tree. + Tree must have 'sequence' attributes on all nodes.""" return sum( gctree.utils.hamming_distance(node.up.sequence, node.sequence) for node in tree.iter_descendants() ) -def parsimony_score_from_files(*args, **kwargs): - tree = build_tree_from_files(*args, **kwargs) - tree = pp.disambiguate(tree) - return parsimony_score(tree) +def parsimony_scores_from_files(*args, **kwargs): + """returns the parsimony scores of trees specified by newick files and a fasta file. + Arguments match `build_trees_from_files`.""" + trees = build_trees_from_files(*args, **kwargs) + trees = [pp.disambiguate(tree) for tree in trees] + return [parsimony_score(tree) for tree in trees] + + +def build_dag_from_trees(trees): + """Build a history DAG from trees containing a `sequence` attribute on all nodes. + unifurcations in the provided trees will be deleted.""" + trees = [tree.copy() for tree in trees] + for tree in trees: + for node in tree.iter_descendants(): + if len(node.children) == 1: + node.delete(prevent_nondicotomic=False) + if len(tree.children) == 1: + newchild = tree.add_child() + newchild.add_feature("sequence", tree.sequence) + return hdag.history_dag_from_etes( + trees, + ["sequence"], + ) @_cli.command("parsimony_scores") @@ -106,25 +146,60 @@ def parsimony_score_from_files(*args, **kwargs): "-i", "--include-internal-sequences", is_flag=True, - help=" non-leaf node labels, and associated sequences in the fasta file.", + help="include non-leaf node labels, and associated sequences in the fasta file.", +) +@click.option( + "-d", + "--save-to-dag", + default=None, + help="Combine loaded and disambiguated trees into a history DAG, and save pickled DAG to provided path.", ) def _cli_parsimony_score_from_files( - treefiles, fasta_file, root_id, newick_format, include_internal_sequences + treefiles, + fasta_file, + root_id, + newick_format, + include_internal_sequences, + save_to_dag, ): """Print the parsimony score of one or more newick files""" fasta_map = load_fasta(fasta_file) parsimony_counter = Counter() - for treepath in treefiles: - with open(treepath, "r") as fh: - tree = build_tree( - fh.read(), - fasta_map, - newickformat=newick_format, - reference_id=root_id, - ignore_internal_sequences=(not include_internal_sequences), - ) + trees = [] + for tree, treepath in zip( + build_trees_from_files( + treefiles, + fasta_file, + reference_id=root_id, + ignore_internal_sequences=(not include_internal_sequences), + ), + treefiles, + ): print(treepath) print(parsimony_score(pp.disambiguate(tree))) + if save_to_dag is not None: + trees.append(tree) + if save_to_dag is not None: + with open(save_to_dag, "wb") as fh: + fh.write(pickle.dumps(build_dag_from_trees(trees))) + + +def summarize_dag(dag): + """print summary information about the provided history DAG.""" + print("DAG contains") + print("trees: ", dag.count_trees()) + print("nodes: ", len(list(dag.preorder()))) + print("edges: ", sum(len(list(node.children())) for node in dag.preorder())) + print("parsimony scores: ", dag.weight_count()) + + +@_cli.command("summarize-dag") +@click.argument("dagpath") +def _cli_summarize_dag(dagpath): + """print summary information about the provided history DAG.""" + with open(dagpath, "rb") as fh: + dag = pickle.load(fh) + summarize_dag(dag) if __name__ == "__main__": From 5b6d8d870b14fc57205665e3e9a945de67f67d45 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Wed, 25 May 2022 14:50:06 -0700 Subject: [PATCH 04/11] possibly faster disambiguation algorithm --- scripts/parsimony.py | 130 ++++++++++++++++++++++++++++++++++++- tests/test_disambiguate.py | 2 +- 2 files changed, 129 insertions(+), 3 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index 356cbf3e..b8650114 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -1,10 +1,136 @@ import gctree +import random import gctree.phylip_parse as pp import click from collections import Counter import ete3 import historydag as hdag import pickle +import numpy as np + + +# bases = "AGCT" +# base_vecs = {base: np.array([float('inf') if base != rbase else 0 for rbase in bases]) for base in bases} +# base_vecs.update({'N': np.array([0, 0, 0, 0])}) +# delta_vecs = [np.array([int(i != j) for j in range(4)]) for i in range(4)] +# transition_weights = [ +# [0 if base == rbase else 1 for rbase in bases] +# for base in bases +# ] + +delta_vectors = { + code: np.array( + [ + 0 if base in gctree.utils.ambiguous_dna_values[code] else 1 + for base in gctree.utils.bases + ] + ) + for code in gctree.utils.ambiguous_dna_values +} +code_vectors = { + code: np.array( + [ + 0 if base in gctree.utils.ambiguous_dna_values[code] else float("inf") + for base in gctree.utils.bases + ] + ) + for code in gctree.utils.ambiguous_dna_values +} +cost_adjust = { + base: np.array([int(not i == j) for j in range(5)]) + for i, base in enumerate(gctree.utils.bases) +} + +# def sankoff(tree, random_state=None): +# """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on +# subtrees of consecutive ambiguity codes.""" +# if random_state is None: +# random.seed(tree.write(format=1)) +# else: +# random.setstate(random_state) +# seq_len = len(tree.sequence) + +# # First pass of Sankoff: compute cost vectors +# for node in tree.traverse(strategy="postorder"): +# node.add_feature("cv", +# np.array([delta_vectors[base].copy() for base in node.sequence]) + +# sum([child.cv for child in node.children], start=np.array([[0] * 5] * seq_len))) +# print(node.sequence, node.cv) + + +# print("starting preorder pass") +# for node in tree.traverse(strategy="preorder"): +# # disallow bases that aren't compatible with existing bases +# node.cv = node.cv + [code_vectors[base] for base in node.sequence] +# # choose sequence +# newseq = [] +# for cv_idx in range(len(node.cv)): +# cv = node.cv[cv_idx] +# min_cost = min(cv) +# min_indices = [idx for idx, cost in enumerate(cv) if cost == min_cost] +# newbase = gctree.utils.bases[random.choice(min_indices)] +# newseq.append(newbase) + +# cost_adjust = np.array([delta_vectors[base] for base in newseq]) +# for child in node.children: +# child.cv += cost_adjust +# node.sequence = ''.join(newseq) +# return tree + + +def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False): + """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on + subtrees of consecutive ambiguity codes.""" + seq_len = len(tree.sequence) + if random_state is None: + random.seed(tree.write(format=1)) + else: + random.setstate(random_state) + # First pass of Sankoff: compute cost vectors + for node in tree.traverse(strategy="postorder"): + node.add_feature( + "cv", np.array([code_vectors[base].copy() for base in node.sequence]) + ) + if not node.is_leaf(): + for idx in range(seq_len): + for i in range(5): + for child in node.children: + node.cv[idx][i] += min( + [ + sum(v) + for v in zip( + child.cv[idx], cost_adjust[gctree.utils.bases[i]] + ) + ] + ) + + # Second pass of Sankoff: choose bases + preorder = list(tree.traverse(strategy="preorder")) + for node in preorder: + new_seq = [] + for idx in range(seq_len): + min_cost = min(node.cv[idx]) + base_index = random.choice( + [i for i, val in enumerate(node.cv[idx]) if val == min_cost] + ) + new_base = gctree.utils.bases[base_index] + new_seq.append(new_base) + # Adjust child cost vectors + for child in node.children: + child.cv += np.array([delta_vectors[base] for base in new_seq]) + node.sequence = "".join(new_seq) + + if remove_cvs: + for node in tree.traverse(): + try: + node.del_feature("cv") + except (AttributeError, KeyError): + pass + if adj_dist: + tree.dist = 0 + for node in tree.iter_descendants(): + node.dist = gctree.utils.hamming_distance(node.up.sequence, node.sequence) + return tree @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @@ -101,7 +227,7 @@ def parsimony_scores_from_files(*args, **kwargs): """returns the parsimony scores of trees specified by newick files and a fasta file. Arguments match `build_trees_from_files`.""" trees = build_trees_from_files(*args, **kwargs) - trees = [pp.disambiguate(tree) for tree in trees] + trees = [disambiguate(tree) for tree in trees] return [parsimony_score(tree) for tree in trees] @@ -176,7 +302,7 @@ def _cli_parsimony_score_from_files( treefiles, ): print(treepath) - print(parsimony_score(pp.disambiguate(tree))) + print(parsimony_score(disambiguate(tree))) if save_to_dag is not None: trees.append(tree) if save_to_dag is not None: diff --git a/tests/test_disambiguate.py b/tests/test_disambiguate.py index 54783353..2690e16f 100644 --- a/tests/test_disambiguate.py +++ b/tests/test_disambiguate.py @@ -31,7 +31,7 @@ def treeprint(tree: ete3.TreeNode): def sample(treestring, n=20): - ogt = ete3.TreeNode(newick=newick_tree2, format=1) + ogt = ete3.TreeNode(newick=treestring, format=1) print(ogt.sequence) newickset = set() for i in range(n): From cf6c3ee2faf8cb5ed52892d0b565754c2d54b133 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Thu, 26 May 2022 13:38:04 -0700 Subject: [PATCH 05/11] roll back new disambiguation and consider - char --- scripts/parsimony.py | 131 ++----------------------------------------- 1 file changed, 5 insertions(+), 126 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index b8650114..fda01a9c 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -9,129 +9,6 @@ import numpy as np -# bases = "AGCT" -# base_vecs = {base: np.array([float('inf') if base != rbase else 0 for rbase in bases]) for base in bases} -# base_vecs.update({'N': np.array([0, 0, 0, 0])}) -# delta_vecs = [np.array([int(i != j) for j in range(4)]) for i in range(4)] -# transition_weights = [ -# [0 if base == rbase else 1 for rbase in bases] -# for base in bases -# ] - -delta_vectors = { - code: np.array( - [ - 0 if base in gctree.utils.ambiguous_dna_values[code] else 1 - for base in gctree.utils.bases - ] - ) - for code in gctree.utils.ambiguous_dna_values -} -code_vectors = { - code: np.array( - [ - 0 if base in gctree.utils.ambiguous_dna_values[code] else float("inf") - for base in gctree.utils.bases - ] - ) - for code in gctree.utils.ambiguous_dna_values -} -cost_adjust = { - base: np.array([int(not i == j) for j in range(5)]) - for i, base in enumerate(gctree.utils.bases) -} - -# def sankoff(tree, random_state=None): -# """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on -# subtrees of consecutive ambiguity codes.""" -# if random_state is None: -# random.seed(tree.write(format=1)) -# else: -# random.setstate(random_state) -# seq_len = len(tree.sequence) - -# # First pass of Sankoff: compute cost vectors -# for node in tree.traverse(strategy="postorder"): -# node.add_feature("cv", -# np.array([delta_vectors[base].copy() for base in node.sequence]) + -# sum([child.cv for child in node.children], start=np.array([[0] * 5] * seq_len))) -# print(node.sequence, node.cv) - - -# print("starting preorder pass") -# for node in tree.traverse(strategy="preorder"): -# # disallow bases that aren't compatible with existing bases -# node.cv = node.cv + [code_vectors[base] for base in node.sequence] -# # choose sequence -# newseq = [] -# for cv_idx in range(len(node.cv)): -# cv = node.cv[cv_idx] -# min_cost = min(cv) -# min_indices = [idx for idx, cost in enumerate(cv) if cost == min_cost] -# newbase = gctree.utils.bases[random.choice(min_indices)] -# newseq.append(newbase) - -# cost_adjust = np.array([delta_vectors[base] for base in newseq]) -# for child in node.children: -# child.cv += cost_adjust -# node.sequence = ''.join(newseq) -# return tree - - -def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False): - """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on - subtrees of consecutive ambiguity codes.""" - seq_len = len(tree.sequence) - if random_state is None: - random.seed(tree.write(format=1)) - else: - random.setstate(random_state) - # First pass of Sankoff: compute cost vectors - for node in tree.traverse(strategy="postorder"): - node.add_feature( - "cv", np.array([code_vectors[base].copy() for base in node.sequence]) - ) - if not node.is_leaf(): - for idx in range(seq_len): - for i in range(5): - for child in node.children: - node.cv[idx][i] += min( - [ - sum(v) - for v in zip( - child.cv[idx], cost_adjust[gctree.utils.bases[i]] - ) - ] - ) - - # Second pass of Sankoff: choose bases - preorder = list(tree.traverse(strategy="preorder")) - for node in preorder: - new_seq = [] - for idx in range(seq_len): - min_cost = min(node.cv[idx]) - base_index = random.choice( - [i for i, val in enumerate(node.cv[idx]) if val == min_cost] - ) - new_base = gctree.utils.bases[base_index] - new_seq.append(new_base) - # Adjust child cost vectors - for child in node.children: - child.cv += np.array([delta_vectors[base] for base in new_seq]) - node.sequence = "".join(new_seq) - - if remove_cvs: - for node in tree.traverse(): - try: - node.del_feature("cv") - except (AttributeError, KeyError): - pass - if adj_dist: - tree.dist = 0 - for node in tree.iter_descendants(): - node.dist = gctree.utils.hamming_distance(node.up.sequence, node.sequence) - return tree - @click.group(context_settings={"help_option_names": ["-h", "--help"]}) def _cli(): @@ -189,7 +66,9 @@ def build_tree( tree = ete3.Tree(newickstring, format=newickformat) # all fasta entries should be same length seq_len = len(next(iter(fasta_map.values()))) - ambig_seq = "N" * seq_len + # find characters for which there's no diversity: + ambig_seq = ''.join(list(charset := set(chars))[0] if len(charset) == 1 else "?" + for chars in zip(*fasta_map.values())) for node in tree.traverse(): if node.is_root() and reference_sequence is not None: node.add_feature("sequence", reference_sequence) @@ -227,7 +106,7 @@ def parsimony_scores_from_files(*args, **kwargs): """returns the parsimony scores of trees specified by newick files and a fasta file. Arguments match `build_trees_from_files`.""" trees = build_trees_from_files(*args, **kwargs) - trees = [disambiguate(tree) for tree in trees] + trees = [pp.disambiguate(tree) for tree in trees] return [parsimony_score(tree) for tree in trees] @@ -302,7 +181,7 @@ def _cli_parsimony_score_from_files( treefiles, ): print(treepath) - print(parsimony_score(disambiguate(tree))) + print(parsimony_score(pp.disambiguate(tree))) if save_to_dag is not None: trees.append(tree) if save_to_dag is not None: From bb04ae4ca7d8d51fd2ba3f2758912f76aaaacfe0 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Thu, 26 May 2022 17:09:43 -0700 Subject: [PATCH 06/11] definitely faster parsimony computation --- scripts/parsimony.py | 153 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 128 insertions(+), 25 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index fda01a9c..b7a4a25f 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -8,6 +8,28 @@ import pickle import numpy as np +delta_vectors = { + code: np.array( + [ + 0 if base in gctree.utils.ambiguous_dna_values[code] else 1 + for base in gctree.utils.bases + ] + ) + for code in gctree.utils.ambiguous_dna_values +} +code_vectors = { + code: np.array( + [ + 0 if base in gctree.utils.ambiguous_dna_values[code] else float("inf") + for base in gctree.utils.bases + ] + ) + for code in gctree.utils.ambiguous_dna_values +} +cost_adjust = { + base: np.array([int(not i == j) for j in range(5)]) + for i, base in enumerate(gctree.utils.bases) +} @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @@ -19,6 +41,76 @@ def _cli(): pass +def sankoff_upward(tree): + seq_len = len(tree.sequence) + adj_arr = np.array( + [ + [ + [0, 1, 1, 1, 1], + [1, 0, 1, 1, 1], + [1, 1, 0, 1, 1], + [1, 1, 1, 0, 1], + [1, 1, 1, 1, 0], + ] + ] + * seq_len + ) + # First pass of Sankoff: compute cost vectors + for node in tree.traverse(strategy="postorder"): + node.add_feature( + "cv", np.array([code_vectors[base].copy() for base in node.sequence]) + ) + if not node.is_leaf(): + child_costs = [] + for child in node.children: + stacked_child_cv = np.stack((child.cv,) * 5, axis=1) + total_cost = adj_arr + stacked_child_cv + child_costs.append(np.min(total_cost, axis=2)) + child_cost = np.sum(child_costs, axis=0) + node.cv = ( + np.array([code_vectors[base] for base in node.sequence]) + child_cost + ) + return np.sum(np.min(tree.cv, axis=1)) + + +def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False): + """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on + subtrees of consecutive ambiguity codes.""" + + seq_len = len(tree.sequence) + sankoff_upward(tree) + # Second pass of Sankoff: choose bases + preorder = list(tree.traverse(strategy="preorder")) + for node in preorder: + new_seq = [] + for idx in range(seq_len): + min_cost = min(node.cv[idx]) + base_index = random.choice( + [i for i, val in enumerate(node.cv[idx]) if val == min_cost] + ) + new_base = gctree.utils.bases[base_index] + new_seq.append(new_base) + # Adjust child cost vectors + for child in node.children: + for idx, base in enumerate(new_seq): + dvec = delta_vectors[base] + for subidx in range(5): + child.cv[idx][subidx] += dvec[subidx] + node.sequence = "".join(new_seq) + + if remove_cvs: + for node in tree.traverse(): + try: + node.del_feature("cv") + except (AttributeError, KeyError): + pass + if adj_dist: + tree.dist = 0 + for node in tree.iter_descendants(): + node.dist = gctree.utils.hamming_distance(node.up.sequence, node.sequence) + return tree + + def load_fasta(fastapath): """Load a fasta file as a dictionary, with sequence ids as keys and sequences as values.""" fasta_map = {} @@ -39,7 +131,7 @@ def load_fasta(fastapath): "First non-blank line in fasta does not contain identifier" ) else: - fasta_map[seqid] += line.strip() + fasta_map[seqid] += line.strip().upper() return fasta_map @@ -66,9 +158,7 @@ def build_tree( tree = ete3.Tree(newickstring, format=newickformat) # all fasta entries should be same length seq_len = len(next(iter(fasta_map.values()))) - # find characters for which there's no diversity: - ambig_seq = ''.join(list(charset := set(chars))[0] if len(charset) == 1 else "?" - for chars in zip(*fasta_map.values())) + ambig_seq = "?" * seq_len for node in tree.traverse(): if node.is_root() and reference_sequence is not None: node.add_feature("sequence", reference_sequence) @@ -102,12 +192,34 @@ def parsimony_score(tree): ) -def parsimony_scores_from_files(*args, **kwargs): +def parsimony_scores_from_topologies(newicks, fasta_map, **kwargs): + """returns a generator on parsimony scores of trees specified by newick strings and fasta. + additional keyword arguments are passed to `build_tree`.""" + # eliminate characters for which there's no diversity: + informative_sites = [ + idx for idx, chars in enumerate(zip(*fasta_map.values())) if len(set(chars)) > 1 + ] + newfasta = { + key: "".join(oldseq[idx] for idx in informative_sites) + for key, oldseq in fasta_map.items() + } + yield from ( + sankoff_upward(build_tree(newick, newfasta, **kwargs)) for newick in newicks + ) + + +def parsimony_scores_from_files(treefiles, fastafile, **kwargs): """returns the parsimony scores of trees specified by newick files and a fasta file. Arguments match `build_trees_from_files`.""" - trees = build_trees_from_files(*args, **kwargs) - trees = [pp.disambiguate(tree) for tree in trees] - return [parsimony_score(tree) for tree in trees] + def load_newick(npath): + with open(npath, "r") as fh: + return fh.read() + + return parsimony_scores_from_topologies( + (load_newick(path) for path in treefiles), + load_fasta(fastafile), + **kwargs + ) def build_dag_from_trees(trees): @@ -168,25 +280,16 @@ def _cli_parsimony_score_from_files( save_to_dag, ): """Print the parsimony score of one or more newick files""" - fasta_map = load_fasta(fasta_file) - parsimony_counter = Counter() - trees = [] - for tree, treepath in zip( - build_trees_from_files( - treefiles, - fasta_file, - reference_id=root_id, - ignore_internal_sequences=(not include_internal_sequences), - ), + pscore_gen = parsimony_scores_from_files( treefiles, - ): + fasta_file, + reference_id=root_id, + ignore_internal_sequences=(not include_internal_sequences), + ) + + for score, treepath in zip(pscore_gen, treefiles): print(treepath) - print(parsimony_score(pp.disambiguate(tree))) - if save_to_dag is not None: - trees.append(tree) - if save_to_dag is not None: - with open(save_to_dag, "wb") as fh: - fh.write(pickle.dumps(build_dag_from_trees(trees))) + print(score) def summarize_dag(dag): From 6d9fbe316a3acd29c7815454646b117af7094e40 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Thu, 2 Jun 2022 09:58:45 -0700 Subject: [PATCH 07/11] add gap_as_char option to sankoff_upward --- scripts/parsimony.py | 90 ++++++++++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 28 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index b7a4a25f..d8de0ca6 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -8,6 +8,14 @@ import pickle import numpy as np +_yey = [ + [0, 1, 1, 1, 1], + [1, 0, 1, 1, 1], + [1, 1, 0, 1, 1], + [1, 1, 1, 0, 1], + [1, 1, 1, 1, 0], +] + delta_vectors = { code: np.array( [ @@ -41,24 +49,35 @@ def _cli(): pass -def sankoff_upward(tree): +def sankoff_upward(tree, gap_as_char=False): + """Compute Sankoff cost vectors at nodes in a postorder traversal, + and return best possible parsimony score of the tree. + + Args: + gap_as_char: if True, the gap character ``-`` will be treated as a fifth character. Otherwise, + it will be treated the same as an ``N``.""" + if gap_as_char: + + def translate_base(char): + return char + + else: + + def translate_base(char): + if char == "-": + return "N" + else: + return char + seq_len = len(tree.sequence) - adj_arr = np.array( - [ - [ - [0, 1, 1, 1, 1], - [1, 0, 1, 1, 1], - [1, 1, 0, 1, 1], - [1, 1, 1, 0, 1], - [1, 1, 1, 1, 0], - ] - ] - * seq_len - ) + adj_arr = np.array([_yey] * seq_len) # First pass of Sankoff: compute cost vectors for node in tree.traverse(strategy="postorder"): node.add_feature( - "cv", np.array([code_vectors[base].copy() for base in node.sequence]) + "cv", + np.array( + [code_vectors[translate_base(base)].copy() for base in node.sequence] + ), ) if not node.is_leaf(): child_costs = [] @@ -68,7 +87,8 @@ def sankoff_upward(tree): child_costs.append(np.min(total_cost, axis=2)) child_cost = np.sum(child_costs, axis=0) node.cv = ( - np.array([code_vectors[base] for base in node.sequence]) + child_cost + np.array([code_vectors[translate_base(base)] for base in node.sequence]) + + child_cost ) return np.sum(np.min(tree.cv, axis=1)) @@ -76,26 +96,31 @@ def sankoff_upward(tree): def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False): """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on subtrees of consecutive ambiguity codes.""" + if random_state is None: + random.seed(tree.write(format=1)) + else: + random.setstate(random_state) seq_len = len(tree.sequence) + adj_arr = np.array([_yey] * seq_len) sankoff_upward(tree) # Second pass of Sankoff: choose bases preorder = list(tree.traverse(strategy="preorder")) for node in preorder: new_seq = [] + base_indices = [] for idx in range(seq_len): min_cost = min(node.cv[idx]) base_index = random.choice( [i for i, val in enumerate(node.cv[idx]) if val == min_cost] ) - new_base = gctree.utils.bases[base_index] - new_seq.append(new_base) + base_indices.append(base_index) + # Adjust child cost vectors + adj_vec = adj_arr[np.arange(seq_len), base_indices] for child in node.children: - for idx, base in enumerate(new_seq): - dvec = delta_vectors[base] - for subidx in range(5): - child.cv[idx][subidx] += dvec[subidx] + child.cv += adj_vec + new_seq = [gctree.utils.bases[base_index] for base_index in base_indices] node.sequence = "".join(new_seq) if remove_cvs: @@ -192,7 +217,7 @@ def parsimony_score(tree): ) -def parsimony_scores_from_topologies(newicks, fasta_map, **kwargs): +def parsimony_scores_from_topologies(newicks, fasta_map, gap_as_char=False, **kwargs): """returns a generator on parsimony scores of trees specified by newick strings and fasta. additional keyword arguments are passed to `build_tree`.""" # eliminate characters for which there's no diversity: @@ -204,21 +229,22 @@ def parsimony_scores_from_topologies(newicks, fasta_map, **kwargs): for key, oldseq in fasta_map.items() } yield from ( - sankoff_upward(build_tree(newick, newfasta, **kwargs)) for newick in newicks + sankoff_upward(build_tree(newick, newfasta, **kwargs), gap_as_char=gap_as_char) + for newick in newicks ) def parsimony_scores_from_files(treefiles, fastafile, **kwargs): """returns the parsimony scores of trees specified by newick files and a fasta file. - Arguments match `build_trees_from_files`.""" + Arguments match `build_trees_from_files`. Additional keyword arguments are passed to + ``parsimony_scores_from_topologies``.""" + def load_newick(npath): with open(npath, "r") as fh: return fh.read() - + return parsimony_scores_from_topologies( - (load_newick(path) for path in treefiles), - load_fasta(fastafile), - **kwargs + (load_newick(path) for path in treefiles), load_fasta(fastafile), **kwargs ) @@ -271,6 +297,12 @@ def build_dag_from_trees(trees): default=None, help="Combine loaded and disambiguated trees into a history DAG, and save pickled DAG to provided path.", ) +@click.option( + "-g", + "--gap-as-char", + is_flag=True, + help="Treat gap character `-` as a fifth character. Otherwise treated as ambiguous `N`.", +) def _cli_parsimony_score_from_files( treefiles, fasta_file, @@ -278,12 +310,14 @@ def _cli_parsimony_score_from_files( newick_format, include_internal_sequences, save_to_dag, + gap_as_char, ): """Print the parsimony score of one or more newick files""" pscore_gen = parsimony_scores_from_files( treefiles, fasta_file, reference_id=root_id, + gap_as_char=gap_as_char, ignore_internal_sequences=(not include_internal_sequences), ) From f5c6d05120a5b2ef5d96c73f0921048482a355ec Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Thu, 2 Jun 2022 14:30:12 -0700 Subject: [PATCH 08/11] better docstrings --- scripts/parsimony.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index d8de0ca6..f4eb9b35 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -93,9 +93,21 @@ def translate_base(char): return np.sum(np.min(tree.cv, axis=1)) -def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False): +def disambiguate( + tree, compute_cvs=True, random_state=None, remove_cvs=False, adj_dist=False +): """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on - subtrees of consecutive ambiguity codes.""" + subtrees of consecutive ambiguity codes. + + Args: + compute_cvs: If true, compute upward sankoff cost vectors. If ``sankoff_upward`` was + already run on the tree, this may be skipped. + random_state: A ``random`` module random state, returned by ``random.getstate()``. Output + from this function is otherwise deterministic. + remove_cvs: Remove sankoff cost vectors from tree nodes after disambiguation. + adj_dist: Recompute hamming parsimony distances on tree after disambiguation, and store them + in ``dist`` node attributes. + """ if random_state is None: random.seed(tree.write(format=1)) else: @@ -103,7 +115,8 @@ def disambiguate(tree, random_state=None, remove_cvs=False, adj_dist=False): seq_len = len(tree.sequence) adj_arr = np.array([_yey] * seq_len) - sankoff_upward(tree) + if compute_cvs: + sankoff_upward(tree) # Second pass of Sankoff: choose bases preorder = list(tree.traverse(strategy="preorder")) for node in preorder: @@ -219,7 +232,13 @@ def parsimony_score(tree): def parsimony_scores_from_topologies(newicks, fasta_map, gap_as_char=False, **kwargs): """returns a generator on parsimony scores of trees specified by newick strings and fasta. - additional keyword arguments are passed to `build_tree`.""" + additional keyword arguments are passed to `build_tree`. + + Args: + newicks: newick strings of trees whose parsimony scores will be computed + fasta_map: fasta data as a dictionary, as output by ``load_fasta`` + gap_as_char: if True, gap characters `-` will be treated as a fifth character. + Otherwise, they will be synonymous with `N`'s.""" # eliminate characters for which there's no diversity: informative_sites = [ idx for idx, chars in enumerate(zip(*fasta_map.values())) if len(set(chars)) > 1 From 48cb340ec8af69d1a4846924346e3189edabfa29 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Fri, 3 Jun 2022 14:49:15 -0700 Subject: [PATCH 09/11] add transition weight option --- scripts/parsimony.py | 85 +++++++++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index f4eb9b35..931e97da 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -8,23 +8,18 @@ import pickle import numpy as np -_yey = [ - [0, 1, 1, 1, 1], - [1, 0, 1, 1, 1], - [1, 1, 0, 1, 1], - [1, 1, 1, 0, 1], - [1, 1, 1, 1, 0], -] - -delta_vectors = { - code: np.array( - [ - 0 if base in gctree.utils.ambiguous_dna_values[code] else 1 - for base in gctree.utils.bases - ] - ) - for code in gctree.utils.ambiguous_dna_values -} +_yey = np.array( + [ + [0, 1, 1, 1, 1], + [1, 0, 1, 1, 1], + [1, 1, 0, 1, 1], + [1, 1, 1, 0, 1], + [1, 1, 1, 1, 0], + ] +) + +# This is applicable even when diagonal entries in transition rate matrix are +# nonzero, since it is only a mask on allowable sites based on each base. code_vectors = { code: np.array( [ @@ -34,10 +29,6 @@ ) for code in gctree.utils.ambiguous_dna_values } -cost_adjust = { - base: np.array([int(not i == j) for j in range(5)]) - for i, base in enumerate(gctree.utils.bases) -} @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @@ -49,13 +40,37 @@ def _cli(): pass -def sankoff_upward(tree, gap_as_char=False): +def _get_adj_array(seq_len, transition_weights=None): + if transition_weights is None: + transition_weights = _yey + else: + transition_weights = np.array(transition_weights) + + if transition_weights.shape == (5, 5): + adj_arr = np.array([transition_weights] * seq_len) + elif transition_weights.shape == (seq_len, 5, 5): + adj_arr = transition_weights + else: + raise RuntimeError( + "Transition weight matrix must have shape (5, 5) or (sequence_length, 5, 5)." + ) + return adj_arr + + +def sankoff_upward(tree, gap_as_char=False, transition_weights=None): """Compute Sankoff cost vectors at nodes in a postorder traversal, and return best possible parsimony score of the tree. Args: gap_as_char: if True, the gap character ``-`` will be treated as a fifth character. Otherwise, - it will be treated the same as an ``N``.""" + it will be treated the same as an ``N``. + transition_weights: A 5x5 transition weight matrix, with base order `AGCT-`. + Rows contain targeting weights. That is, the first row contains the transition weights + from `A` to each possible target base. Alternatively, a sequence-length array of these + transition weight matrices, if transition weights vary by-site. By default, a constant + weight matrix will be used containing 1 in all off-diagonal positions, equivalent + to Hamming parsimony. + """ if gap_as_char: def translate_base(char): @@ -69,8 +84,8 @@ def translate_base(char): else: return char - seq_len = len(tree.sequence) - adj_arr = np.array([_yey] * seq_len) + adj_arr = _get_adj_array(len(tree.sequence), transition_weights=transition_weights) + # First pass of Sankoff: compute cost vectors for node in tree.traverse(strategy="postorder"): node.add_feature( @@ -94,7 +109,13 @@ def translate_base(char): def disambiguate( - tree, compute_cvs=True, random_state=None, remove_cvs=False, adj_dist=False + tree, + compute_cvs=True, + random_state=None, + remove_cvs=False, + adj_dist=False, + gap_as_char=False, + transition_weights=None, ): """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on subtrees of consecutive ambiguity codes. @@ -107,6 +128,14 @@ def disambiguate( remove_cvs: Remove sankoff cost vectors from tree nodes after disambiguation. adj_dist: Recompute hamming parsimony distances on tree after disambiguation, and store them in ``dist`` node attributes. + gap_as_char: if True, the gap character ``-`` will be treated as a fifth character. Otherwise, + it will be treated the same as an ``N``. + transition_weights: A 5x5 transition weight matrix, with base order `AGCT-`. + Rows contain targeting weights. That is, the first row contains the transition weights + from `A` to each possible target base. Alternatively, a sequence-length array of these + transition weight matrices, if transition weights vary by-site. By default, a constant + weight matrix will be used containing 1 in all off-diagonal positions, equivalent + to Hamming parsimony. """ if random_state is None: random.seed(tree.write(format=1)) @@ -114,9 +143,9 @@ def disambiguate( random.setstate(random_state) seq_len = len(tree.sequence) - adj_arr = np.array([_yey] * seq_len) + adj_arr = _get_adj_array(len(tree.sequence), transition_weights=transition_weights) if compute_cvs: - sankoff_upward(tree) + sankoff_upward(tree, gap_as_char=gap_as_char) # Second pass of Sankoff: choose bases preorder = list(tree.traverse(strategy="preorder")) for node in preorder: From ce7599af95e0fb51b08c24c6a39e69f9e2998339 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Mon, 6 Jun 2022 13:56:27 -0700 Subject: [PATCH 10/11] beginnings of strictly-min-weight-ambiguous labels --- scripts/parsimony.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index 931e97da..7ff0120e 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -116,6 +116,7 @@ def disambiguate( adj_dist=False, gap_as_char=False, transition_weights=None, + min_ambiguities=False, ): """Randomly resolve ambiguous bases using a two-pass Sankoff Algorithm on subtrees of consecutive ambiguity codes. @@ -136,6 +137,9 @@ def disambiguate( transition weight matrices, if transition weights vary by-site. By default, a constant weight matrix will be used containing 1 in all off-diagonal positions, equivalent to Hamming parsimony. + min_ambiguities: If True, leaves ambiguities in reconstructed sequences, expressing which + bases are possible at each site in a maximally parsimonious disambiguation of the given + topology. Otherwise, sequences are resolved in one possible way, and include no ambiguities. """ if random_state is None: random.seed(tree.write(format=1)) From 4caef84fd5011ec2af88e01a29b105764267933c Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Mon, 6 Jun 2022 16:32:47 -0700 Subject: [PATCH 11/11] Add strictly min-weight ambiguous labeling option --- scripts/parsimony.py | 58 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/scripts/parsimony.py b/scripts/parsimony.py index 7ff0120e..e0305f3c 100644 --- a/scripts/parsimony.py +++ b/scripts/parsimony.py @@ -30,6 +30,11 @@ for code in gctree.utils.ambiguous_dna_values } +ambiguous_codes_from_vecs = { + tuple(0 if base in base_set else 1 for base in gctree.utils.bases): code + for code, base_set in gctree.utils.ambiguous_dna_values.items() +} + @click.group(context_settings={"help_option_names": ["-h", "--help"]}) def _cli(): @@ -139,7 +144,8 @@ def disambiguate( to Hamming parsimony. min_ambiguities: If True, leaves ambiguities in reconstructed sequences, expressing which bases are possible at each site in a maximally parsimonious disambiguation of the given - topology. Otherwise, sequences are resolved in one possible way, and include no ambiguities. + topology. In the history DAG paper, this is known as a strictly min-weight ambiguous labeling. + Otherwise, sequences are resolved in one possible way, and include no ambiguities. """ if random_state is None: random.seed(tree.write(format=1)) @@ -153,20 +159,26 @@ def disambiguate( # Second pass of Sankoff: choose bases preorder = list(tree.traverse(strategy="preorder")) for node in preorder: - new_seq = [] - base_indices = [] - for idx in range(seq_len): - min_cost = min(node.cv[idx]) - base_index = random.choice( - [i for i, val in enumerate(node.cv[idx]) if val == min_cost] - ) - base_indices.append(base_index) + if min_ambiguities: + adj_vec = node.cv != np.stack((node.cv.min(axis=1),) * 5, axis=1) + new_seq = [ + ambiguous_codes_from_vecs[tuple(map(float, row))] for row in adj_vec + ] + else: + base_indices = [] + for idx in range(seq_len): + min_cost = min(node.cv[idx]) + base_index = random.choice( + [i for i, val in enumerate(node.cv[idx]) if val == min_cost] + ) + base_indices.append(base_index) + + adj_vec = adj_arr[np.arange(seq_len), base_indices] + new_seq = [gctree.utils.bases[base_index] for base_index in base_indices] # Adjust child cost vectors - adj_vec = adj_arr[np.arange(seq_len), base_indices] for child in node.children: child.cv += adj_vec - new_seq = [gctree.utils.bases[base_index] for base_index in base_indices] node.sequence = "".join(new_seq) if remove_cvs: @@ -355,6 +367,12 @@ def build_dag_from_trees(trees): is_flag=True, help="Treat gap character `-` as a fifth character. Otherwise treated as ambiguous `N`.", ) +@click.option( + "-a", + "--preserve-ambiguities", + is_flag=True, + help="Do not disambiguate fully, but rather preserve ambiguities to express all maximally parsimonious assignments at each site.", +) def _cli_parsimony_score_from_files( treefiles, fasta_file, @@ -363,6 +381,7 @@ def _cli_parsimony_score_from_files( include_internal_sequences, save_to_dag, gap_as_char, + preserve_ambiguities, ): """Print the parsimony score of one or more newick files""" pscore_gen = parsimony_scores_from_files( @@ -377,6 +396,23 @@ def _cli_parsimony_score_from_files( print(treepath) print(score) + if save_to_dag is not None: + trees = build_trees_from_files( + treefiles, + fasta_file, + reference_id=root_id, + ignore_internal_sequences=(not include_internal_sequences), + ) + trees = ( + disambiguate( + tree, gap_as_char=gap_as_char, min_ambiguities=preserve_ambiguities + ) + for tree in trees + ) + dag = build_dag_from_trees(trees) + with open(save_to_dag, "wb") as fh: + fh.write(pickle.dumps(dag)) + def summarize_dag(dag): """print summary information about the provided history DAG."""