Skip to content

Commit

Permalink
update scripts to python 3 + update partis
Browse files Browse the repository at this point in the history
  • Loading branch information
psathyrella committed Mar 1, 2024
1 parent 8359450 commit 2f8b463
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 31 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ If `--lineage-unique-ids` is specified, there will also be additional lineage-sp
| aa_lineage_seqs.dnamap | fasta(ish) | for **each intermediate ancestor of the lineage of the sequence with the specified id**, map from sampled amino acid sequence to its corresponding set of nucleotide sequences and posterior probabilities |
| aa_lineage_seqs.pfilterX.svg | svg | posterior probability lineage graphic made with [Graphviz](https://www.graphviz.org/), where `X` is the posterior probability cutoff for the sampled sequences (see details below). |

The posterior probability lineage plot (`lineage aa_lineage_seqs.pfilterX.svg`) summarizes all the inferred ancestral sequences (and transitions among them) that were observed in the sampled trees.
The posterior probability lineage plot (`aa_lineage_seqs.pfilterX.svg`) summarizes all the inferred ancestral sequences (and transitions among them) that were observed in the sampled trees.
Each node represents an amino acid sequence: either inferred ("naive" if it was a naive sequence in any tree, otherwise "inter") or the seed sequence.
The node's percent label (which also determines the color weight) is the fraction of trees in which it was observed, whether as naive or intermediate (so note that this can be larger than the probability in a naive sequence's name in the fasta file above, since in this plot we include also instances where it was an intermediate).
Edges are labelled with the mutations separating their two nodes, and with the percent of transitions (in all trees) from the parent node that connect to the child node (which also determines the color weight).
Expand Down
2 changes: 1 addition & 1 deletion lib/partis
Submodule partis updated 5914 files
3 changes: 2 additions & 1 deletion scripts/generate_revbayes_rev_file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
import argparse
import jinja2
import os
Expand Down
32 changes: 17 additions & 15 deletions scripts/parse_cluster.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
from __future__ import print_function
import argparse
from collections import OrderedDict
import yaml
Expand All @@ -13,42 +15,42 @@
from util_functions import write_to_fasta

default_partis_path = os.path.join(os.getcwd(), "lib/partis")
sys.path.append(os.path.join(default_partis_path, "python"))
import utils
import glutils
from clusterpath import ClusterPath
sys.path.append(default_partis_path)
import python.utils as utils
import python.glutils as glutils
from python.clusterpath import ClusterPath


def show_available_clusters(cpath, ipartition, ptn):
    '''
    Print a tab-separated table of the clusters in partition <ptn> (index
    <ipartition> within <cpath>.partitions): one row per cluster with its
    index, size, and space-joined sequence ids. Purely informational output
    used before raising on ambiguous cluster choices.
    '''
    available_clusters = [
        OrderedDict([("index", i), ("size", len(cluster)), ("unique_ids", ' '.join(cluster))])
        for i, cluster in enumerate(ptn)
    ]
    print(" available clusters in partition at index {} {}:".format(
        ipartition, ("(best)" if ipartition == cpath.i_best else "")
    ))
    # header row, then one row per cluster (all OrderedDicts share the same keys)
    print("\t".join(list(available_clusters[0].keys())))
    for clust in available_clusters:
        print("\t".join([str(val) for val in clust.values()]))

def parse_cluster_annotation(annotation_list, cpath, args):
if len(annotation_list) == 1:
print " only one annotation in partis output file. Using it."
print(" only one annotation in partis output file. Using it.")
return annotation_list[0]

if cpath is None or len(cpath.partitions) == 0:
raise Exception('partis output file has no partitions: %s' % args.partis_yaml_file)

ipartition = cpath.i_best if args.partition_index is None else args.partition_index
ptn = cpath.partitions[ipartition]
print " found %d clusters in %s" % (len(ptn), "best partition" if args.partition_index is None else "partition at index %d (of %d)" % (ipartition, len(cpath.partitions)))
print(" found %d clusters in %s" % (len(ptn), "best partition" if args.partition_index is None else "partition at index %d (of %d)" % (ipartition, len(cpath.partitions))))

clusters_to_use = ptn if args.cluster_index is None else [ptn[args.cluster_index]]
print " taking %s" % (("all %d clusters"%len(clusters_to_use)) if args.cluster_index is None else "cluster at index %d" % args.cluster_index)
print(" taking %s" % (("all %d clusters"%len(clusters_to_use)) if args.cluster_index is None else "cluster at index %d" % args.cluster_index))

if args.seed_unique_id is not None:
clusters_to_use = [c for c in clusters_to_use if args.seed_unique_id in c] # NOTE can result in more than one cluster with the seed sequence (e.g. if this file contains intermediate annotations from seed partitioning))
print " removing clusters not containing sequence '%s' (leaving %d)" % (args.seed_unique_id, len(clusters_to_use))
print(" removing clusters not containing sequence '%s' (leaving %d)" % (args.seed_unique_id, len(clusters_to_use)))
if len(clusters_to_use) > 1:
show_available_clusters(cpath, ipartition, ptn)
raise Exception(
Expand Down Expand Up @@ -107,7 +109,7 @@ def cluster_sequences(annotation, use_indel_reversed_sequences=False):
def write_cluster_fasta(fname, seqfos):
    '''
    Write cluster sequences to a FASTA file at <fname>, creating the output
    directory first if it doesn't exist.

    seqfos: list of dicts each with "name" and "seq" keys (insertion order is
    preserved in the output via OrderedDict).
    '''
    if not os.path.exists(os.path.dirname(os.path.abspath(fname))):
        os.makedirs(os.path.dirname(os.path.abspath(fname)))
    print(" writing %d sequences (including naive) to %s" % (len(seqfos), fname))
    write_to_fasta(
        OrderedDict([(seqfo["name"], seqfo["seq"]) for seqfo in seqfos]), fname
    )
Expand Down Expand Up @@ -159,7 +161,7 @@ def write_cluster_fasta(fname, seqfos):
if utils.getsuffix(args.partis_yaml_file) == ".csv":
default_glfo_dir = default_partis_path + "/data/germlines/human"
if args.glfo_dir is None:
print " note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn't crash, it's probably ok." % default_glfo_dir
print(" note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn't crash, it's probably ok." % default_glfo_dir)
args.glfo_dir = default_glfo_dir
glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

Expand Down
10 changes: 7 additions & 3 deletions scripts/tabulate_lineage_probs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
from __future__ import print_function
import argparse
import copy
from collections import Counter, OrderedDict
Expand All @@ -10,6 +12,8 @@
import sys

from util_functions import read_from_fasta, translate, write_to_fasta
import six
from six.moves import zip


# ----------------------------------------------------------------------------------------
Expand Down Expand Up @@ -139,7 +143,7 @@ def seqs_of_tree(t, seed):
write_to_fasta(aa_dna_map, args.output_base + ".dnamap") # sort of a fasta, but with name for the aa seq, and multiple seq lines for each name, with each seq line a (prob, nuc seq) pair for all the nuc seqs contributing to the aa seq

# Flip the dictionary.
seqs_out = {v:k for k,v in out_seqs.iteritems()}
seqs_out = {v:k for k,v in six.iteritems(out_seqs)}

# make empty node/edge plot
dot = graphviz.Digraph(comment=" ".join(sys.argv), format='svg',
Expand Down Expand Up @@ -169,7 +173,7 @@ def seqs_of_tree(t, seed):
continue
node_conf = int(10 + (100-10) * float(node_c[ab]) / num_trees) # note that the numbers in the naive seq names are *not* the same as the ones we calculate here (since here we include *all* times that they occur, not as the naive seq, whereas the number in their name is the fraction of times they were the naive sequence)
if debug:
print '%3d %6.3f %s' % (node_c[ab], float(node_c[ab]) / num_trees, seqs_out[ab])
print('%3d %6.3f %s' % (node_c[ab], float(node_c[ab]) / num_trees, seqs_out[ab]))
dot_copy.node(nlabels[seqs_out[ab]], style="filled", fillcolor="#ff0000" + (str(node_conf) if node_conf < 100 else "")) # can also add xlabel= to get text outside bubble

dot_copy.save(args.output_base + ".pfilter" + str(pfilter) + ".dot")
Expand Down
11 changes: 7 additions & 4 deletions scripts/tabulate_naive_probs.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
import argparse
from collections import Counter, OrderedDict
import dendropy
from itertools import groupby
import subprocess
import weblogolib as w
import weblogo as w

from util_functions import translate, write_to_fasta
from io import open
import six


if __name__ == '__main__':
Expand Down Expand Up @@ -47,7 +50,7 @@

format = w.LogoFormat(data, options)
with open(args.output_base + ".png", 'w') as f:
f.write(w.png_print_formatter(data, format))
f.write(str(w.png_print_formatter(data, format)))

aa_naive_seqs_c = Counter(aa_naive_seqs)
num_trees = len(aa_naive_seqs)
Expand All @@ -67,6 +70,6 @@
aa_dna_naive_seqs_map = OrderedDict(
(k, "\n".join(str(float(count) / num_trees) + "," + dna_seq
for dna_seq, count in aa_dna_naive_seqs_d[v].most_common(None)))
for k, v in aa_naive_seqs_d.iteritems()
for k, v in six.iteritems(aa_naive_seqs_d)
)
write_to_fasta(aa_dna_naive_seqs_map, args.output_base + ".dnamap")
8 changes: 5 additions & 3 deletions scripts/util_functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from Bio.Alphabet import IUPAC
from __future__ import absolute_import
from collections import OrderedDict
from Bio.Seq import Seq
from Bio import SeqIO
from io import open
import six


def read_from_fasta(path, invert = False):
def translate(s):
    '''
    Assume we are in frame and translate DNA to amino acids.
    '''
    # Truncate to a whole number of codons; Seq() no longer takes an alphabet
    # argument (Bio.Alphabet was removed in Biopython 1.78).
    coding_dna = Seq(s[:(3*int(len(s)/3))])
    return str(coding_dna.translate())

def write_to_fasta(d, filename):
    '''
    Write a FASTA dict to file.

    d: mapping from sequence name to sequence string; entries are written in
    the dict's insertion order.
    filename: output path (overwritten if it already exists).
    '''
    with open(filename, 'w') as f:
        # plain .items() suffices on python 3 -- no need for six.iteritems()
        for name, seq in d.items():
            f.write('>{}\n'.format(name))
            f.write('{}\n'.format(seq))
9 changes: 6 additions & 3 deletions scripts/write_lh_annotations.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/usr/bin/env python
#!/usr/bin/env python3
from __future__ import absolute_import
import math
import copy
import argparse
import csv
import os
import sys
from io import open
from six.moves import zip
default_partis_path = os.path.join(os.getcwd(), 'lib/partis')
sys.path.append(os.path.join(default_partis_path, 'python'))
import utils
sys.path.append(default_partis_path)
import python.utils as utils

# reads sampled trees/annotation pairs, collapses duplicate annotations while keeping tracking of all the trees that contributed, then writes all [best] unique annotations to linearham_run_all[best].yaml
# This does not add the inferred ancestral sequences to the annotations (atm this is done in partis/test/linearham-run.py, although maybe it'd make sense to do it here)
Expand Down

0 comments on commit 2f8b463

Please sign in to comment.