From 2f8b4636f3b3adf315f04021046e9f31ec55b841 Mon Sep 17 00:00:00 2001 From: duncan ralph Date: Fri, 1 Mar 2024 14:57:00 -0800 Subject: [PATCH] update scripts to python 3 + update partis --- README.md | 2 +- lib/partis | 2 +- scripts/generate_revbayes_rev_file.py | 3 ++- scripts/parse_cluster.py | 32 ++++++++++++++------------- scripts/tabulate_lineage_probs.py | 10 ++++++--- scripts/tabulate_naive_probs.py | 11 +++++---- scripts/util_functions.py | 8 ++++--- scripts/write_lh_annotations.py | 9 +++++--- 8 files changed, 46 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 5b217e55..100af3a7 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ If `--lineage-unique-ids` is specified, there will also be additional lineage-sp | aa_lineage_seqs.dnamap | fasta(ish) | for **each intermediate ancestor of the lineage of the sequence with the specified id**, map from sampled amino acid sequence to its corresponding set of nucleotide sequences and posterior probabilities | | aa_lineage_seqs.pfilterX.svg | svg | posterior probability lineage graphic made with [Graphviz](https://www.graphviz.org/), where `X` is the posterior probability cutoff for the sampled sequences (see details below). | -The posterior probability lineage plot (`lineage aa_lineage_seqs.pfilterX.svg`) summarizes all the inferred ancestral sequences (and transitions among them) that were observed in the sampled trees. +The posterior probability lineage plot (`aa_lineage_seqs.pfilterX.svg`) summarizes all the inferred ancestral sequences (and transitions among them) that were observed in the sampled trees. Each node represents an amino acid sequence: either inferred ("naive" if it was a naive sequence in any tree, otherwise "inter") or the seed sequence. The node's percent label (which also determines the color weight) is the fraction of trees in which it was observed, whether as naive or intermediate (so note that this can be larger than the probability in a naive sequence's name in the fasta file above, since in this plot we include also instances where it was an intermediate). Edges are labelled with the mutations separating their two nodes, and with the percent of transitions (in all trees) from the parent node that connect to the child node (which also determines the color weight). 
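For reference, a minimal sketch of reading back the `aa_lineage_seqs.dnamap` output described above, assuming the fasta-like layout these scripts write (a `>` header per sampled amino acid sequence, then one `posterior_probability,nucleotide_sequence` pair per line); the helper name `parse_dnamap` is hypothetical, not something the repository provides.

```python
from collections import OrderedDict

def parse_dnamap(path):
    """Return {aa_seq_header: [(prob, nuc_seq), ...]} parsed from a .dnamap file."""
    dnamap = OrderedDict()
    current = None
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # header line: the sampled amino acid sequence (plus any name/probability info)
                current = line[1:]
                dnamap[current] = []
            else:
                # body line: "posterior_probability,nucleotide_sequence"
                prob, nuc_seq = line.split(",", 1)
                dnamap[current].append((float(prob), nuc_seq))
    return dnamap
```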
diff --git a/lib/partis b/lib/partis index c789fc96..3c81c089 160000 --- a/lib/partis +++ b/lib/partis @@ -1 +1 @@ -Subproject commit c789fc96183be8ce67f349347387b1bbf09eeae4 +Subproject commit 3c81c089b762f19fb9662b010ac78f5a0aab654b diff --git a/scripts/generate_revbayes_rev_file.py b/scripts/generate_revbayes_rev_file.py index 0050c7d8..ace576c0 100755 --- a/scripts/generate_revbayes_rev_file.py +++ b/scripts/generate_revbayes_rev_file.py @@ -1,5 +1,6 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +from __future__ import absolute_import import argparse import jinja2 import os diff --git a/scripts/parse_cluster.py b/scripts/parse_cluster.py index eb8a90a4..0169e85c 100755 --- a/scripts/parse_cluster.py +++ b/scripts/parse_cluster.py @@ -1,5 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +from __future__ import absolute_import +from __future__ import print_function import argparse from collections import OrderedDict import yaml @@ -13,10 +15,10 @@ from util_functions import write_to_fasta default_partis_path = os.path.join(os.getcwd(), "lib/partis") -sys.path.append(os.path.join(default_partis_path, "python")) -import utils -import glutils -from clusterpath import ClusterPath +sys.path.append(default_partis_path) +import python.utils as utils +import python.glutils as glutils +from python.clusterpath import ClusterPath def show_available_clusters(cpath, ipartition, ptn): @@ -24,16 +26,16 @@ def show_available_clusters(cpath, ipartition, ptn): OrderedDict([("index", i), ("size", len(cluster)), ("unique_ids", ' '.join(cluster))]) for i, cluster in enumerate(ptn) ] - print " available clusters in partition at index {} {}:".format( + print(" available clusters in partition at index {} {}:".format( ipartition, ("(best)" if ipartition == cpath.i_best else "") - ) - print "\t".join(available_clusters[0].keys()) + )) + print("\t".join(list(available_clusters[0].keys()))) for clust in available_clusters: - print "\t".join([str(val) for val in clust.values()]) + print("\t".join([str(val) for val in clust.values()])) def parse_cluster_annotation(annotation_list, cpath, args): if len(annotation_list) == 1: - print " only one annotation in partis output file. Using it." + print(" only one annotation in partis output file. Using it.") return annotation_list[0] if cpath is None or len(cpath.partitions) == 0: @@ -41,14 +43,14 @@ def parse_cluster_annotation(annotation_list, cpath, args): ipartition = cpath.i_best if args.partition_index is None else args.partition_index ptn = cpath.partitions[ipartition] - print " found %d clusters in %s" % (len(ptn), "best partition" if args.partition_index is None else "partition at index %d (of %d)" % (ipartition, len(cpath.partitions))) + print(" found %d clusters in %s" % (len(ptn), "best partition" if args.partition_index is None else "partition at index %d (of %d)" % (ipartition, len(cpath.partitions)))) clusters_to_use = ptn if args.cluster_index is None else [ptn[args.cluster_index]] - print " taking %s" % (("all %d clusters"%len(clusters_to_use)) if args.cluster_index is None else "cluster at index %d" % args.cluster_index) + print(" taking %s" % (("all %d clusters"%len(clusters_to_use)) if args.cluster_index is None else "cluster at index %d" % args.cluster_index)) if args.seed_unique_id is not None: clusters_to_use = [c for c in clusters_to_use if args.seed_unique_id in c] # NOTE can result in more than one cluster with the seed sequence (e.g. 
if this file contains intermediate annotations from seed partitioning)) - print " removing clusters not containing sequence '%s' (leaving %d)" % (args.seed_unique_id, len(clusters_to_use)) + print(" removing clusters not containing sequence '%s' (leaving %d)" % (args.seed_unique_id, len(clusters_to_use))) if len(clusters_to_use) > 1: show_available_clusters(cpath, ipartition, ptn) raise Exception( @@ -107,7 +109,7 @@ def cluster_sequences(annotation, use_indel_reversed_sequences=False): def write_cluster_fasta(fname, seqfos): if not os.path.exists(os.path.dirname(os.path.abspath(fname))): os.makedirs(os.path.dirname(os.path.abspath(fname))) - print " writing %d sequences (including naive) to %s" % (len(seqfos), fname) + print(" writing %d sequences (including naive) to %s" % (len(seqfos), fname)) write_to_fasta( OrderedDict([(seqfo["name"], seqfo["seq"]) for seqfo in seqfos]), fname ) @@ -159,7 +161,7 @@ def write_cluster_fasta(fname, seqfos): if utils.getsuffix(args.partis_yaml_file) == ".csv": default_glfo_dir = default_partis_path + "/data/germlines/human" if args.glfo_dir is None: - print " note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn't crash, it's probably ok." % default_glfo_dir + print(" note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn't crash, it's probably ok." % default_glfo_dir) args.glfo_dir = default_glfo_dir glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus) diff --git a/scripts/tabulate_lineage_probs.py b/scripts/tabulate_lineage_probs.py index d3002536..5fe24805 100755 --- a/scripts/tabulate_lineage_probs.py +++ b/scripts/tabulate_lineage_probs.py @@ -1,5 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +from __future__ import absolute_import +from __future__ import print_function import argparse import copy from collections import Counter, OrderedDict @@ -10,6 +12,8 @@ import sys from util_functions import read_from_fasta, translate, write_to_fasta +import six +from six.moves import zip # ---------------------------------------------------------------------------------------- @@ -139,7 +143,7 @@ def seqs_of_tree(t, seed): write_to_fasta(aa_dna_map, args.output_base + ".dnamap") # sort of a fasta, but with name for the aa seq, and multiple seq lines for each name, with each seq line a (prob, nuc seq) pair for all the nuc seqs contributing to the aa seq # Flip the dictionary. 
- seqs_out = {v:k for k,v in out_seqs.iteritems()} + seqs_out = {v:k for k,v in six.iteritems(out_seqs)} # make empty node/edge plot dot = graphviz.Digraph(comment=" ".join(sys.argv), format='svg', @@ -169,7 +173,7 @@ def seqs_of_tree(t, seed): continue node_conf = int(10 + (100-10) * float(node_c[ab]) / num_trees) # note that the numbers in the naive seq names are *not* the same as the ones we calculate here (since here we include *all* times that they occur, not as the naive seq, whereas the number in their name is the fraction of times they were the naive sequence) if debug: - print '%3d %6.3f %s' % (node_c[ab], float(node_c[ab]) / num_trees, seqs_out[ab]) + print('%3d %6.3f %s' % (node_c[ab], float(node_c[ab]) / num_trees, seqs_out[ab])) dot_copy.node(nlabels[seqs_out[ab]], style="filled", fillcolor="#ff0000" + (str(node_conf) if node_conf < 100 else "")) # can also add xlabel= to get text outside bubble dot_copy.save(args.output_base + ".pfilter" + str(pfilter) + ".dot") diff --git a/scripts/tabulate_naive_probs.py b/scripts/tabulate_naive_probs.py index b6c26c4c..abf8576b 100755 --- a/scripts/tabulate_naive_probs.py +++ b/scripts/tabulate_naive_probs.py @@ -1,13 +1,16 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +from __future__ import absolute_import import argparse from collections import Counter, OrderedDict import dendropy from itertools import groupby import subprocess -import weblogolib as w +import weblogo as w from util_functions import translate, write_to_fasta +from io import open +import six if __name__ == '__main__': @@ -47,7 +50,7 @@ format = w.LogoFormat(data, options) with open(args.output_base + ".png", 'w') as f: - f.write(w.png_print_formatter(data, format)) + f.write(str(w.png_print_formatter(data, format))) aa_naive_seqs_c = Counter(aa_naive_seqs) num_trees = len(aa_naive_seqs) @@ -67,6 +70,6 @@ aa_dna_naive_seqs_map = OrderedDict( (k, "\n".join(str(float(count) / num_trees) + "," + dna_seq for dna_seq, count in aa_dna_naive_seqs_d[v].most_common(None))) - for k, v in aa_naive_seqs_d.iteritems() + for k, v in six.iteritems(aa_naive_seqs_d) ) write_to_fasta(aa_dna_naive_seqs_map, args.output_base + ".dnamap") diff --git a/scripts/util_functions.py b/scripts/util_functions.py index 83094f4a..d4e24f9b 100644 --- a/scripts/util_functions.py +++ b/scripts/util_functions.py @@ -1,7 +1,9 @@ -from Bio.Alphabet import IUPAC +from __future__ import absolute_import from collections import OrderedDict from Bio.Seq import Seq from Bio import SeqIO +from io import open +import six def read_from_fasta(path, invert = False): @@ -17,7 +19,7 @@ def translate(s): ''' Assume we are in frame and translate DNA to amino acids. ''' - coding_dna = Seq(s[:(3*int(len(s)/3))], IUPAC.ambiguous_dna) + coding_dna = Seq(s[:(3*int(len(s)/3))]) return str(coding_dna.translate()) def write_to_fasta(d, filename): @@ -25,6 +27,6 @@ def write_to_fasta(d, filename): Write a FASTA dict to file. 
''' with open(filename, 'w') as f: - for k, v in d.iteritems(): + for k, v in six.iteritems(d): f.write('>{}\n'.format(k)) f.write('{}\n'.format(v)) diff --git a/scripts/write_lh_annotations.py b/scripts/write_lh_annotations.py index 5ebf081e..33c9a327 100755 --- a/scripts/write_lh_annotations.py +++ b/scripts/write_lh_annotations.py @@ -1,13 +1,16 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 +from __future__ import absolute_import import math import copy import argparse import csv import os import sys +from io import open +from six.moves import zip default_partis_path = os.path.join(os.getcwd(), 'lib/partis') -sys.path.append(os.path.join(default_partis_path, 'python')) -import utils +sys.path.append(default_partis_path) +import python.utils as utils # reads sampled trees/annotation pairs, collapses duplicate annotations while keeping tracking of all the trees that contributed, then writes all [best] unique annotations to linearham_run_all[best].yaml # This does not add the inferred ancestral sequences to the annotations (atm this is done in partis/test/linearham-run.py, although maybe it'd make sense to do it here)
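For context, a minimal sketch of the dict-iteration change applied throughout this patch: Python 2's `d.iteritems()` no longer exists in Python 3, so the scripts switch to `six.iteritems(d)`, which works under both interpreters (plain `d.items()` would be the Python-3-only equivalent). The FASTA dict below is a toy example, not data from the repository.

```python
import six

fasta = {"seq1": "ATGCAA", "seq2": "ATGCAG"}  # toy stand-in for a FASTA dict

# what the patch uses: portable across Python 2 and 3
for name, seq in six.iteritems(fasta):
    print(">{}\n{}".format(name, seq))

# Python-3-only equivalent, usable once Python 2 support is dropped
for name, seq in fasta.items():
    print(">{}\n{}".format(name, seq))
```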