Skip to content

Commit

Permalink
update scripts to python 3 + update partis
Browse files Browse the repository at this point in the history
  • Loading branch information
psathyrella committed Mar 1, 2024
1 parent 8359450 commit 2f8b463
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 31 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ If `--lineage-unique-ids` is specified, there will also be additional lineage-sp
| aa_lineage_seqs.dnamap | fasta(ish) | for **each intermediate ancestor of the lineage of the sequence with the specified id**, map from sampled amino acid sequence to its corresponding set of nucleotide sequences and posterior probabilities |
| aa_lineage_seqs.pfilterX.svg | svg | posterior probability lineage graphic made with [Graphviz](https://www.graphviz.org/), where `X` is the posterior probability cutoff for the sampled sequences (see details below). |

The posterior probability lineage plot (`lineage aa_lineage_seqs.pfilterX.svg`) summarizes all the inferred ancestral sequences (and transitions among them) that were observed in the sampled trees.
The posterior probability lineage plot (`aa_lineage_seqs.pfilterX.svg`) summarizes all the inferred ancestral sequences (and transitions among them) that were observed in the sampled trees.
Each node represents an amino acid sequence: either inferred ("naive" if it was a naive sequence in any tree, otherwise "inter") or the seed sequence.
The node's percent label (which also determines the color weight) is the fraction of trees in which it was observed, whether as naive or intermediate (so note that this can be larger than the probability in a naive sequence's name in the fasta file above, since in this plot we include also instances where it was an intermediate).
Edges are labelled with the mutations separating their two nodes, and with the percent of transitions (in all trees) from the parent node that connect to the child node (which also determines the color weight).
Expand Down
2 changes: 1 addition & 1 deletion lib/partis
Submodule partis updated 5914 files
3 changes: 2 additions & 1 deletion scripts/generate_revbayes_rev_file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
import argparse
import jinja2
import os
Expand Down
32 changes: 17 additions & 15 deletions scripts/parse_cluster.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
from __future__ import print_function
import argparse
from collections import OrderedDict
import yaml
Expand All @@ -13,42 +15,42 @@
from util_functions import write_to_fasta

default_partis_path = os.path.join(os.getcwd(), "lib/partis")
sys.path.append(os.path.join(default_partis_path, "python"))
import utils
import glutils
from clusterpath import ClusterPath
sys.path.append(default_partis_path)
import python.utils as utils
import python.glutils as glutils
from python.clusterpath import ClusterPath


def show_available_clusters(cpath, ipartition, ptn):
    '''
    Print a tab-separated table of the clusters in partition <ptn> (index
    <ipartition> within <cpath>.partitions): one row per cluster with its
    index, size, and space-joined sequence ids. Purely informational output
    used before raising on ambiguous cluster choices.
    '''
    available_clusters = [
        OrderedDict([("index", i), ("size", len(cluster)), ("unique_ids", ' '.join(cluster))])
        for i, cluster in enumerate(ptn)
    ]
    print(" available clusters in partition at index {} {}:".format(
        ipartition, ("(best)" if ipartition == cpath.i_best else "")
    ))
    # header row, then one row per cluster (all OrderedDicts share the same keys)
    print("\t".join(list(available_clusters[0].keys())))
    for clust in available_clusters:
        print("\t".join([str(val) for val in clust.values()]))

def parse_cluster_annotation(annotation_list, cpath, args):
if len(annotation_list) == 1:
print " only one annotation in partis output file. Using it."
print(" only one annotation in partis output file. Using it.")
return annotation_list[0]

if cpath is None or len(cpath.partitions) == 0:
raise Exception('partis output file has no partitions: %s' % args.partis_yaml_file)

ipartition = cpath.i_best if args.partition_index is None else args.partition_index
ptn = cpath.partitions[ipartition]
print " found %d clusters in %s" % (len(ptn), "best partition" if args.partition_index is None else "partition at index %d (of %d)" % (ipartition, len(cpath.partitions)))
print(" found %d clusters in %s" % (len(ptn), "best partition" if args.partition_index is None else "partition at index %d (of %d)" % (ipartition, len(cpath.partitions))))

clusters_to_use = ptn if args.cluster_index is None else [ptn[args.cluster_index]]
print " taking %s" % (("all %d clusters"%len(clusters_to_use)) if args.cluster_index is None else "cluster at index %d" % args.cluster_index)
print(" taking %s" % (("all %d clusters"%len(clusters_to_use)) if args.cluster_index is None else "cluster at index %d" % args.cluster_index))

if args.seed_unique_id is not None:
clusters_to_use = [c for c in clusters_to_use if args.seed_unique_id in c] # NOTE can result in more than one cluster with the seed sequence (e.g. if this file contains intermediate annotations from seed partitioning))
print " removing clusters not containing sequence '%s' (leaving %d)" % (args.seed_unique_id, len(clusters_to_use))
print(" removing clusters not containing sequence '%s' (leaving %d)" % (args.seed_unique_id, len(clusters_to_use)))
if len(clusters_to_use) > 1:
show_available_clusters(cpath, ipartition, ptn)
raise Exception(
Expand Down Expand Up @@ -107,7 +109,7 @@ def cluster_sequences(annotation, use_indel_reversed_sequences=False):
def write_cluster_fasta(fname, seqfos):
    '''
    Write cluster sequences to a FASTA file at <fname>, creating the output
    directory first if it doesn't exist.

    seqfos: list of dicts each with "name" and "seq" keys (insertion order is
    preserved in the output via OrderedDict).
    '''
    if not os.path.exists(os.path.dirname(os.path.abspath(fname))):
        os.makedirs(os.path.dirname(os.path.abspath(fname)))
    print(" writing %d sequences (including naive) to %s" % (len(seqfos), fname))
    write_to_fasta(
        OrderedDict([(seqfo["name"], seqfo["seq"]) for seqfo in seqfos]), fname
    )
Expand Down Expand Up @@ -159,7 +161,7 @@ def write_cluster_fasta(fname, seqfos):
if utils.getsuffix(args.partis_yaml_file) == ".csv":
default_glfo_dir = default_partis_path + "/data/germlines/human"
if args.glfo_dir is None:
print " note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn't crash, it's probably ok." % default_glfo_dir
print(" note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn't crash, it's probably ok." % default_glfo_dir)
args.glfo_dir = default_glfo_dir
glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus)

Expand Down
10 changes: 7 additions & 3 deletions scripts/tabulate_lineage_probs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
from __future__ import print_function
import argparse
import copy
from collections import Counter, OrderedDict
Expand All @@ -10,6 +12,8 @@
import sys

from util_functions import read_from_fasta, translate, write_to_fasta
import six
from six.moves import zip


# ----------------------------------------------------------------------------------------
Expand Down Expand Up @@ -139,7 +143,7 @@ def seqs_of_tree(t, seed):
write_to_fasta(aa_dna_map, args.output_base + ".dnamap") # sort of a fasta, but with name for the aa seq, and multiple seq lines for each name, with each seq line a (prob, nuc seq) pair for all the nuc seqs contributing to the aa seq

# Flip the dictionary.
seqs_out = {v:k for k,v in out_seqs.iteritems()}
seqs_out = {v:k for k,v in six.iteritems(out_seqs)}

# make empty node/edge plot
dot = graphviz.Digraph(comment=" ".join(sys.argv), format='svg',
Expand Down Expand Up @@ -169,7 +173,7 @@ def seqs_of_tree(t, seed):
continue
node_conf = int(10 + (100-10) * float(node_c[ab]) / num_trees) # note that the numbers in the naive seq names are *not* the same as the ones we calculate here (since here we include *all* times that they occur, not as the naive seq, whereas the number in their name is the fraction of times they were the naive sequence)
if debug:
print '%3d %6.3f %s' % (node_c[ab], float(node_c[ab]) / num_trees, seqs_out[ab])
print('%3d %6.3f %s' % (node_c[ab], float(node_c[ab]) / num_trees, seqs_out[ab]))
dot_copy.node(nlabels[seqs_out[ab]], style="filled", fillcolor="#ff0000" + (str(node_conf) if node_conf < 100 else "")) # can also add xlabel= to get text outside bubble

dot_copy.save(args.output_base + ".pfilter" + str(pfilter) + ".dot")
Expand Down
11 changes: 7 additions & 4 deletions scripts/tabulate_naive_probs.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/usr/bin/env python
#!/usr/bin/env python3

from __future__ import absolute_import
import argparse
from collections import Counter, OrderedDict
import dendropy
from itertools import groupby
import subprocess
import weblogolib as w
import weblogo as w

from util_functions import translate, write_to_fasta
from io import open
import six


if __name__ == '__main__':
Expand Down Expand Up @@ -47,7 +50,7 @@

format = w.LogoFormat(data, options)
with open(args.output_base + ".png", 'w') as f:
f.write(w.png_print_formatter(data, format))
f.write(str(w.png_print_formatter(data, format)))

aa_naive_seqs_c = Counter(aa_naive_seqs)
num_trees = len(aa_naive_seqs)
Expand All @@ -67,6 +70,6 @@
aa_dna_naive_seqs_map = OrderedDict(
(k, "\n".join(str(float(count) / num_trees) + "," + dna_seq
for dna_seq, count in aa_dna_naive_seqs_d[v].most_common(None)))
for k, v in aa_naive_seqs_d.iteritems()
for k, v in six.iteritems(aa_naive_seqs_d)
)
write_to_fasta(aa_dna_naive_seqs_map, args.output_base + ".dnamap")
8 changes: 5 additions & 3 deletions scripts/util_functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from Bio.Alphabet import IUPAC
from __future__ import absolute_import
from collections import OrderedDict
from Bio.Seq import Seq
from Bio import SeqIO
from io import open
import six


def read_from_fasta(path, invert = False):
def translate(s):
    '''
    Assume we are in frame and translate DNA to amino acids.
    '''
    # Truncate to a whole number of codons; Seq() no longer takes an alphabet
    # argument (Bio.Alphabet was removed in Biopython 1.78).
    coding_dna = Seq(s[:(3*int(len(s)/3))])
    return str(coding_dna.translate())

def write_to_fasta(d, filename):
    '''
    Write a FASTA dict to file.

    d: mapping from sequence name to sequence string; entries are written in
    the dict's insertion order.
    filename: output path (overwritten if it already exists).
    '''
    with open(filename, 'w') as f:
        # plain .items() suffices on python 3 -- no need for six.iteritems()
        for name, seq in d.items():
            f.write('>{}\n'.format(name))
            f.write('{}\n'.format(seq))
9 changes: 6 additions & 3 deletions scripts/write_lh_annotations.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
#!/usr/bin/env python
#!/usr/bin/env python3
from __future__ import absolute_import
import math
import copy
import argparse
import csv
import os
import sys
from io import open
from six.moves import zip
default_partis_path = os.path.join(os.getcwd(), 'lib/partis')
sys.path.append(os.path.join(default_partis_path, 'python'))
import utils
sys.path.append(default_partis_path)
import python.utils as utils

# reads sampled trees/annotation pairs, collapses duplicate annotations while keeping tracking of all the trees that contributed, then writes all [best] unique annotations to linearham_run_all[best].yaml
# This does not add the inferred ancestral sequences to the annotations (atm this is done in partis/test/linearham-run.py, although maybe it'd make sense to do it here)
Expand Down

0 comments on commit 2f8b463

Please sign in to comment.