-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign_bepipred_scores.py
executable file
·99 lines (71 loc) · 2.48 KB
/
align_bepipred_scores.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: camilla eldridge
"""
import sys
bepipred=sys.argv[1] #"AG_test_clustalo.bepipred"
alignment=sys.argv[2] #"AG_test_clustalo.fasta"
outfile=sys.argv[3] #"for_ggplot.csv"
out=open(outfile, "w")
# save split bepipred scores and remove unwanted lines
zz=[]
with open(bepipred,"rt") as ag_scores:
t="".join(ag_scores).split("\n") #split by lines
for line in t[1:]:
if "bepipred" in line and "source" not in line:
zz.append(line)
# get headers info
headers=""
with open(alignment, "rt") as prots_aligned:
g="".join(prots_aligned).split(">")[1:]
for n in g:
n=n.split()
headers=headers + (n[0]) + "\n"
Heads=[f for f in headers.split("\n")] # for headers more than one letter...
testM=""
with open(alignment, "rt") as prots_aligned:
g="".join(prots_aligned).split(">")[1:]
for n in g:
n=n.split()
head=n[0]
sequence="".join(n[1:])
for gg in Heads:
if gg == head:
testM = testM + sequence
# list of strings, stays in same order so can add to Bepipred
# as long as alignment is in same order as was input to bepipred.
allM=[]
for i in zz:
k=i.split()
jj=[v for v in k]
jl=jj[0] + " " + jj[4] + " " + jj[5]
allM.append(jl)
res1=[l for l in testM]
out1 = '\n'.join(map(' '.join, zip(allM,res1)))
#print(out1)
removed_gaps=[]
for h in out1.split("\n"):
h=h.split()
if "-" in h[3]:
pass
else:
removed_gaps.append(h)
final=[",".join(p) for p in removed_gaps]
out.write("Sequence" + "," + "AA" + "," + "Ag" + "," + "code" + "\n" + "\n".join(final))
## NOTES ##
#separate out only ag_score lines (score lines have bepipred and
#one line has source that isnt a score line
#function to perform alternate line merging of strings
#In1="acegi"
#In2="bdfhj"
#out1 = ''.join(map(''.join, zip(In1, In2)))
# bepipred 1 replaces gaps and stops with A so cant use replace! :(
# best bet is to index gaps and results...ad cut the lines relating to sequence and not inserted A's
# bepipred remove ##Type Protein A
# seqname source feature start end score N/A ?
#Manual option
#1. Replace all gaps with X sed -i 's/-/X/g' AG_test_proteins.fasta
#2. Run Bepipred2 on online server
#3. edit csv output by removing lines with X's.
#4.plot as usual