-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf2fasta.sjf.py
142 lines (130 loc) · 3.82 KB
/
vcf2fasta.sjf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gzip
import sys
import time
import argparse
from multiprocessing import Pool
def get_list(in_file):
try:
global decode
if in_file[in_file.rfind(".")+1:] == "gz":
vcf = gzip.open(in_file,'r')
decode=True
else:
vcf = open(in_file,'r')
decode=False
except IOError:
print("ERROR: {0} do not exists.".format(in_file))
else:
for line in vcf:
if decode:
line=line.decode('utf-8')
if line.startswith("#CHROM"):
sample_list = line.split()[9:]
break
finally:
vcf.close()
return sample_list
def recode(sample_id):
print('[{0}] INFO: Start convert {1}'.format(time.ctime(),sample_id))
in_file=dict_run[sample_id][0]
out_prefix=dict_run[sample_id][1]
i=int(dict_run[sample_id][2])
dict_allel = {
'..' : '-',
'AA':'A', 'TT':'T', 'CC':'C', 'GG':'G',
'AC':'M', 'CA':'M',
'AG':'R', 'GA':'R',
'AT':'W', 'TA':'W',
'CG':'S', 'GC':'S',
'CT':'Y', 'TC':'Y',
'GT':'K', 'TG':'K'
}
try:
if decode:
vcf = gzip.open(in_file,'r')
else:
vcf = open(in_file,'r')
except IOError:
print("[{0}] ERROR: {1} do not exists.".format(time.ctime(),in_file))
else:
output = open('{0}/{1}.fasta'.format(out_prefix,sample_id),'w')
output.write('>{0}'.format(sample_id)+'\n')
fas_box=[]
if decode:
for line in vcf:
line=line.decode('utf-8')
if line.startswith("#"):
pass
else:
ref=line.split()[3]
alt=line.split()[4]
dict_temp={
'0/0':ref+ref,'0|0':ref+ref,
'0/1':ref+alt,'0|1':ref+alt,
'1/1':alt+alt,'1|1':alt+alt,
'./.':'..'
}
try:
snp=dict_allel[dict_temp[line.split()[9+i].split(':')[0]]]
fas_box.append(snp)
if len(fas_box)==100:
output.write(''.join(fas_box)+'\n')
fas_box=[]
except KeyError:
print(line)
output.write(''.join(fas_box)+'\n')
else:
for line in vcf:
if line.startswith("#"):
pass
else:
ref=line.split()[3]
alt=line.split()[4]
dict_temp={
'0/0':ref+ref,'0|0':ref+ref,
'0/1':ref+alt,'0|1':ref+alt,
'1/1':alt+alt,'1|1':alt+alt,
'./.':'..'
}
snp=dict_allel[dict_temp[line.split()[9+i].split(':')[0]]]
fas_box.append(snp)
if len(fas_box)==100:
output.write(''.join(fas_box)+'\n')
fas_box=[]
output.write(''.join(fas_box)+'\n')
finally:
vcf.close()
output.close()
print('[{0}] INFO: Finish convert {1}'.format(time.ctime(),sample_id))
def final_run():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description="Convert VCF format to fasta format (per sample).",
epilog='''
@author: Jingfang SI
@copyright: 2018 China Agricultural University. All rights reserved.
@contact: [email protected]
'''
)
parser.add_argument("-v", "--vcf", required=True,help="specify the input vcf(.vcf/.vcf.gz) file")
parser.add_argument("-op", "--out-prefix", required=True,help="specify the prefix of output file")
parser.add_argument("-nt", "--num-threads", required=True,type=int, default=1,help="specify the number of data threads")
args = parser.parse_args()
in_file = args.vcf
out_prefix = args.out_prefix
num_threads = args.num_threads
sample_list=get_list(in_file)
global dict_run
dict_run={}
for sample_id in sample_list:
dict_run[sample_id]=[in_file,out_prefix,sample_list.index(sample_id)]
print('[{0}] INFO: Convert vcf to fasta. Total samples : {1}.'.format(time.ctime(),len(sample_list)))
current_time = time.time()
with Pool(num_threads) as p:
p.map(recode,sample_list)
p.close()
p.join()
print('[{0}] INFO: Done'.format(time.ctime()))
# print('[{0}] INFO: Total running time {1}'.format(time.ctime(),time.strftime("%H:%M:%S", time.mgtime(time.time()-current_time))))
if __name__ == '__main__':
final_run()