-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_coding_reads_circ.py
340 lines (275 loc) · 14.4 KB
/
filter_coding_reads_circ.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import yaml
import argparse
import time
import os
import random
def make_mrna_rrna_index(transcript_fasta, ribosome_fasta, thread, tmp_file_path, raw_reads, soft_log_folder):
read_name = str(raw_reads).split('/')[-1].split('.')[0]
transcript_name = str(transcript_fasta).split('/')[-1].split('.')[0]
ribosome_name = str(ribosome_fasta).split('/')[-1].split('.')[0]
print(get_time(), 'make rRNA index...')
rRNA_index_log = soft_log_folder + '/' + read_name +'_rRNA_index_log'
os.system('bowtie-build --threads {} {} {}/{} > {}'.format(thread, ribosome_fasta, tmp_file_path, ribosome_name, rRNA_index_log))
print(get_time(), 'make transcript index...')
trans_index_log = soft_log_folder + '/' + read_name +'_transcprit_index_log'
os.system('bowtie-build --threads {} {} {}/{} > {}'.format(thread, transcript_fasta, tmp_file_path, transcript_name, trans_index_log))
def predict_adapter(raw_reads, ribotype, tmp_file_path):
read_name = str(raw_reads).split('/')[-1].split('.')[0]
fastq_reads = raw_reads
if ribotype == 'sra':
print(get_time(), '{} sra to fastq...'.format(read_name))
os.system('fastq-dump {} -O {}'.format(raw_reads, tmp_file_path))
fastq_reads = tmp_file_path + '/' + read_name + '.fastq'
fasta_reads = tmp_file_path + '/' + fastq_reads.split('/')[-1].split('.')[0]+'.fasta'
# .fa is easy to predict adapter_start
print(get_time(), 'fastq to fasta...')
os.system('sed -n \'1~4s/^@/>/p;2~4p\' {} > {}'.format(fastq_reads, fasta_reads))
# refer to these reads
refer_reads_num = 5
# obtain the index of refer reads
print(get_time(), 'obtain refer reads...')
han_fasta = open(fasta_reads)
fasta_lines_num = int(len(han_fasta.readlines()))
han_fasta.close()
refer_reads_index = []
while len(refer_reads_index) < refer_reads_num:
reads_index = random.randrange(1, fasta_lines_num, 2)
if reads_index not in refer_reads_index:
refer_reads_index.append(reads_index)
# window scanning and predict adapters
print(get_time(), 'analysis refer reads...')
predict_adapter_window = 8
adapter_start_rate_list = [0.8, 0.7, 0.6]
tmp_read_file = tmp_file_path + '/' + 'tmp_read.txt'
tmp_count_file = tmp_file_path + '/' + 'tmp_count.txt'
global adapter_start_set
adapter_start_set = set()
for adapter_start_rate in adapter_start_rate_list:
for i in refer_reads_index:
os.system('sed -n \'{},{}p\' {} > {}'.format(i, i+1, fasta_reads, tmp_read_file))
han_tmp_read = open(tmp_read_file)
for seq in SeqIO.parse(han_tmp_read, 'fasta'):
for site in range(0,len(seq.seq)-8):
tmp_seq = str(seq.seq[site:site+8])
os.system('grep -c {} {} > {}'.format(tmp_seq, fasta_reads, tmp_count_file))
han_tmp_count = open(tmp_count_file)
tmp_seq_count = int(han_tmp_count.read())
han_tmp_count.close()
if tmp_seq_count/fasta_lines_num*2 >= adapter_start_rate:
adapter_start_set.add(tmp_seq)
break
han_tmp_read.close()
if len(adapter_start_set) != 0:
break
if len(adapter_start_set) == 0:
print(get_time(), 'no adapter in these reads')
elif len(adapter_start_set) != 0:
print(get_time(), 'adapter_start in reads rate: {}'.format(adapter_start_rate))
print(get_time(), 'adapter_start is: {}'.format(adapter_start_set))
def deal_raw_reads(raw_reads, tmp_file_path, thread, transcript_fasta, ribosome_fasta, trimmomatic, ribotype, soft_log_folder):
read_name = str(raw_reads).split('/')[-1].split('.')[0]
if ribotype == 'sra':
fastq_reads = tmp_file_path + '/' + read_name + '.fastq'
else:
fastq_reads = raw_reads
fastq_reads_cutadapt = tmp_file_path + '/' + read_name
# cut predicted adapters
print(get_time(), 'cut predicted adapters...')
if len(adapter_start_set) != 0:
adapter_command = ''
fastq_reads_cutadapt = tmp_file_path + '/' + read_name + '_cutadapt'
cutadapt_log = soft_log_folder + '/' + read_name + '_cutadapt_log'
for item in adapter_start_set:
adapter_command += ' -a ' + item
os.system('cutadapt -j {}{} -o {} {} > {}'.format(thread, adapter_command, fastq_reads_cutadapt+ '.fastq', fastq_reads, cutadapt_log))
# Filter out low quality reads by Trimmomatic
print(get_time(), 'filter out low quality reads...')
fastq_reads_cutadapt_clean = fastq_reads_cutadapt + '_clean'
trim_log = soft_log_folder + '/' + read_name + '_trim_log'
os.system('java -jar {} SE -threads {} {} {} LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:16 > {}'.format(trimmomatic, thread, fastq_reads_cutadapt+'.fastq', fastq_reads_cutadapt_clean+'.fastq', trim_log))
# Map clean reads to transcript sequence by bowtie
print(get_time(), 'clean reads from transcript...')
unmapped_transcript_reads = fastq_reads_cutadapt_clean + '_untranscipt'
transcript_name = str(transcript_fasta).split('/')[-1].split('.')[0]
transcript_index = tmp_file_path + '/' + transcript_name
log_trans = soft_log_folder + '/' + transcript_name + '.map_to_transcript.sam'
os.system('bowtie -p {} --un {} --norc {} {} > {}'.format(thread, unmapped_transcript_reads+'.fastq', transcript_index, fastq_reads_cutadapt_clean+'.fastq', log_trans))
# Map clean reads to ribosome sequence by bowtie
print(get_time(), 'clean reads from rRNA...')
global unmapped_rRNA_reads
unmapped_rRNA_reads = unmapped_transcript_reads + '_unrRNA'
ribosome_name = str(ribosome_fasta).split('/')[-1].split('.')[0]
rRNA_index = tmp_file_path + '/' + ribosome_name
log_out = soft_log_folder + '/' + ribosome_name + '.map_to_rRNA.sam'
os.system('bowtie -p {} --un {} --norc {} {} > {}'.format(thread, unmapped_rRNA_reads+'.fastq', rRNA_index, unmapped_transcript_reads+'.fastq', log_out))
global final_reads
final_reads = unmapped_rRNA_reads+'.fastq'
def make_junction_seq(raw_circrnas, circrna_gtf, raw_reads, tmp_file_path):
print(get_time(), 'make junction seq...')
# transform the final fastq to fasta, easy to filter
global final_fasta_reads
final_fasta_reads = unmapped_rRNA_reads + '.fasta'
os.system('sed -n \'1~4s/^@/>/p;2~4p\' {} > {}'.format(final_reads, final_fasta_reads))
max_reads_lenth = 0
global reads_lenth_dic
reads_lenth_dic = {}
han_fasta_reads = open(final_fasta_reads)
for seq_record in SeqIO.parse(han_fasta_reads, 'fasta'):
reads_lenth_dic[seq_record.id] = len(seq_record.seq)
if len(seq_record.seq) > max_reads_lenth:
max_reads_lenth = len(seq_record.seq)
# get the biggest length of reads, assemble the junction file
global junc_site
junc_site = max_reads_lenth
read_name = str(raw_reads).split('/')[-1].split('.')[0]
circ_name = read_name + '_' + str(raw_circrnas).split('/')[-1].split('.')[0]
han_raw_circ = open(raw_circrnas)
han_circ_gtf = open(circrna_gtf)
# reverse the '-' circRNA
dic_strand = {}
for line in han_circ_gtf:
line = line.replace('\n', '')
line_list = line.split('\t')
dic_strand[line_list[0]] = line_list[1]
global true_raw_file
true_raw_file = tmp_file_path +'/'+ circ_name + '_true.fa'
global true_junc_file
true_junc_file = tmp_file_path + '/' + circ_name + '_true_junc.fa'
data_circ = []
data_junc = []
junc_dic = {}
# make the true seq file and junction file
for seq_record in SeqIO.parse(han_raw_circ, 'fasta'):
strand = dic_strand[seq_record.id]
if strand == '-':
seq_true = seq_record.seq.reverse_complement()
rec_circ_true = SeqRecord(seq_true,
id = seq_record.id,
description = '')
data_circ.append(rec_circ_true)
seq_true_junc = seq_true[-max_reads_lenth:]+seq_true[:max_reads_lenth]
rec_junc_true = SeqRecord(seq_true_junc,
id = seq_record.id,
description = '')
data_junc.append(rec_junc_true)
elif strand == '+':
rec_circ_true = SeqRecord(seq_record.seq,
id = seq_record.id,
description = '')
data_circ.append(rec_circ_true)
seq_true_junc = seq_record.seq[-max_reads_lenth:]+seq_record.seq[:max_reads_lenth]
rec_junc_true = SeqRecord(seq_true_junc,
id = seq_record.id,
description = '')
data_junc.append(rec_junc_true)
han_true = open(true_raw_file, 'w+')
SeqIO.write(data_circ, han_true, 'fasta')
han_junc = open(true_junc_file, 'w+')
SeqIO.write(data_junc, han_junc, 'fasta')
han_true.close()
han_junc.close()
han_raw_circ.close()
han_circ_gtf.close()
han_fasta_reads.close()
def make_junc_index_align(raw_reads, raw_circrnas, thread, tmp_file_path, soft_log_folder):
read_name = str(raw_reads).split('/')[-1].split('.')[0]
circ_name = read_name + '_' + str(raw_circrnas).split('/')[-1].split('.')[0]
# make junction index
print(get_time(), 'make junction index...')
junc_index_log = soft_log_folder + '/' + read_name +'_junc_index_log'
os.system('./requiredSoft/STAR --runMode genomeGenerate --runThreadN {} --genomeDir {} --genomeFastaFiles {} > {}'.format(thread, tmp_file_path+'/', true_junc_file, junc_index_log))
# map the junction to filter reads
print(get_time(), 'map the filter reads to the junction of circRNA...')
global out_name
out_name = tmp_file_path +'/'+ circ_name
junc_map_reads_log = soft_log_folder + '/' + read_name +'_junc_map_to_reads_log'
os.system('./requiredSoft/STAR --runThreadN {} --outSAMtype BAM SortedByCoordinate --genomeDir {} --readFilesIn {} --outFileNamePrefix {} > {}'.format(thread, tmp_file_path+'/', final_fasta_reads, out_name, junc_map_reads_log))
def bamtobed():
print(get_time(), 'transform the bamflie to bedfile')
bamfile = out_name + 'Aligned.sortedByCoord.out.bam'
global bedfile
bedfile = bamfile + '.bamtobedresult.bed'
os.system('bedtools bamtobed -bed12 -i {} > {}'.format(bamfile, bedfile))
def filter_coding_circ_and_reads_index(raw_reads,tmp_file_path):
print(get_time(), 'filter coding ablity by ribo-seq data...')
coding_circ_set = set()
coding_circ_reads_site = {}
han_bed = open(bedfile)
# save the circ which reads cross the junction, save the reads site
for line in han_bed:
list_line = line.split('\t')
circ_id = list_line[0]
read_id = list_line[3]
reads_strand = list_line[5]
start_site = eval(list_line[1]) + 1
stop_site = eval(list_line[1]) + reads_lenth_dic[read_id]
if start_site <= junc_site and stop_site >junc_site:
coding_circ_set.add(circ_id)
if circ_id in coding_circ_reads_site.keys():
coding_circ_reads_site[circ_id] += [start_site-junc_site-1, stop_site-junc_site]
if circ_id not in coding_circ_reads_site.keys():
coding_circ_reads_site[circ_id] = [start_site-junc_site-1, stop_site-junc_site]
# write the reads site to new file
read_name = str(raw_reads).split('/')[-1].split('.')[0]
reads_cover_jun_file = tmp_file_path + '/' + read_name + '_read_cover_jun.txt'
han_reads_cover = open(reads_cover_jun_file, 'w+')
for key in coding_circ_reads_site.keys():
han_reads_cover.write(key + '\t' +str(int(len(coding_circ_reads_site[key])/2)))
for item in coding_circ_reads_site[key]:
han_reads_cover.write('\t'+str(item))
han_reads_cover.write('\t'+ str(int(len(coding_circ_reads_site[key])/2)) +'\n')
# write the circRNA with ribo_seq data to new file
han_raw_circ = open(true_raw_file)
read_name = str(raw_reads).split('/')[-1].split('.')[0]
coding_circ_file = tmp_file_path + '/' + read_name + '_true.fa'
coding_circ_seq_list = []
for seq_record in SeqIO.parse(han_raw_circ, 'fasta'):
if seq_record.id in coding_circ_set:
junc_reads_count = str(int(len(coding_circ_reads_site[seq_record.id])/2))
rec_coding = SeqRecord(seq_record.seq,
id=seq_record.id+'$'+junc_reads_count,
description='')
coding_circ_seq_list.append(rec_coding)
han_coding_circ = open(coding_circ_file,'w+')
SeqIO.write(coding_circ_seq_list, han_coding_circ, 'fasta')
han_bed.close()
han_reads_cover.close()
han_coding_circ.close()
def get_time():
nowtime = time.strftime('[%Y-%m-%d %H:%M:%S]', time.localtime())
return nowtime
def main():
parse = argparse.ArgumentParser(description='This script helps to make junction index for each circ!')
parse.add_argument('-y', '--yaml', required=True, help='please input the config.yaml file')
args = parse.parse_args()
yamlfile = args.yaml
con_file = open(yamlfile)
fileload = yaml.full_load(con_file)
raw_reads = fileload['raw_reads']
transcript_fasta = fileload['transcript_fasta']
ribosome_fasta = fileload['ribosome_fasta']
tmp_file_path = fileload['tmp_file_location']
trimmomatic = fileload['trimmomatic_jar']
thread = fileload['thread']
ribotype = fileload['ribotype']
raw_circrnas = fileload['circrnas']
circrna_gtf = fileload['circrna_gtf']
if not os.path.exists(tmp_file_path):
os.makedirs(tmp_file_path)
soft_log_folder = tmp_file_path + '/' + 'log'
if not os.path.exists(soft_log_folder):
os.makedirs(soft_log_folder)
for item in raw_reads:
raw_reads = item
predict_adapter(raw_reads, ribotype, tmp_file_path)
make_mrna_rrna_index(transcript_fasta, ribosome_fasta, thread, tmp_file_path, raw_reads, soft_log_folder)
deal_raw_reads(raw_reads, tmp_file_path, thread, transcript_fasta, ribosome_fasta, trimmomatic, ribotype, soft_log_folder)
make_junction_seq(raw_circrnas, circrna_gtf, raw_reads, tmp_file_path)
make_junc_index_align(raw_reads, raw_circrnas, thread, tmp_file_path, soft_log_folder)
bamtobed()
filter_coding_circ_and_reads_index(raw_reads,tmp_file_path)
if __name__ == '__main__':
main()