-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_snp_calling.py
executable file
·220 lines (186 loc) · 12.2 KB
/
make_snp_calling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python3
import argparse
import jinja2
import os
import json
from globalsearch.rnaseq.find_files import find_fastq_files
DESCRIPTION = "make_fixed_snp_calling"
SCRIPT_TEMPLATE_PRE = """#!/bin/bash
#SBATCH -J resr_mtb
if [ ! -d "{{common_result_path}}" ]; then
mkdir -p {{common_result_path}}
fi
if [ ! -d "{{fixed_result_path}}" ]; then
mkdir -p {{fixed_result_path}}
fi
if [ ! -d "{{unfixed_result_path}}" ]; then
mkdir -p {{unfixed_result_path}}
fi
if [ ! -d "{{tbprofiler_result_path}}" ]; then
mkdir -p {{tbprofiler_result_path}}
fi
"""
SCRIPT_TEMPLATE_PAIRED = """
sickle pe -l 35 -f {{fastq1}} -r {{fastq2}} -t sanger -o {{common_result_path}}/{{trimmed1}} -p {{common_result_path}}/{{trimmed2}} -s {{common_result_path}}/{{trimmedS}}
bwa aln -R 1 {{fasta_path}} {{common_result_path}}/{{trimmed1}} > {{common_result_path}}/{{sai1}}
bwa aln -R 1 {{fasta_path}} {{common_result_path}}/{{trimmed2}} > {{common_result_path}}/{{sai2}}
bwa aln -R 1 {{fasta_path}} {{common_result_path}}/{{trimmedS}} > {{common_result_path}}/{{saiS}}
bwa sampe -a 1000 -n 1 -N 1 {{fasta_path}} {{common_result_path}}/{{sai1}} {{common_result_path}}/{{sai2}} {{common_result_path}}/{{trimmed1}} {{common_result_path}}/{{trimmed2}} > {{common_result_path}}/{{paired_sam}}
bwa samse -n 1 {{fasta_path}} {{common_result_path}}/{{saiS}} {{common_result_path}}/{{trimmedS}} > {{common_result_path}}/{{single_sam}}
samtools view -bhSt {{fasta_path}}.fai {{common_result_path}}/{{paired_sam}} -o {{common_result_path}}/{{paired_bam}}
samtools view -bhSt {{fasta_path}}.fai {{common_result_path}}/{{single_sam}} -o {{common_result_path}}/{{single_bam}}
samtools merge {{common_result_path}}/{{merged_bam}} {{common_result_path}}/{{paired_bam}} {{common_result_path}}/{{single_bam}}
samtools sort {{common_result_path}}/{{merged_bam}} -o {{common_result_path}}/{{sorted_bam}}
"""
SCRIPT_TEMPLATE_SINGLE = """
sickle se -l 35 -f {{fastq}} -t sanger -o {{common_result_path}}/{{trimmed}}
bwa aln -R 1 {{fasta_path}} {{common_result_path}}/{{trimmed}} > {{common_result_path}}/{{sai}}
bwa samse -n 1 {{fasta_path}} {{common_result_path}}/{{sai}} {{common_result_path}}/{{trimmed}} > {{common_result_path}}/{{single_sam}}
samtools view -bhSt {{fasta_path}}.fai {{common_result_path}}/{{single_sam}} -o {{common_result_path}}/{{single_bam}}
samtools sort {{common_result_path}}/{{single_bam}} -o {{common_result_path}}/{{sorted_bam}}
"""
SCRIPT_TEMPLATE_COMMON = """
# Start here with sorted BAM
samtools index {{common_result_path}}/{{sorted_bam}}
samtools mpileup -q 30 -Q 20 -ABOf {{fasta_path}} {{common_result_path}}/{{sorted_bam}} > {{common_result_path}}/{{pileup_file}}
java -jar {{varscan_path}} mpileup2snp {{common_result_path}}/{{pileup_file}} --min-coverage 3 --min-reads2 2 --min-avg-qual 20 --min-var-freq 0.01 --min-freq-for-hom 0.9 --p-value 99e-02 --strand-filter 1 > {{common_result_path}}/{{varscan_file}}
java -jar {{varscan_path}} mpileup2cns {{common_result_path}}/{{pileup_file}} --min-coverage 3 --min-avg-qual 20 --min-var-freq 0.75 --min-reads2 2 --strand-filter 1 > {{common_result_path}}/{{cns_file}}
# FIXED specific part
perl {{resr_script_path}}/PPE_PE_INS_filt.pl {{resr_db_path}}/PPE_INS_loci.list {{common_result_path}}/{{varscan_file}} > {{fixed_result_path}}/{{ppe_file}}
perl {{resr_script_path}}/fixed_format_trans.pl {{fixed_result_path}}/{{ppe_file}} > {{fixed_result_path}}/{{for_file}}
perl {{resr_script_path}}/fix_extract.pl {{fixed_result_path}}/{{for_file}} > {{fixed_result_path}}/{{fix_file}}
perl {{resr_script_path}}/unfix_pileup_match.pl {{fixed_result_path}}/{{for_file}} {{common_result_path}}/{{pileup_file}} > {{fixed_result_path}}/{{forup_file}}
cut -f2-4 {{fixed_result_path}}/{{fix_file}} > {{fixed_result_path}}/{{snp_file}}
python {{resr_script_path}}/annotate_mtb_results.py {{fixed_result_path}}/{{snp_file}} {{common_result_path}}/{{varscan_file}} {{resr_db_path}} > {{fixed_result_path}}/{{result_file}}
# Unfixed part
# use cns and varscan file from fixed
#exclude regions belonging to PPE/PE and insertion sequences, and also exclude the regions that were recently marked as error-prone (Marin, Bioinformatics, 2022)
#perl {{resr_script_path}}/PE_IS_filt.pl {{resr_db_path}}/Excluded_loci_mask.list {{common_result_path}}/{{varscan_file}} > {{unfixed_result_path}}/{{ppe_file}}
python {{resr_script_path}}/varscan_drop_excluded.py {{resr_db_path}}/Excluded_loci_mask.list {{common_result_path}}/{{varscan_file}} {{unfixed_result_path}}/{{ppe_file}}
perl {{resr_script_path}}/unfixed_format_trans.pl {{unfixed_result_path}}/{{ppe_file}} > {{unfixed_result_path}}/{{for_file}}
#extract read location from mpileup file (where does a mutation allele locate on a seqeuncing read), for further filtering based on tail distribution
perl {{resr_script_path}}/mix_pileup_merge.pl {{unfixed_result_path}}/{{for_file}} {{common_result_path}}/{{pileup_file}} > {{unfixed_result_path}}/{{forup_file}}
#average sequencing depth, only include samples with genome coverage rate >0.9 and sequencing depth >20X
# the python script has the same effect as the sed/awk line, but is easier to understand
#sed 's/:/\\t/g' {{common_result_path}}/{{cns_file}}|awk '{if ($6 >= 3){n++;sum+=$6}} END {print \"\\t\",n/4411532,\"\\t\",sum/n}' > {{unfixed_result_path}}/{{dep_file}}
python3 {{resr_script_path}}/avg_sequencing_depth.py {{common_result_path}}/{{cns_file}} > {{unfixed_result_path}}/{{dep_file}}
#extract unfixed SNPs from forup files, this will create two files: "markdisc" and "markkept"; the suspected false positives(such as mutations with tail region enrichment) will be moved to markdisc file
perl {{resr_script_path}}/mix_extract_0.95.pl {{unfixed_result_path}}/{{forup_file}} > {{unfixed_result_path}}/{{mix_file}}
perl {{resr_script_path}}/forup_format.pl {{unfixed_result_path}}/{{mix_file}} > {{unfixed_result_path}}/{{mixfor_file}}
perl {{resr_script_path}}/info_mark.pl {{unfixed_result_path}}/{{mixfor_file}} > {{unfixed_result_path}}/{{mixmark_file}}
# this script implicitly generates ".mixmarkkept" and ".mixmarkfilt"
python {{resr_script_path}}/redepin_filt.py {{resr_db_path}}/Excluded_loci_mask.list {{unfixed_result_path}}/{{dep_file}} {{unfixed_result_path}}/{{mixmark_file}}
# Process unfiltered result
cat <(cut -f9-11 {{unfixed_result_path}}/{{markkept_file}}) <(cut -f9-11 {{unfixed_result_path}}/{{mixmark_file}}) | sort -u | uniq > {{unfixed_result_path}}/{{keptsnp_file}}
python {{resr_script_path}}/annotate_mtb_results.py {{unfixed_result_path}}/{{keptsnp_file}} {{common_result_path}}/{{varscan_file}} {{resr_db_path}} > {{unfixed_result_path}}/{{result_file}}
"""
SCRIPT_TEMPLATE_TBPROFILER1 = """
tb-profiler profile --bam {{common_result_path}}/{{sample_id}}.sorted.bam --dir {{tbprofiler_result_path}} --prefix {{sample_id}} --csv
"""
SCRIPT_TEMPLATE_TBPROFILER2 = """
conda run -n {{conda_env}} tb-profiler profile --bam {{common_result_path}}/{{sample_id}}.sorted.bam --dir {{tbprofiler_result_path}} --prefix {{sample_id}} --csv
"""
# Cleanup for large files
SCRIPT_TEMPLATE_CLEANUP = """
rm -rf {{common_result_path}}
rm -f {{fixed_result_path}}/*.{fix,for,forup,ppe,snp}
rm -f {{unfixed_result_path}}/*.{dep,for,forup,keptsnp,mix,mixfor,mixmark,mixmarkdisc,mixmarkkept,ppe}
rm -rf {{tbprofiler_result_path}}/bam
rm -rf {{tbprofiler_result_path}}/vcf
tar cvfz {{result_archive}} {{fixed_result_path}} {{unfixed_result_path}} {{tbprofiler_result_path}}
"""
if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
description=DESCRIPTION)
parser.add_argument('--paired_fastq_pattern', default="*_{{readnum}}.fastq", help="paired FASTQ pattern")
parser.add_argument('--single_fastq_pattern', default="*.fastq", help="single end FASTQ pattern")
parser.add_argument('--config', default="config.json", help="config file")
parser.add_argument('input_path', help="input path to sample")
parser.add_argument('result_path', help="result path")
args = parser.parse_args()
paired_end = True
abs_input_path = os.path.abspath(args.input_path)
fastq_files = find_fastq_files(abs_input_path, [args.paired_fastq_pattern])
if len(fastq_files) == 0:
print("no paired FASTQ files found - generating single read pipeline")
paired_end = False
fastq_files = find_fastq_files(abs_input_path, [args.single_fastq_pattern])
if len(fastq_files) == 0:
print("NO single FASTQ files found - FAILURE !!!")
exit(1)
else:
fq1 = fastq_files[0][0]
basename = os.path.basename(fq1)
stem0 = basename.replace(".fastq", "").replace("gz", "")
else:
print("paired FASTQ files found - generating paired end pipeline")
fq1, fq2 = fastq_files[0]
basename1 = os.path.basename(fq1)
basename2 = os.path.basename(fq2)
stem1 = basename1.replace(".fastq", "").replace("gz", "")
stem2 = basename2.replace(".fastq", "").replace("gz", "")
# depending on the readnum format
stem0 = stem1.replace("_1", "")
with open(args.config) as infile:
config_file = json.load(infile)
print(config_file)
resr_script_path = "scripts"
resr_db_path = "databases"
common_result_path = os.path.join(args.result_path, "common")
fixed_result_path = os.path.join(args.result_path, "fixed")
unfixed_result_path = os.path.join(args.result_path, "unfixed")
tbprofiler_result_path = os.path.join(args.result_path, "tbprofiler")
# TODO: check if the BWA indexes exist, otherwise generate them with
# bwa index -a bwtsw <fasta>
config = {
"log_dir": config_file['log_dir'],
"sample_id": stem0,
"fasta_path": config_file['fasta_path'],
"resr_script_path": resr_script_path,
"resr_db_path": resr_db_path,
"varscan_path": config_file['varscan_path'],
"fixed_result_path": fixed_result_path,
"unfixed_result_path": unfixed_result_path,
"common_result_path": common_result_path,
"tbprofiler_result_path": tbprofiler_result_path,
"paired_sam": "%s.paired.sam" % stem0, "single_sam": "%s.single.sam" % stem0,
"paired_bam": "%s.paired.bam" % stem0, "single_bam": "%s.single.bam" % stem0,
"merged_bam": "%s.merged.bam" % stem0, "sorted_bam": "%s.sorted.bam" % stem0,
"pileup_file": "%s.pileup" % stem0, "varscan_file": "%s.varscan" % stem0,
"cns_file": "%s.cns" % stem0, "ppe_file": "%s.ppe" % stem0,
"for_file": "%s.for" % stem0, "fix_file": "%s.fix" % stem0,
"forup_file": "%s.forup" % stem0, "snp_file": "%s.snp" % stem0,
"dep_file": "%s.dep" % stem0, "mix_file": "%s.mix" % stem0,
"mixfor_file": "%s.mixfor" % stem0, "mixmark_file": "%s.mixmark" % stem0,
"markkept_file": "%s.mixmarkkept" % stem0,
"keptfilt_file": "%s.keptfilt" % stem0, "keptsnp_file": "%s.keptsnp" % stem0,
"keptanofilt_file": "%s.keptanofilt" % stem0,
"result_file": "%s.finalresult" % stem0,
"result_archive": "%s-results.tar.gz" % stem0
}
if "tb_profiler_env" in config_file:
tb_profiler_templ = SCRIPT_TEMPLATE_TBPROFILER2
config['conda_env'] = config_file['tb_profiler_env']
else:
tb_profiler_templ = SCRIPT_TEMPLATE_TBPROFILER1
if paired_end:
config.update({
"fastq1": fq1, "fastq2": fq2,
"trimmed1": "%s_trimmed.fq" % stem1, "trimmed2": "%s_trimmed.fq" % stem2,
"trimmedS": "%s_trimmedS.fq" % stem0,
"sai1": "%s.sai" % stem1, "sai2": "%s.sai" % stem2, "saiS": "%s_S.sai" % stem0
})
script_templ = jinja2.Template(SCRIPT_TEMPLATE_PRE + SCRIPT_TEMPLATE_PAIRED +
SCRIPT_TEMPLATE_COMMON + tb_profiler_templ +
SCRIPT_TEMPLATE_CLEANUP)
else:
config.update({
"fastq": fq1, "trimmed": "%s_trimmedS.fq" % stem0,
"sai": "%s.sai" % stem0
})
script_templ = jinja2.Template(SCRIPT_TEMPLATE_PRE + SCRIPT_TEMPLATE_SINGLE +
SCRIPT_TEMPLATE_COMMON + tb_profiler_templ +
SCRIPT_TEMPLATE_CLEANUP)
script_out = script_templ.render(config)
with open("%s_snp_calling.sh" % stem0, "w") as outfile:
outfile.write(script_out)