-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathmantis.py
executable file
·289 lines (229 loc) · 10.9 KB
/
mantis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python
# @file mantis.py
# @author Esko Kautto ([email protected])
# @updated 2016-07-05
import os
import sys
import argparse
import operator
import subprocess
from helpers import iteritems, check_bedfile_format, required_modules_present
from defaults import load_settings, Parameter
if __name__ == "__main__":
prog_name = 'Microsatellite Analysis for Normal-Tumor InStability (v1.0.4)'
print(prog_name)
# Make sure Pysam and NumPy are available in environment
required_modules_present(['Pysam', 'NumPy'])
parser = argparse.ArgumentParser(description=prog_name)
parser.add_argument('-cfg', '--config', dest='cfg', type=str,
help='Configuration file.')
parser.add_argument('-n', '--normal', dest='normal', type=str, required=True,
help='Normal input BAM file.')
parser.add_argument('-t', '--tumor', dest='tumor', type=str, required=True,
help='Tumor input BAM file.')
parser.add_argument('--threads', dest='threads', type=int,
help='How many threads (processes) to use.')
parser.add_argument('-b', '--bedfile', dest='bedfile', type=str, help='Input BED file.')
parser.add_argument('-o', '--output', dest='output', type=str, help='Output filename.')
parser.add_argument('-mrq', '--min-read-quality', dest='mrq', type=float,
help='Minimum average read quality.')
parser.add_argument('-mlq', '--min-locus-quality', dest='mlq', type=float,
help='Minimum average locus quality.')
parser.add_argument('-mrl', '--min-read-length', dest='mrl', type=int,
help='Minimum (unclipped/unmasked) read length.')
parser.add_argument('-mlc', '--min-locus-coverage', dest='mlc', type=int,
help='Minimum coverage required for each of the normal ' +
'and tumor results.')
parser.add_argument('-mrr', '--min-repeat-reads', dest='mrr', type=int,
help='Minimum reads supporting a specific repeat count.')
parser.add_argument('-sd', '--standard-deviations', dest='sd', type=float,
help='Standard deviations from mean before repeat count is ' +
'considered an outlier')
parser.add_argument('--genome', dest='genome', type=str,
help='Path to reference genome (FASTA).')
parser.add_argument('--difference-threshold', dest='dif_threshold', type=float,
help='Default difference threshold value for calling a sample unstable.')
parser.add_argument('--distance-threshold', dest='euc_threshold', type=float,
help='Default distance threshold value for calling a sample unstable.')
parser.add_argument('--dissimilarity-threshold', dest='cos_threshold', type=float,
help='Default dissimilarity threshold value for calling a sample unstable.')
args = parser.parse_args()
if args.cfg is not None:
config_file_path = os.path.abspath(args.cfg)
if not os.path.isfile(config_file_path):
print('Error! Config file {0} not found!'.format(config_file_path))
exit()
else:
config_file_path = os.path.join(os.path.dirname(
os.path.realpath(__file__)), 'mantis_config.cfg')
# List of configuration parameters/arguments used by the three-tier
# config priority loader.
params = [
Parameter(key='bedfile', required=True),
Parameter(key='normal_filepath', arg_key='normal', required=True),
Parameter(key='tumor_filepath', arg_key='tumor', required=True),
Parameter(key='output_filepath', arg_key='output', required=True),
Parameter(key='genome', required=True),
Parameter(key='mrq', default=25.0),
Parameter(key='mlq', default=30.0),
Parameter(key='mrl', default=35),
Parameter(key='mlc', default=30),
Parameter(key='mrr', default=3),
Parameter(key='sd', default=3.0),
Parameter(key='threads', default=1),
Parameter(key='dif_threshold', default=0.4),
Parameter(key='euc_threshold', default=0.187),
Parameter(key='cos_threshold', default=0.07),
]
# Use config/setting loader.
config = load_settings(params, config_file_path, args)
# Convert any variables that should be filepaths into the absolute
# filepaths of the values.
filepath_vars = [
'bedfile',
'normal_filepath',
'tumor_filepath',
'output_filepath',
'genome',
]
for key in filepath_vars:
if key in config and config[key] is not None:
config[key] = os.path.abspath(config[key])
if 'bedfile' not in config:
print('Error: BED file not provided!')
exit(1)
else:
bedfile = os.path.abspath(config['bedfile'])
if not os.path.isfile(bedfile):
print('Error: {0} does not exist!'.format(bedfile))
exit(1)
else:
config['bedfile'] = bedfile
if not check_bedfile_format(config['bedfile']):
exit(1)
if 'normal_filepath' not in config:
print('Error: Normal BAM file not provided!')
exit(1)
else:
normal_filepath = os.path.abspath(config['normal_filepath'])
if not os.path.isfile(normal_filepath):
print('Error: {0} does not exist!'.format(normal_filepath))
exit(1)
else:
# Make sure a corresponding .BAI index file exists
if not os.path.isfile('{0}.bai'.format(normal_filepath)) and not os.path.isfile('{0}.bai'.format(normal_filepath[:-4])):
print('Error: {0} needs corresponding .bai index file!'.format(normal_filepath))
exit(1)
config['normal_filepath'] = normal_filepath
if 'tumor_filepath' not in config:
print('Error: Tumor BAM file not provided!')
exit(1)
else:
tumor_filepath = os.path.abspath(config['tumor_filepath'])
if not os.path.isfile(tumor_filepath):
print('Error: {0} does not exist!'.format(tumor_filepath))
exit(1)
else:
# Make sure a corresponding .BAI index file exists
if not os.path.isfile('{0}.bai'.format(tumor_filepath)) and not os.path.isfile('{0}.bai'.format(tumor_filepath[:-4])):
print('Error: {0} needs corresponding .bai index file!'.format(tumor_filepath))
exit(1)
config['tumor_filepath'] = tumor_filepath
if 'genome' not in config:
print('Error: Genome not provided!')
exit(1)
else:
genome_filepath = os.path.abspath(config['genome'])
if not os.path.isfile(genome_filepath):
print('Error: {0} does not exist!'.format(genome_filepath))
exit(1)
if args.output is None:
print('Error: Output filename not specified!')
exit(1)
else:
config['output_filepath'] = os.path.abspath(args.output)
# Create output directory if it doesn't exist.
output_dir = os.path.dirname(config['output_filepath'])
if not os.path.isdir(output_dir):
os.makedirs(output_dir)
output_filepath = os.path.abspath(args.output)
"""
Nothing below this line should need to be edited, as the code below
handles the execution of the MANTIS program.
"""
mantis_folder = os.path.dirname(os.path.realpath(__file__))
kmer_repeat_counter = os.path.join(mantis_folder, 'kmer_repeat_counter.py')
kmer_count_filter = os.path.join(mantis_folder, 'kmer_count_filter.py')
instability_calculator = os.path.join(mantis_folder, 'calculate_instability.py')
# First, use the k-mer repeat counter to count how many repeats of each
# repeat unit (k-mer) each locus has, both in the normal and tumor file.
# Results will be saved into a file for passing to the next phase.
if '.' in os.path.basename(output_filepath):
kmer_count_output = output_filepath.split('.')
kmer_count_output[-1] = '{0}.{1}'.format('kmer_counts', kmer_count_output[-1])
kmer_count_output = '.'.join(kmer_count_output)
else:
kmer_count_output = output_filepath + '.kmer_counts'
# First job - generate file with repeat counts for each n-multiple of kmers
kmer_count_filepath = output_filepath + '.kmer_counts'
command = [
'{0} {1} '.format(sys.executable, kmer_repeat_counter),
'-b {0} '.format(os.path.abspath(config['bedfile'])),
'-n {0} '.format(os.path.abspath(config['normal_filepath'])),
'-t {0} '.format(os.path.abspath(config['tumor_filepath'])),
'-o {0} '.format(os.path.abspath(kmer_count_output)),
'--min-read-quality {0} '.format(config['mrq']),
'--min-locus-quality {0} '.format(config['mlq']),
'--min-read-length {0} '.format(config['mrl']),
'--genome {0} '.format(config['genome']),
'--threads {0} '.format(config['threads']),
]
print('\\\n'.join(command))
print('Getting repeat counts for repeat units (k-mers) ...')
sp = subprocess.Popen([' '.join(command)], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
response = sp.communicate()[0]
print(response)
if sp.returncode != 0:
print('Error with k-mer repeat count calculations; terminating program.')
exit(1)
print('done.')
# Run the outlier filter to get rid of outliers that are likely caused
# due to sequencing artifacts or one-off biological events.
filtered_kmer_counts = kmer_count_output.replace('kmer_counts', 'kmer_counts_filtered')
command = [
'{0} {1} '.format(sys.executable, kmer_count_filter),
'-i {0} '.format(os.path.abspath(kmer_count_output)),
'-o {0} '.format(os.path.abspath(filtered_kmer_counts)),
'-mlc {0} '.format(config['mlc']),
'-mrr {0} '.format(config['mrr']),
'-sd {0}'.format(config['sd']),
]
print('\\\n'.join(command))
print('Filtering out outliers ... ')
sp = subprocess.Popen([' '.join(command)], stdout=subprocess.PIPE, shell=True)
response = sp.communicate()[0]
if sp.returncode != 0:
print('Error with k-mer repeat count filter; terminating program.')
exit(1)
print(response)
print('done.')
# Calculate instability scores.
# Analyze locus instability
command = [
'{0} {1} '.format(sys.executable, instability_calculator),
'-i {0} '.format(filtered_kmer_counts),
'-o {0} '.format(output_filepath),
'--difference-threshold {0}'.format(config['dif_threshold']),
'--distance-threshold {0}'.format(config['euc_threshold']),
'--dissimilarity-threshold {0}'.format(config['cos_threshold']),
]
print('\\\n'.join(command))
print('Calculating instability scores ... ')
sp = subprocess.Popen([' '.join(command)], stdout=subprocess.PIPE, shell=True)
response = sp.communicate()[0]
if sp.returncode != 0:
print('Error with instability score calculations; terminating program.')
exit(1)
print(response)
print('done.')
print('\nMANTIS complete.')