-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess_anavar_results.py
executable file
·229 lines (173 loc) · 6.87 KB
/
process_anavar_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env python
from __future__ import print_function
import anavar_utils as an
import sys
import argparse
import re
def aic(n_param, max_lnl):
"""
calculates AIC as desicribed here: https://en.wikipedia.org/wiki/Akaike_information_criterion
:param n_param: int
:param max_lnl: float
:return:
"""
k = n_param
lnl = max_lnl
return (2 * k) - (2 * lnl)
def delta_aic(current_d, best_d):
"""
returns delta_AIC, which is defined as the difference between the AIC for the best fitting model
and the model on the row of the spreadsheet - from Kai's email 14th Sept 17
:param current_d: float
:param best_d: float
:return:
"""
return best_d - current_d
def reformat_mle(line, n_classes, var_type, converged, b_hit, model, p, dfe, n_search, alpha_ins, alpha_del,
custom_col=None, custom_val=None):
"""
converts mle estimate line from anavar results file to multiple lines in long form
:param line: dict
:param n_classes: int
:param var_type: str
:param converged: bool
:param b_hit: list
:param model: str
:param p: int
:param dfe: str
:param n_search: int
:param alpha_ins: float
:param alpha_del: float
:param custom_col: str
:param custom_val: str
:return: list
"""
# determine variant string
if var_type == 'indel':
variants = ['ins_', 'del_']
else:
variants = ['']
alphas = {'ins': alpha_ins, 'del': alpha_del}
# prepare header
out_lines = []
new_header = ['run', 'imp', 'exit_code',
'theta', 'scale', 'shape', 'gamma', 'e',
'var_type', 'site_class', 'sel_type', 'lnL',
'boundaries_hit', 'searches', 'converged', 'model', 'params', 'alpha']
non_variable_vals = {'run': line['run'], 'imp': line['imp'], 'exit_code': line['exit_code'],
'lnL': line['lnL'], 'boundaries_hit': '|'.join(b_hit), 'converged': converged,
'model': model, 'params': p, 'AIC': aic(p, line['lnL']), 'searches': n_search}
if custom_col is not None:
new_header.append(custom_col)
non_variable_vals[custom_col] = custom_val
new_header.append('AIC')
# process neutral variants then selected variants
for sel in ['neu_', 'sel_']:
# only ever one class of neutral variants
if sel == 'neu_':
n_class = 1
# if sel then number of classes given
else:
n_class = n_classes
for site_class in range(1, n_class+1):
# ie for ins and del or just snp
for variant in variants:
if variant == '':
var_str = 'snp'
else:
var_str = variant.rstrip('_')
# get value for each column in output table
row = []
for col in new_header:
# these values same for each row
if col in non_variable_vals.keys():
col_val = non_variable_vals[col]
elif col == 'var_type':
col_val = var_str
elif col == 'site_class':
col_val = site_class
elif col == 'sel_type':
col_val = sel.rstrip('_')
elif col == 'alpha':
col_val = alphas[var_str]
# leaving 'theta', 'scale', 'shape', 'gamma', 'e'
else:
if dfe == 'continuous' and sel == 'sel_':
class_str = ''
else:
class_str = '_' + str(site_class)
key_str = '{}{}{}{}'.format(sel, variant, col, class_str)
try:
col_val = line[key_str]
except KeyError:
col_val = 'na'
row.append(col_val)
out_lines.append(row)
return new_header, out_lines
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-file_pattern', help='takes a regular expression in order to extract a custom ID from a file'
'name, along with a column name eg) degree,_degree(\d+)\.')
parser.add_argument('-neu_ref', help='sets neutral reference type to determine ds',
default='ar', choices=['ar', 'nc'])
args = parser.parse_args()
counter = 0
if args.file_pattern is not None:
spec_col = args.file_pattern.split(',')[0]
spec_pattern = args.file_pattern.split(',')[1]
else:
spec_col, spec_pattern = None, None
all_res = []
for res in sys.stdin:
# sort custom col contents
if spec_col is not None:
spec_val = re.search(spec_pattern, res).group(1)
else:
spec_val = None
res = res.rstrip()
if 'equal_t' in res:
constraint = '_equal_t'
else:
constraint = ''
results = an.ResultsFile(open(res))
mle = results.ml_estimate()
n_class = results.num_class()
variant = results.data_type()
dfe = results.dfe()
free_params = len(results.free_parameters())
if constraint == '_equal_t':
if args.neu_ref == 'ar':
i_ds = 0.00183918200407
d_ds = 0.00383421025726
else:
i_ds = 0.00171438749928
d_ds = 0.00308343831154
ins_alpha = results.get_alpha(dn=0.000121257528807, ds=i_ds, var_type='ins')
del_alpha = results.get_alpha(dn=0.000200465669246, ds=d_ds, var_type='del')
else:
ins_alpha = 'NA'
del_alpha = 'NA'
mod_name = '{}_{}_{}class{}'.format(variant, dfe, n_class, constraint)
reformed = reformat_mle(mle, n_class, variant, results.converged(),
results.bounds_hit(gamma_r=(-5e4, 1e5), theta_r=(1e-12, 0.1), r_r=(0.01, 100),
scale_r=(0.1, 5000.0)),
mod_name, free_params, dfe, results.num_runs(),
alpha_ins=ins_alpha, alpha_del=del_alpha,
custom_col=spec_col, custom_val=spec_val)
if counter == 0:
all_res.append(reformed[0])
counter += 1
for x in reformed[1]:
all_res.append(x)
# calc AIC
aics = sorted([z[-1] for z in all_res[1:]])
best_aic = aics[0]
print(*all_res[0] + ['delta_AIC'], sep=',')
out_data = []
for line in all_res[1:]:
delta = delta_aic(line[-1], best_aic)
out_data.append((delta, line + [delta]))
for line in sorted(out_data, reverse=True):
print(*line[1], sep=',')
if __name__ == '__main__':
main()