forked from alexfields/ORF-RATER
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrate_regression_output.py
executable file
·216 lines (185 loc) · 12.6 KB
/
rate_regression_output.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#! /usr/bin/env python
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from multiisotonic.multiisotonic import MultiIsotonicRegressor
import sys
from time import strftime
parser = argparse.ArgumentParser(description='Combine one or more output files from regress_orfs.py into a final translation rating for each ORF. '
'Features will be loaded and calculated from the regression output, and scores will be calculated using '
'a random forest, followed by a monotonization procedure to remove some overfitting artifacts.')
parser.add_argument('regressfile', nargs='+',
help='Subdirectory/subdirectories or filename(s) containing regression output from regress_orfs.py, for use in forming a final '
'rating. If directory(ies) are provided, they should contain a file named regression.h5. Datasets treated with translation '
'inititation inhibitors (e.g. HARR, LTM) for which the --startonly toggle was set in regress_orfs.py will only be used for '
'initiation codon results; other datasets will be used for both initiation and termination codons.')
parser.add_argument('--orfstore', default='orf.h5',
help='Path to pandas HDF store containing ORFs and ORF types; generated by find_orfs_and_types.py (Default: orf.h5)')
parser.add_argument('--names', nargs='+', help='Names to use for datasets included in REGRESSFILEs. Should meaningfully indicate the important '
'features of each. (Default: inferred from REGRESSFILEs)')
parser.add_argument('--numtrees', type=int, default=2048, help='Number of trees to use in the random forest (Default: 2048)')
parser.add_argument('--minperleaf', type=int, nargs='+', default=[32],
help='Minimum samples per leaf to use in the random forest. If multiple values are provided, one will be selected based on cross '
'validation. If only one value is provided, will search for optimum by multiplying or dividing by powers of 2 (Default: 32)')
parser.add_argument('--minforestscore', type=float, default=0.3, help='Minimum forest score to require for monotonization (Default: 0.3)')
parser.add_argument('--cvfold', type=int, default=6, help='Number of folds for random forest cross-validation (Default: 6)')
parser.add_argument('--goldallcodons', action='store_true',
help='Random forest training set is normally restricted to ATG-initiated ORFs. If this flag is toggled, training will be '
'performed on all ORFs, which may unfairly penalize non-ATG-initiated ORFs.')
parser.add_argument('--goldminlen', type=int, default=100, help='Minimum length (in codons) for ORFs included in the training set (Default: 100)')
parser.add_argument('--ratingsfile', default='orfratings.h5',
help='Filename to which to output the final rating for each ORF. Formatted as pandas HDF (table name is "orfratings"). Columns '
'include basic information, raw score from random forest, and final monotonized orf rating. For ORFs appearing on multiple '
'transcripts, only one transcript will be selected for the table. (Default: orfratings.h5)')
parser.add_argument('--CSV', help='If included, also write output in CSV format to the provided filename.')
parser.add_argument('-v', '--verbose', action='store_true', help='Output a log of progress and timing (to stdout)')
parser.add_argument('-p', '--numproc', type=int, default=1, help='Number of processes to run. Defaults to 1 but more recommended if available.')
parser.add_argument('-f', '--force', action='store_true', help='Force file overwrite')
opts = parser.parse_args()
if not opts.force:
if os.path.exists(opts.ratingsfile):
raise IOError('%s exists; use --force to overwrite' % opts.ratingsfile)
if opts.CSV and os.path.exists(opts.CSV):
raise IOError('%s exists; use --force to overwrite' % opts.CSV)
regressfiles = []
colnames = []
for regressfile in opts.regressfile:
if os.path.isfile(regressfile):
regressfiles.append(regressfile)
if not opts.names:
colnames.append(os.path.basename(regressfile).rpartition(os.path.extsep)[0]) # '/path/to/myfile.h5' -> 'myfile'
elif os.path.isdir(regressfile) and os.path.isfile(os.path.join(regressfile, 'regression.h5')):
regressfiles.append(os.path.join(regressfile, 'regression.h5'))
if not opts.names:
colnames.append(os.path.basename(regressfile.strip(os.path.sep))) # '/path/to/mydir/' -> 'mydir'
else:
raise IOError('Regression file/directory %s not found' % regressfile)
if opts.names:
if len(opts.regressfile) != len(opts.names):
raise ValueError('Precisely one name must be provided for each REGRESSFILE')
colnames = opts.names
if opts.verbose:
sys.stdout.write(' '.join(sys.argv) + '\n')
def logprint(nextstr):
sys.stdout.write('[%s] %s\n' % (strftime('%Y-%m-%d %H:%M:%S'), nextstr))
sys.stdout.flush()
logprint('Loading regression output')
orf_columns = ['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'chrom', 'gcoord', 'gstop', 'strand', 'codon', 'AAlen',
'orftype', 'annot_start', 'annot_stop']
allstarts = pd.DataFrame(columns=['tfam', 'chrom', 'gcoord', 'strand'])
allorfs = pd.DataFrame()
allstops = pd.DataFrame(columns=['tfam', 'chrom', 'gstop', 'strand'])
feature_columns = []
stopcols = []
for (regressfile, colname) in zip(regressfiles, colnames):
with pd.HDFStore(regressfile, mode='r') as instore:
if 'stop_strengths' in instore:
stopcols.append(colname)
currstarts = instore.select('start_strengths', columns=['tfam', 'chrom', 'gcoord', 'strand', 'start_strength', 'W_start']) \
.rename(columns={'start_strength': 'str_start_'+colname, 'W_start': 'W_start_'+colname})
currstarts['chrom'] = currstarts['chrom'].astype(str) # strange bug when merging on categorical columns, so revert to str temporarily
currstarts['strand'] = currstarts['strand'].astype(str)
allstarts = allstarts.merge(currstarts, how='outer').fillna(0.)
allorfs = allorfs.append(instore.select('orf_strengths', columns=orf_columns), ignore_index=True).drop_duplicates('orfname')
# This line not actually used for regression output beyond just which ORFs actually got a positive score in at least one regression
# Safer to use concatenation and drop_duplicates rather than outer merges, in case one ORF somehow was assigned to different transcripts
currstops = instore.select('stop_strengths', columns=['tfam', 'chrom', 'gstop', 'strand', 'stop_strength', 'W_stop']) \
.rename(columns={'stop_strength': 'str_stop_'+colname, 'W_stop': 'W_stop_'+colname})
currstops['chrom'] = currstops['chrom'].astype(str) # strange bug when merging on categorical columns, so revert to str temporarily
currstops['strand'] = currstops['strand'].astype(str)
allstops = allstops.merge(currstops, how='outer').fillna(0.)
feature_columns.extend(['W_start_'+colname, 'W_stop_'+colname, 'str_stop_'+colname])
else:
currstarts = instore.select('start_strengths', columns=['tfam', 'chrom', 'gcoord', 'strand', 'W_start']) \
.rename(columns={'W_start': 'W_start_'+colname})
currstarts['chrom'] = currstarts['chrom'].astype(str) # strange bug when merging on categorical columns, so revert to str temporarily
currstarts['strand'] = currstarts['strand'].astype(str)
allstarts = allstarts.merge(currstarts, how='outer').fillna(0.)
feature_columns.append('W_start_'+colname)
orfratings = allorfs[allorfs['gcoord'] != allorfs['gstop']].merge(allstarts, how='left').merge(allstops, how='left')
orfratings.fillna({col: 0. for col in orfratings.columns if col not in allorfs.columns}, inplace=True)
stopgrps = orfratings.groupby(['chrom', 'gstop', 'strand'])
for stopcol in stopcols:
orfratings['stopset_rel_str_start_'+stopcol] = stopgrps['str_start_'+stopcol].transform(lambda x: x/x.max()).fillna(0.)
feature_columns.append('stopset_rel_str_start_'+stopcol)
if opts.verbose:
logprint('Training random forest on features:\n\t'+'\n\t'.join(feature_columns))
if opts.goldallcodons:
gold_set = (orfratings['AAlen'] >= opts.goldminlen)
else:
gold_set = ((orfratings['codon'] == 'ATG') & (orfratings['AAlen'] >= opts.goldminlen))
gold_df = orfratings[gold_set].drop_duplicates(['chrom', 'gcoord', 'gstop', 'strand'])
# ORFs with same start and stop are guaranteed to have identical feature values and therefore bias cross-validation - so only keep one of them
gold_class = gold_df[['annot_start', 'annot_stop']].all(1).values.astype(np.int8)*2 - 1 # convert True/False to +1/-1
gold_feat = gold_df[feature_columns].values
if opts.verbose:
logprint('Gold set contains %d annotated ORFs and %d unannotated ORFs' % ((gold_class > 0).sum(), (gold_class < 0).sum()))
mycv = StratifiedKFold(opts.cvfold, shuffle=True, random_state=42) # define random_state so same CV splits are used throughout parameter search
if len(opts.minperleaf) > 1:
currgrid = GridSearchCV(RandomForestClassifier(n_estimators=opts.numtrees), param_grid={'min_samples_leaf': opts.minperleaf},
scoring='accuracy', cv=mycv, n_jobs=opts.numproc)
currgrid.fit(gold_feat, gold_class)
if opts.verbose:
logprint('Best estimator has estimated %f accuracy with %d minimum samples per leaf' %
(currgrid.best_score_, currgrid.best_params_['min_samples_leaf']))
if currgrid.best_params_['min_samples_leaf'] == min(opts.minperleaf) and min(opts.minperleaf) > 1:
sys.stderr.write('WARNING: Optimal minimum samples per leaf is minimum tested; recommended to test lower values\n')
if currgrid.best_params_['min_samples_leaf'] == max(opts.minperleaf):
sys.stderr.write('WARNING: Optimal minimum samples per leaf is maximum tested; recommended to test greater values\n')
best_est = currgrid.best_estimator_
else:
def _get_score(val):
return cross_val_score(RandomForestClassifier(n_estimators=opts.numtrees, min_samples_leaf=val),
gold_feat, gold_class, scoring='accuracy', cv=mycv, n_jobs=opts.numproc).mean()
prevval = opts.minperleaf[0]
prevres = _get_score(prevval)
currval = prevval*2
currres = _get_score(currval)
if currres <= prevres: # getting better as val decreases
(prevval, currval) = (currval, prevval)
(prevres, currres) = (currres, prevres)
while currres >= prevres:
if currval == 1:
best_score = currres
best_param = currval
break
prevval = currval
prevres = currres
currval //= 2
currres = _get_score(currval)
else:
best_score = prevres
best_param = prevval
else: # getting better as val increases
while currres >= prevres:
prevval = currval
prevres = currres
currval *= 2
currres = _get_score(currval)
best_score = prevres
best_param = prevval
if opts.verbose:
logprint('Best estimator has estimated %f accuracy with %d minimum samples per leaf' % (best_score, best_param))
best_est = RandomForestClassifier(n_estimators=opts.numtrees, min_samples_leaf=best_param, n_jobs=opts.numproc)
best_est.fit(gold_feat, gold_class)
orfratings['forest_score'] = best_est.predict_proba(orfratings[feature_columns].values)[:, 1]
to_monotonize = orfratings['forest_score'] > opts.minforestscore
if opts.verbose:
logprint('Monotonizing %d ORFs' % to_monotonize.sum())
forest_monoreg = MultiIsotonicRegressor()
forest_monoreg.fit(orfratings.loc[to_monotonize, feature_columns].values,
orfratings.loc[to_monotonize, 'forest_score'].values)
orfratings['orfrating'] = np.nan
orfratings.loc[to_monotonize, 'orfrating'] = forest_monoreg.predict(orfratings.loc[to_monotonize, feature_columns].values)
if opts.verbose:
logprint('Saving results')
for catfield in ['chrom', 'strand', 'codon', 'orftype']:
orfratings[catfield] = orfratings[catfield].astype('category') # saves disk space and read/write time
orfratings.to_hdf(opts.ratingsfile, 'orfratings', format='t', data_columns=True)
if opts.CSV:
orfratings.to_csv(opts.CSV, index=False)
if opts.verbose:
logprint('Tasks complete')