#! /usr/bin/env python
import argparse
import os
import pysam
import pandas as pd
import numpy as np
from scipy.optimize import nnls
import scipy.sparse
import multiprocessing as mp
from hashed_read_genome_array import HashedReadBAMGenomeArray, ReadKeyMapFactory, read_length_nmis, get_hashed_counts
from plastid.genomics.roitools import SegmentChain, positionlist_to_segments
import sys
from time import strftime
parser = argparse.ArgumentParser(description='Use linear regression to identify likely sites of translation. Regression will be performed for ORFs '
'defined by find_orfs_and_types.py using a metagene profile constructed from annotated CDSs. If '
'multiple ribosome profiling datasets are to be analyzed separately (e.g. if they were collected under '
'different drug treatments), then this program should be run separately for each, ideally in separate '
'subfolders indicated by SUBDIR.')
parser.add_argument('bamfiles', nargs='+', help='Path to transcriptome-aligned BAM file(s) for read data')
parser.add_argument('--subdir', default=os.path.curdir,
help='Convenience argument when dealing with multiple datasets. In such a case, set SUBDIR to an appropriate name (e.g. HARR, '
'CHX) to avoid file conflicts. (Default: current directory)')
parser.add_argument('--restrictbystarts', nargs='+',
help='Subdirectory/subdirectories or filename(s) containing regression output to use to restrict ORFs for regression. If a '
'directory or list of directories, file(s) of name REGRESSFILE (regression.h5 by default) will be searched for within them. '
'For use to restrict regression on e.g. CHX or no-drug data based only on positive hits from e.g. HARR or LTM data. '
'Value(s) of MINWSTART indicate the minimum W statistic to require. If multiple directories/files are provided, start '
'sites will be taken from their union.')
parser.add_argument('--minwstart', type=float, nargs='+', default=[0],
help='Minimum W_start statistic to require for regression output in RESTRICTBYSTARTS. If only one value is given, it will be '
'assumed to apply to all; if multiple values are given, the number of values must match the number of values provided for '
'RESTRICTBYSTARTS. Ignored if RESTRICTBYSTARTS not included. (Default: 0)')
parser.add_argument('--orfstore', default='orf.h5',
help='Path to pandas HDF store containing ORFs to regress; generated by find_orfs_and_types.py (Default: orf.h5)')
parser.add_argument('--inbed', default='transcripts.bed', help='Transcriptome BED-file (Default: transcripts.bed)')
parser.add_argument('--offsetfile', default='offsets.txt',
help='Path to 2-column tab-delimited file with 5\' offsets for variable P-site mappings. First column indicates read length, '
'second column indicates offset to apply. Read lengths are calculated after trimming up to MAX5MIS 5\' mismatches. Accepted '
'read lengths are defined by those present in the first column of this file. If SUBDIR is set, this file is assumed to be '
'in that directory. (Default: offsets.txt)')
parser.add_argument('--max5mis', type=int, default=1, help='Maximum 5\' mismatches to trim. Reads with more than this number will be excluded. '
'(Default: 1)')
parser.add_argument('--regressfile', default='regression.h5',
help='Filename to which to output the table of regression scores for each ORF. Formatted as pandas HDF (tables generated include '
'"start_strengths", "orf_strengths", and "stop_strengths"). If SUBDIR is set, this file will be placed in that directory. '
'(Default: regression.h5)')
parser.add_argument('--startonly', action='store_true', help='Toggle for datasets collected in the presence of initiation inhibitor (e.g. HARR, '
'LTM). If selected, "stop_strengths" will not be calculated or saved.')
parser.add_argument('--startrange', type=int, nargs=2, default=[1, 50],
help='Region around start codon (in codons) to model explicitly. Ignored if reading metagene from file (Default: 1 50, meaning '
'one full codon before the start is modeled, as are the start codon and the 49 codons following it).')
parser.add_argument('--stoprange', type=int, nargs=2, default=[7, 0],
help='Region around stop codon (in codons) to model explicitly. Ignored if reading metagene from file (Default: 7 0, meaning '
'seven full codons before and including the stop are modeled, but none after).')
parser.add_argument('--mincdsreads', type=int, default=64,
help='Minimum number of reads required within the body of the CDS (and any surrounding nucleotides indicated by STARTRANGE or '
'STOPRANGE) for it to be included in the metagene. Ignored if reading metagene from file (Default: 64).')
parser.add_argument('--startcount', type=int, default=0,
help='Minimum reads at putative translation initiation codon. Useful to reduce computational burden by only considering ORFs '
'with e.g. at least 1 read at the start. (Default: 0)')
parser.add_argument('--metagenefile', default='metagene.txt',
help='File to save metagene profile, OR if the file already exists, it will be used as the input metagene. Formatted as '
'tab-delimited text, with position, readlength, value, and type ("START", "CDS", or "STOP"). If SUBDIR is set, this file '
'will be placed in that directory. (Default: metagene.txt)')
parser.add_argument('--noregress', action='store_true', help='Only generate a metagene (i.e. do not perform any regressions)')
parser.add_argument('--exclude', nargs='+', help='Names of transcript families (tfams) to exclude from analysis due to excessive computational time '
'or memory footprint (e.g. TTN can be so large that the regression never finishes).')
parser.add_argument('-v', '--verbose', action='count', help='Output a log of progress and timing (to stdout). Repeat for higher verbosity level.')
parser.add_argument('-p', '--numproc', type=int, default=1, help='Number of processes to run. Defaults to 1 but more recommended if available.')
parser.add_argument('-f', '--force', action='store_true',
help='Force file overwrite. This will overwrite both METAGENEFILE and REGRESSFILE, if they exist. To overwrite only REGRESSFILE '
'(and not the METAGENEFILE), do not invoke this option but simply delete REGRESSFILE.')
opts = parser.parse_args()
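
# Example invocation (a sketch only; the BAM filenames, subdirectory names, and process count are hypothetical
# and should be replaced with your own):
#   python regress_orfs.py chx_rep1.bam chx_rep2.bam --subdir CHX -p 8 -v
# For a dataset collected with an initiation inhibitor (e.g. HARR or LTM), --startonly would typically be added:
#   python regress_orfs.py harr.bam --subdir HARR --startonly -p 8 -v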
offsetfilename = os.path.join(opts.subdir, opts.offsetfile)
metafilename = os.path.join(opts.subdir, opts.metagenefile)
regressfilename = os.path.join(opts.subdir, opts.regressfile)
if not opts.force:
if os.path.exists(regressfilename):
if os.path.exists(metafilename):
raise IOError('%s exists; use --force to overwrite (will also recalculate metagene and overwrite %s)' % (regressfilename, metafilename))
raise IOError('%s exists; use --force to overwrite' % regressfilename)
restrictbystartfilenames = []
if opts.restrictbystarts:
if len(opts.restrictbystarts) > 1 and len(opts.minwstart) == 1:
opts.minwstart *= len(opts.restrictbystarts) # expand the list to the same number of arguments
if len(opts.minwstart) != len(opts.restrictbystarts):
raise ValueError('--minwstart must be given same number of values as --restrictbystarts, or one value for all')
for restrictbystart in opts.restrictbystarts:
if os.path.isfile(restrictbystart):
restrictbystartfilenames.append(restrictbystart)
elif os.path.isdir(restrictbystart) and os.path.isfile(os.path.join(restrictbystart, opts.regressfile)):
restrictbystartfilenames.append(os.path.join(restrictbystart, opts.regressfile))
else:
raise IOError('Regression file/directory %s not found' % restrictbystart)
if opts.verbose:
sys.stdout.write(' '.join(sys.argv) + '\n')
def logprint(nextstr):
sys.stdout.write('[%s] %s\n' % (strftime('%Y-%m-%d %H:%M:%S'), nextstr))
sys.stdout.flush()
log_lock = mp.Lock()
rdlens = []
Pdict = {}
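
# The offsets file is expected to be a two-column, tab-delimited table: read length, then the 5' P-site offset
# to apply to reads of that length (see --offsetfile above). For illustration only (these numbers are
# hypothetical, not recommended values), a file might look like:
#   28    12
#   29    12
#   30    13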
with open(offsetfilename, 'rU') as infile:
for line in infile:
ls = line.strip().split()
rdlen = int(ls[0])
for nmis in range(opts.max5mis+1):
Pdict[(rdlen, nmis)] = int(ls[1])+nmis # e.g. if nmis == 1, offset as though the read were missing that base entirely
rdlens.append(rdlen)
# Pdict = {(int(ls[0]), nmis): int(ls[1])+nmis for ls in [line.strip().split() for line in infile] for nmis in range(opts.max5mis+1)}
# Pdict = {(ls[0], nmis): ls[1] for ls in [line.strip().split() for line in infile] if opts.maxrdlen >= ls[0] >= opts.minrdlen
# for nmis in range(opts.max5mis+1)}
rdlens.sort()
# hash transcripts by ID for easy reference later
with open(opts.inbed, 'rU') as inbed:
bedlinedict = {line.split()[3]: line for line in inbed}

def _get_annotated_counts_by_chrom(chrom_to_do):
"""Accumulate counts from annotated CDSs into a metagene profile. Only the longest CDS in each transcript family will be included, and only if it
meets the minimum number-of-reads requirement. Reads are normalized by gene, so every gene included contributes equally to the final metagene."""
found_cds = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r',
where="chrom == '%s' and orftype == 'annotated' and tstop > 0 and tcoord > %d and AAlen > %d"
% (chrom_to_do, -startnt[0], min_AAlen),
columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen']) \
.sort_values('AAlen', ascending=False).drop_duplicates('tfam') # use the longest annotated CDS in each transcript family
num_cds_incl = 0 # number of CDSs included from this chromosome
startprof = np.zeros((len(rdlens), startlen))
cdsprof = np.zeros((len(rdlens), 3))
stopprof = np.zeros((len(rdlens), stoplen))
inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))
for (tid, tcoord, tstop) in found_cds[['tid', 'tcoord', 'tstop']].itertuples(False):
curr_trans = SegmentChain.from_bed(bedlinedict[tid])
tlen = curr_trans.get_length()
if tlen >= tstop + stopnt[1]: # need to guarantee that the 3' UTR is sufficiently long
curr_hashed_counts = get_hashed_counts(curr_trans, gnd)
cdslen = tstop+stopnt[1]-tcoord-startnt[0] # cds length, plus the extra bases...
curr_counts = np.zeros((len(rdlens), cdslen))
for (i, rdlen) in enumerate(rdlens):
for nmis in range(opts.max5mis+1):
curr_counts[i, :] += curr_hashed_counts[(rdlen, nmis)][tcoord+startnt[0]:tstop+stopnt[1]]
# curr_counts is limited to the CDS plus any extra requested nucleotides on either side
if curr_counts.sum() >= opts.mincdsreads:
curr_counts /= curr_counts.mean() # normalize by mean of counts across all readlengths and positions within the CDS
startprof += curr_counts[:, :startlen]
cdsprof += curr_counts[:, startlen:cdslen-stoplen].reshape((len(rdlens), -1, 3)).mean(1)
stopprof += curr_counts[:, cdslen-stoplen:cdslen]
num_cds_incl += 1
for inbam in inbams:
inbam.close()
return startprof, cdsprof, stopprof, num_cds_incl

def _orf_profile(orflen):
"""Generate a profile for an ORF based on the metagene profile
Parameters
----------
orflen : int
Number of nucleotides in the ORF, including the start and stop codons
Returns
-------
np.ndarray<float>
The expected profile for the ORF. Number of rows will match the number of rows in the metagene profile. Number of columns will be
orflen + stopnt[1] - startnt[0]
"""
assert orflen % 3 == 0
assert orflen > 0
short_stop = 9
if orflen >= startnt[1]-stopnt[0]: # long enough to include everything
return np.hstack((startprof, np.tile(cdsprof, (orflen-startnt[1]+stopnt[0])/3), stopprof))
elif orflen >= startnt[1]+short_stop:
return np.hstack((startprof, stopprof[:, startnt[1]-orflen-stopnt[1]:]))
elif orflen >= short_stop:
return np.hstack((startprof[:, :orflen-short_stop-startnt[0]], stopprof[:, -short_stop-stopnt[1]:]))
else: # very short!
return np.hstack((startprof[:, :3-startnt[0]], stopprof[:, 3-orflen-stopnt[0]:]))
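
# Worked example for _orf_profile (a sketch, assuming the default --startrange 1 50 and --stoprange 7 0, which
# give startnt = (-3, 150) and stopnt = (-21, 0) once converted to nucleotides below):
#   for orflen = 300, the ORF is long enough to include everything (300 >= startnt[1]-stopnt[0] = 171), so the
#   profile is startprof (153 columns) + cdsprof tiled (300-150+(-21))/3 = 43 times (129 columns) + stopprof
#   (21 columns) = 303 columns total, matching orflen + stopnt[1] - startnt[0] = 300 + 0 + 3.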
if opts.startonly:
failure_return = (pd.DataFrame(), pd.DataFrame())
else:
failure_return = (pd.DataFrame(), pd.DataFrame(), pd.DataFrame())

def _regress_tfam(orf_set, gnd):
"""Performs non-negative least squares regression on all of the ORFs in a transcript family, using profiles constructed via _orf_profile()
Also calculates Wald statistics for each orf and start codon, and for each stop codon if opts.startonly is False"""
tfam = orf_set['tfam'].iat[0]
strand = orf_set['strand'].iat[0]
chrom = orf_set['chrom'].iat[0]
tids = orf_set['tid'].drop_duplicates().tolist()
all_tfam_genpos = set()
tid_genpos = {}
tlens = {}
for (i, tid) in enumerate(tids):
currtrans = SegmentChain.from_bed(bedlinedict[tid])
curr_pos_set = currtrans.get_position_set()
tlens[tid] = len(curr_pos_set)
tid_genpos[tid] = curr_pos_set
all_tfam_genpos.update(curr_pos_set)
tfam_segs = SegmentChain(*positionlist_to_segments(chrom, strand, list(all_tfam_genpos)))
all_tfam_genpos = np.array(sorted(all_tfam_genpos))
if strand == '-':
all_tfam_genpos = all_tfam_genpos[::-1]
nnt = len(all_tfam_genpos)
tid_indices = {tid: np.flatnonzero(np.in1d(all_tfam_genpos, list(curr_tid_genpos), assume_unique=True))
for (tid, curr_tid_genpos) in tid_genpos.iteritems()}
hashed_counts = get_hashed_counts(tfam_segs, gnd)
counts = np.zeros((len(rdlens), nnt), dtype=np.float64) # even though they are integer-valued, will need to do float arithmetic
for (i, rdlen) in enumerate(rdlens):
for nmis in range(1+opts.max5mis):
counts[i, :] += hashed_counts[(rdlen, nmis)]
counts = counts.ravel()
if opts.startcount:
        # Only include ORFs with at least STARTCOUNT reads within one nucleotide of the start codon
        offsetmat = np.tile(nnt*np.arange(len(rdlens)), 3)  # offsets for each read length, expecting three positions to check for each
# try:
orf_set = orf_set[[(counts[(start_idxes.repeat(len(rdlens))+offsetmat)].sum() >= opts.startcount) for start_idxes in
[tid_indices[tid][tcoord-1:tcoord+2] for (tid, tcoord, tstop) in orf_set[['tid', 'tcoord', 'tstop']].itertuples(False)]]]
if orf_set.empty:
return failure_return
orf_strength_df = orf_set.sort_values('tcoord', ascending=False).drop_duplicates('orfname').reset_index(drop=True)
abort_set = orf_set.drop_duplicates('gcoord').copy()
abort_set['gstop'] = abort_set['gcoord'] # should maybe be +/-3, but then need to worry about splicing - and this is an easy flag
abort_set['tstop'] = abort_set['tcoord']+3 # stop after the first codon
abort_set['orfname'] = abort_set['gcoord'].apply(lambda x: '%s_%d_abort' % (tfam, x))
orf_strength_df = pd.concat((orf_strength_df, abort_set), ignore_index=True)
if not opts.startonly: # if marking full ORFs, include histop model
stop_set = orf_set.drop_duplicates('gstop').copy()
stop_set['gcoord'] = stop_set['gstop'] # this is an easy flag
stop_set['tcoord'] = stop_set['tstop'] # should probably be -3 nt, but this is another easy flag that distinguishes from abinit
stop_set['orfname'] = stop_set['gstop'].apply(lambda x: '%s_%d_stop' % (tfam, x))
orf_strength_df = pd.concat((orf_strength_df, stop_set), ignore_index=True)
orf_profs = []
indices = []
for (tid, tcoord, tstop) in orf_strength_df[['tid', 'tcoord', 'tstop']].itertuples(False):
if tcoord != tstop: # not a histop
tlen = tlens[tid]
if tcoord+startnt[0] < 0:
startadj = -startnt[0]-tcoord # number of nts to remove from the start due to short 5' UTR; guaranteed > 0
else:
startadj = 0
if tstop+stopnt[1] > tlen:
stopadj = tstop+stopnt[1]-tlen # number of nts to remove from the end due to short 3' UTR; guaranteed > 0
else:
stopadj = 0
curr_indices = tid_indices[tid][tcoord+startnt[0]+startadj:tstop+stopnt[1]-stopadj]
orf_profs.append(_orf_profile(tstop-tcoord)[:, startadj:tstop-tcoord+stopnt[1]-startnt[0]-stopadj].ravel())
else: # histop
curr_indices = tid_indices[tid][tstop-6:tstop]
orf_profs.append(stopprof[:, -6:].ravel())
indices.append(np.concatenate([nnt*i+curr_indices for i in xrange(len(rdlens))]))
# need to tile the indices for each read length
if len(indices[-1]) != len(orf_profs[-1]):
raise AssertionError('ORF length does not match index length')
orf_matrix = scipy.sparse.csc_matrix((np.concatenate(orf_profs),
np.concatenate(indices),
np.cumsum([0]+[len(curr_indices) for curr_indices in indices])),
shape=(nnt*len(rdlens), len(orf_strength_df)))
# better to make it a sparse matrix, even though nnls requires a dense matrix, because of linear algebra to come
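    # Note on the construction above: csc_matrix((data, indices, indptr)) builds the design matrix one column
    # per candidate ORF, with data = that ORF's expected profile, indices = the (read length x position) rows
    # it occupies, and indptr = the cumulative column boundaries; rows correspond to the flattened counts vector.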
nonzero_orfs = np.flatnonzero(orf_matrix.T.dot(counts) > 0)
if len(nonzero_orfs) == 0: # no possibility of anything coming up
return failure_return
orf_matrix = orf_matrix[:, nonzero_orfs]
orf_strength_df = orf_strength_df.iloc[nonzero_orfs] # don't bother fitting ORFs with zero reads throughout their entire length
(orf_strs, resid) = nnls(orf_matrix.toarray(), counts)
min_str = 1e-6 # allow for machine rounding error
usable_orfs = orf_strs > min_str
if not usable_orfs.any():
return failure_return
orf_strength_df = orf_strength_df[usable_orfs]
orf_matrix = orf_matrix[:, usable_orfs] # remove entries for zero-strength ORFs or transcripts
orf_strs = orf_strs[usable_orfs]
orf_strength_df['orf_strength'] = orf_strs
covmat = resid*resid*np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())/(nnt*len(rdlens)-len(orf_strength_df))
# homoscedastic version (assume equal variance at all positions)
# resids = counts-orf_matrix.dot(orf_strs)
# simple_covmat = np.linalg.inv(orf_matrix.T.dot(orf_matrix).toarray())
# covmat = simple_covmat.dot(orf_matrix.T.dot(scipy.sparse.dia_matrix((resids*resids, 0), (len(resids), len(resids))))
# .dot(orf_matrix).dot(simple_covmat))
# # heteroscedastic version (Eicker-Huber-White robust estimator)
orf_strength_df['W_orf'] = orf_strength_df['orf_strength']*orf_strength_df['orf_strength']/np.diag(covmat)
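    # W_orf is a per-ORF Wald statistic: beta_hat^2 / Var(beta_hat), the squared z-score of each fitted ORF
    # strength under the covariance estimate above (sigma^2 * inv(X'X), with sigma^2 = RSS / (n - p);
    # scipy's nnls returns the residual norm, so resid*resid is the RSS).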
orf_strength_df.set_index('orfname', inplace=True)
elongating_orfs = ~(orf_strength_df['gstop'] == orf_strength_df['gcoord'])
if opts.startonly: # count abortive initiation events towards start strength in this case
include_starts = (orf_strength_df['tcoord'] != orf_strength_df['tstop'])
if not include_starts.any():
return failure_return # no need to keep going if there weren't any useful starts
gcoord_grps = orf_strength_df[include_starts].groupby('gcoord')
# even if we are willing to count abinit towards start strength, we certainly shouldn't count histop
covmat_starts = covmat[np.ix_(include_starts.values, include_starts.values)]
orf_strs_starts = orf_strs[include_starts.values]
else:
if not elongating_orfs.any():
return failure_return
gcoord_grps = orf_strength_df[elongating_orfs].groupby('gcoord')
covmat_starts = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
orf_strs_starts = orf_strs[elongating_orfs.values]
start_strength_df = pd.DataFrame.from_items([('tfam', tfam),
('chrom', orf_set['chrom'].iloc[0]),
('strand', orf_set['strand'].iloc[0]),
('codon', gcoord_grps['codon'].first()),
('start_strength', gcoord_grps['orf_strength'].aggregate(np.sum))])
start_strength_df['W_start'] = pd.Series({gcoord: orf_strs_starts[rownums].dot(np.linalg.inv(covmat_starts[np.ix_(rownums, rownums)]))
.dot(orf_strs_starts[rownums]) for (gcoord, rownums) in gcoord_grps.indices.iteritems()})
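    # W_start generalizes the Wald statistic to all ORFs sharing a start codon: for each gcoord group,
    # W_start = beta_g' * inv(Sigma_g) * beta_g, where beta_g are the fitted strengths in the group and
    # Sigma_g is the corresponding block of the covariance matrix.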
if not opts.startonly:
# count histop towards the stop codon - but still exclude abinit
include_stops = (elongating_orfs | (orf_strength_df['tcoord'] == orf_strength_df['tstop']))
gstop_grps = orf_strength_df[include_stops].groupby('gstop')
covmat_stops = covmat[np.ix_(include_stops.values, include_stops.values)]
orf_strs_stops = orf_strs[include_stops.values]
stop_strength_df = pd.DataFrame.from_items([('tfam', tfam),
('chrom', orf_set['chrom'].iloc[0]),
('strand', orf_set['strand'].iloc[0]),
('stop_strength', gstop_grps['orf_strength'].aggregate(np.sum))])
stop_strength_df['W_stop'] = pd.Series({gstop: orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums, rownums)]))
.dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})
# # nohistop
# gstop_grps = orf_strength_df[elongating_orfs].groupby('gstop')
# covmat_stops = covmat[np.ix_(elongating_orfs.values, elongating_orfs.values)]
# orf_strs_stops = orf_strs[elongating_orfs.values]
# stop_strength_df['stop_strength_nohistop'] = gstop_grps['orf_strength'].aggregate(np.sum)
# stop_strength_df['W_stop_nohistop'] = pd.Series({gstop:orf_strs_stops[rownums].dot(np.linalg.inv(covmat_stops[np.ix_(rownums,rownums)]))
# .dot(orf_strs_stops[rownums]) for (gstop, rownums) in gstop_grps.indices.iteritems()})
return orf_strength_df, start_strength_df, stop_strength_df
else:
return orf_strength_df, start_strength_df

def _regress_chrom(chrom_to_do):
"""Applies _regress_tfam() to all of the transcript families on a chromosome"""
chrom_orfs = pd.read_hdf(opts.orfstore, 'all_orfs', mode='r', where="chrom == %r and tstop > 0 and tcoord > 0" % chrom_to_do,
columns=['orfname', 'tfam', 'tid', 'tcoord', 'tstop', 'AAlen', 'chrom', 'gcoord', 'gstop', 'strand',
'codon', 'orftype', 'annot_start', 'annot_stop'])
# tcoord > 0 removes ORFs where the first codon is an NTG, to avoid an indexing error
# Those ORFs would never get called anyway since they couldn't possibly have any reads at their start codon
if opts.exclude:
chrom_orfs = chrom_orfs[~chrom_orfs['tfam'].isin(opts.exclude)]
if restrictbystartfilenames:
restrictedstarts = pd.DataFrame()
for (restrictbystart, minw) in zip(restrictbystartfilenames, opts.minwstart):
restrictedstarts = restrictedstarts.append(
pd.read_hdf(restrictbystart, 'start_strengths', mode='r', where="(chrom == %r) & (W_start > minw)" % chrom_to_do,
columns=['tfam', 'chrom', 'gcoord', 'strand']), ignore_index=True).drop_duplicates()
chrom_orfs = chrom_orfs.merge(restrictedstarts) # inner merge acts as a filter
if chrom_orfs.empty:
if opts.verbose > 1:
with log_lock:
logprint('No ORFs found on %s' % chrom_to_do)
return failure_return
inbams = [pysam.Samfile(infile, 'rb') for infile in opts.bamfiles]
gnd = HashedReadBAMGenomeArray(inbams, ReadKeyMapFactory(Pdict, read_length_nmis))
res = tuple([pd.concat(res_dfs) for res_dfs in zip(*[_regress_tfam(tfam_set, gnd) for (tfam, tfam_set) in chrom_orfs.groupby('tfam')])])
for inbam in inbams:
inbam.close()
if opts.verbose > 1:
with log_lock:
logprint('%s complete' % chrom_to_do)
return res

with pd.HDFStore(opts.orfstore, mode='r') as orfstore:
chroms = orfstore.select('all_orfs/meta/chrom/meta').values # because saved as categorical, this is the list of all chromosomes
if os.path.isfile(metafilename) and not opts.force:
if opts.verbose:
logprint('Loading metagene')
metagene = pd.read_csv(metafilename, sep='\t').set_index(['region', 'position'])
metagene.columns = metagene.columns.astype(int) # they are read lengths
assert (metagene.columns == rdlens).all()
startprof = metagene.loc['START']
cdsprof = metagene.loc['CDS']
stopprof = metagene.loc['STOP']
startnt = (startprof.index.min(), startprof.index.max()+1)
assert len(cdsprof) == 3
stopnt = (stopprof.index.min(), stopprof.index.max()+1)
startprof = startprof.values.T
cdsprof = cdsprof.values.T
stopprof = stopprof.values.T
else:
if opts.verbose:
logprint('Calculating metagene')
startnt = (-abs(opts.startrange[0])*3, abs(opts.startrange[1])*3) # force <=0 and >= 0 for the bounds
stopnt = (-abs(opts.stoprange[0])*3, abs(opts.stoprange[1])*3)
if stopnt[0] >= -6:
raise ValueError('STOPRANGE must encompass at least 3 codons prior to the stop')
min_AAlen = (startnt[1]-stopnt[0])/3 # actually should be longer than this to ensure at least one codon in the body
startlen = startnt[1]-startnt[0]
stoplen = stopnt[1]-stopnt[0]
workers = mp.Pool(opts.numproc)
(startprof, cdsprof, stopprof, num_cds_incl) = [sum(x) for x in zip(*workers.map(_get_annotated_counts_by_chrom, chroms))]
workers.close()
startprof /= num_cds_incl # technically not necessary, but helps for consistency of units across samples
cdsprof /= num_cds_incl
stopprof /= num_cds_incl
pd.concat((pd.DataFrame(data=startprof.T,
index=pd.MultiIndex.from_tuples([('START', x) for x in range(*startnt)], names=['region', 'position']),
columns=pd.Index(rdlens, name='rdlen')),
pd.DataFrame(data=cdsprof.T,
index=pd.MultiIndex.from_tuples([('CDS', x) for x in range(3)], names=['region', 'position']),
columns=pd.Index(rdlens, name='rdlen')),
pd.DataFrame(data=stopprof.T,
index=pd.MultiIndex.from_tuples([('STOP', x) for x in range(*stopnt)], names=['region', 'position']),
columns=pd.Index(rdlens, name='rdlen')))) \
.to_csv(metafilename, sep='\t')
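    # The metagene file written above (and re-read in the branch before this one) is tab-delimited with a
    # region/position row index and one column per read length; illustrative layout only (values are made up):
    #   region  position  28      29      30
    #   START   -3        0.12    0.31    0.08
    #   START   -2        0.05    0.22    0.11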
catfields = ['chrom', 'strand', 'codon', 'orftype']
if not opts.noregress:
if opts.verbose:
logprint('Calculating regression results by chromosome')
workers = mp.Pool(opts.numproc)
if opts.startonly:
(orf_strengths, start_strengths) = \
[pd.concat(res_dfs).reset_index() for res_dfs in zip(*workers.map(_regress_chrom, chroms))]
if opts.verbose:
logprint('Saving results')
for catfield in catfields:
if catfield in start_strengths.columns:
start_strengths[catfield] = start_strengths[catfield].astype('category') # saves disk space and read/write time
if catfield in orf_strengths.columns:
orf_strengths[catfield] = orf_strengths[catfield].astype('category') # saves disk space and read/write time
with pd.HDFStore(regressfilename, mode='w') as outstore:
outstore.put('orf_strengths', orf_strengths, format='t', data_columns=True)
outstore.put('start_strengths', start_strengths, format='t', data_columns=True)
else:
(orf_strengths, start_strengths, stop_strengths) = \
[pd.concat(res_dfs).reset_index() for res_dfs in zip(*workers.map(_regress_chrom, chroms))]
if opts.verbose:
logprint('Saving results')
for catfield in catfields:
if catfield in start_strengths.columns:
start_strengths[catfield] = start_strengths[catfield].astype('category') # saves disk space and read/write time
if catfield in orf_strengths.columns:
orf_strengths[catfield] = orf_strengths[catfield].astype('category') # saves disk space and read/write time
if catfield in stop_strengths.columns:
stop_strengths[catfield] = stop_strengths[catfield].astype('category') # saves disk space and read/write time
with pd.HDFStore(regressfilename, mode='w') as outstore:
outstore.put('orf_strengths', orf_strengths, format='t', data_columns=True)
outstore.put('start_strengths', start_strengths, format='t', data_columns=True)
outstore.put('stop_strengths', stop_strengths, format='t', data_columns=True)
workers.close()
if opts.verbose:
logprint('Tasks complete')