-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.py
executable file
·768 lines (618 loc) · 47.2 KB
/
pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
#!/usr/bin/env python2
'''
Run this script with --help to see some examples use cases and command line option descriptions.
This is a python2.x script that generates a UNIX shell script, which then invokes external tools to handle:
* Alignment of FASTQ -> BAM
* RNA-SEQ: Counting of reads (using Subread FeatureCounts) & differential expression (via an edgeR script in R)
* CHIP-SEQ: Peak-calling using GEM and/or BCP
It requires at least python2.7 due to the use of the 'argparse' library.
Note that a lot of external software is used; check the top of the source file for the (hard-coded) paths to
programs like java (the JAVA_PATH variable), GEM (GEM_JAR_PATH), etc...
'''
from __future__ import print_function # sets print("...") to have parens like in python 3
from __future__ import division # defaults to floating point division. No more "1/2 == 0"
import sys
import pdb #pdb.set_trace() ## Python Debugger! See: http://aymanh.com/python-debugging-techniques
import os.path
import argparse # requires python 2.7
#import re
global_file_paths_already_aligned = dict() # Remember which files we are aligning, so we don't try to double-align a file and overwrite it
global_annot = dict()
# ==================================================================================
# ==================================================================================
# ======= Paths to the programs we need and various other hard-coded things.
# ======= Programs are currently assumed to be on the user's $PATH ========
global_annot['hg19' ] = "/data/info/genome/hg19_ensembl_igenome_with_chr/hg19.chr"
global_annot['mm9' ] = "/data/info/genome/mm9_ensembl_igenome_with_chr/mm9.chr"
global_annot['danRer7'] = "/data/info/genome/danRer7_ensembl_igenome_ucsc/danRer7_ucsc"
global_annot['galGal4'] = "/data/info/genome/galGal4_ensembl_igenome_ucsc/galGal4_ucsc"
TOPHAT_PATH = "tophat"
BOWTIE_PATH = "bowtie2"
BAM2BED_PATH = "bam2bed"
SAMTOOLS_PATH = "samtools"
HTSEQ_COUNT_PATH = "htseq-count"
RSCRIPT_PATH = "Rscript" # <-- part of the default R install
EDGER_SCRIPT = "/data/projects/kp-600-b2b-osono-data-pipeline-run-feb-16/B-2016-11-November/run_edgeR_diff_expr.R" # this is a CUSTOM SCRIPT that we run
GEM_JAR_PATH = "/data/applications/2015_06/bin/gem.jar" # peak caller for TF / narrow peaks with motifs
BCP_HM_PATH = "BCP_HM" # Peak caller, good on broad peaks. Not the best for motifs, but still pretty good (see Reuben et al's paper)
JAVA_PATH = "java" # needed for GEM.jar, the GEM ChIP peak calling program. GEM is best at narrow peaks with particular sequence motifs.
DEFAULT_ALIGN_BOWTIE_DIR = "Pipeline_01b_Align_Bowtie_Dir"
DEFAULT_ALIGN_TOPHAT_DIR = "Pipeline_01t_Align_Tophat_Dir"
DEFAULT_HTSEQ_COUNT_DIR = "Pipeline_02_HTSeq_Count_Dir"
DEFAULT_EDGER_DIFF_EXPR_DIR = "Pipeline_03_EdgeR_Diff_Expr_Dir"
DEFAULT_BCP_PEAK_DIR = "Pipeline_05b_BCP_Peak_Dir"
DEFAULT_GEM_PEAK_DIR = "Pipeline_05g_GEM_Peak_Dir"
CHIP_BCP_STR = "BCP"
CHIP_GEM_STR = "GEM"
GEM_READ_DIST_FILE = "/data/projects/kp-600-b2b-osono-data-pipeline-run-feb-16/A-2016-08-August/github_seqpipeline/resources/GEM_Read_Distribution_default.txt"
print("Warning: note that the gem read distribution file is a HARD CODED file path right now!")
GEM_RAM_GB = 25 # in gigabytes. Sometimes crashes if it's < 10
TOPHAT_N_THREADS = 4 # Somewhat arbitrary
BOWTIE_N_THREADS = 4
SAMTOOLS_N_THREADS = BOWTIE_N_THREADS
# You probably won't need to change anything below... hopefully
# ==================================================================================
# ==================================================================================
EXIT_CODE_IF_MISSING_FILE = 49 # just some arbitrary non-zero exit code number
LITERAL_BACKSLASH = "\\" # <-- reduces to just one backslash
LITERAL_DQUOTE = "\""
ASSAY_RNA = 1 # <-- Arbitrary. could theoretically be an Enum
ASSAY_CHIP = 2
DEFAULT_OUTPUT_BASEDIR = "./" # write files to the current directory if no one says otherwise
DELIM_FOR_ARGS_TO_EDGER_SCRIPT = ',' # should be a comma normally. Do not change this unless the edgeR R script (WHICH IS A SEPARATE FILE!!!!) also changes. Do not change this without changing the R file too!
opt = None # <-- "opt" is a global variable that stores the cmd line options. This should really be the ONLY non-constant global!
def enquote(s):
# should we check to see if s has quotes in it already? Maybe.
return("\"" + str(s) + "\"")
def verbosePrint(m):
if (opt.verbose): printStderr(m)
return
def progressPrint(m):
printStderr(m) # print even if we are NOT in verbose mode
return
def argAssert(thing, message): # for things that involve user input somehow
if not thing: dieBadArgs(message)
def xAssert(thing, message="No specific message"): # for detecting programming bugs only
if not isinstance(message, str):
raise "wrong message argument--must be a string"
if not thing: raise(Exception(message))
def isSameLenOrNone(x, y):
return (x is None) or (y is None) or (len(x) == len(y))
def printStderr(*args, **keyargs):
print(*args, file=sys.stderr, **keyargs)
def pathWithBase(basedir, path):
return path if basedir is None else os.path.join(basedir, path)
def splitFileList(string_of_files, delim, basedir=None):
# Splits up a string
if string_of_files is None:
return None
assertType(string_of_files, str)
assertType(delim, str)
list_of_files = [pathWithBase(basedir, fff) for fff in string_of_files.split(delim)] # python list comprehension---add the basedir to the beginning of each file path
return list_of_files
def getListItemUnlessNone(theList, index):
return None if (theList is None) else theList[index]
#def tar_up_that_annotation():
# '''Just a function for generating the list of files to be tarred up, if we want to send this annotation to someone else. Not really production-ready code here!'''
# for assembly in global_annot:
# a = getAnnot(assembly)
# for subitem in a:
# path = a[subitem]
# if subitem == 'bowtie_index':
# path += "*.bt2"
# pass
# else:
# if not (os.path.isfile(path) or os.path.isdir(path) or os.path.isfile(path + ".1.bt2")):
# print("**WARNING**: file or path above does not appear to exist! (This one: " + path + ")")
# pass
# pass
# print(path)
# pass
# pass
# return
def dieBadArgs(errMsg):
sys.exit("ERROR: Problem with the command line arguments! Specifically: " + errMsg + ". Try using '--help' to see all the possible arguments.")
return
def handleCmdLineArgs():
DESC = '''
A program for running a standard ChIP- or RNA-seq pipeline.
Written for Python 2.7+ by Alex Williams, 2016. (Python 2.7 is needed for the 'argparse' library)
What this program does:
1) a) First, you provide the details about an experiment and the associated samples names and input FASTQ files.
b) Then, you run this 'pipeline.py' program. An example command is listed below in section (2)
c) 'pipeline.py' does not actually perform any bioinformatics analysis directly---instead,
it generates a shell script with a list of commands.
d) You then would want to run that shell script (let's call it 'script.sh') either
directly ('bash script.sh') or by submitting it to a compute
cluster (e.g. 'qsub --options-here script.sh')
e) When the script runs, it will take quite some time. The script will (slowly) generate a bunch of output
directories, each with one step of the analysis process. The default names are:
* Pipeline_01t_Alignment_Tophat
* Pipeline_01b_Alignment_Bowtie
* Pipeline_02_HTSeq_Count_Dir
* Pipeline_03_EdgeR_Diff_Expr_Dir
* ... etc.
Important caveats to be aware of:
* You will need to specify FULL PATHS to everything if you are planning on running the output script on a **cluster**!
2) You run a command line invocation like:
RNA-SEQ example:
python2 pipeline.py --script="script_test.sh" --outdir="/data/projects/kp-600-b2b-osono-data-pipeline-run-feb-16/B-2016-11-November/" --basedir="/data/projects/kp-600-b2b-osono-data-pipeline-run-feb-16/B-2016-11-November/test_data/" --experiment-id="Test_Experiment" --sample-ids="A1,A2,B1,B2" --rna-samples=a1.mm9.chr19.fq.gz,a2.mm9.chr19.fq.gz,b1.mm9.chr19.fq.gz,b2.mm9.chr19.fq.gz --groups=1,1,2,2 --species=mm9
* Note that ***FULL PATHS** for 'outdir' and 'basedir' are specified since this example is for a cluster.
ChIP-SEQ example:
python2 pipeline.py \\
--script="chip_test.sh" \\
--outdir="/data/projects/kp-600-b2b-osono-data-pipeline-run-feb-16/B-2016-11-November/" \\
--basedir="/data/projects/kp-600-b2b-osono-data-pipeline-run-feb-16/B-2016-11-November/test_data/" \\
--experiment-id="Test_Experiment_ChIP" \\
--sample-ids="ALPHA,BETA" \\
--groups=1,2 \\
--chip-samples=a1.mm9.chr19.fq.gz,b1.mm9.chr19.fq.gz \\
--chip-inputs=a2.mm9.chr19.fq.gz,b2.mm9.chr19.fq.gz \\
--species=mm9 \\
3) Details: the '--outdir' is the location where you want the script to generate OUTPUT
Details: the '--script' is the name of the actual script to generate. One output script is generated per invocation of this program.
Note that this is actual test data (a small set of reads from mouse chr19) that you can download:
a1.mm9.chr19.fq.gz (these reads were pre-selected to align to mm9, so 100% of them should align.)
a2.mm9.chr19.fq.gz (you can of course align them to any mouse genome build, however)
a3.mm9.chr19.fq.gz
b1.mm9.chr19.fq.gz
b2.mm9.chr19.fq.gz
b3.mm9.chr19.fq.gz
3) The output is a shell script that you can then submit on your cluster.
'''
EPILOG = '''
Version history: (none yet)
'''
# Check for certain files that are maybe-required first...
for fff in [EDGER_SCRIPT, GEM_JAR_PATH, global_annot['hg19'], global_annot['mm9'], global_annot['danRer7'], global_annot['galGal4'] ]:
if not (os.path.isfile(fff) or os.path.isfile(fff + ".1.bt2")):
print("**WARNING**: Note that we can't seem to find the following (probably hard-coded) path <" + fff + ">. Probably this is because it's HARD CODED on the filesystem right now!")
#sys.sleep(1)
pass
pass
parser = argparse.ArgumentParser(description=DESC, epilog=EPILOG, add_help=True, formatter_class=argparse.RawTextHelpFormatter)
#, formatter_class=argparse.RawTextHelpFormatter)
#parser.add_argument('--sup', '-s', dest='sup', default=None, metavar="\"UMI_LEFT,LINKER_LEFT,UMI_RIGHT,LINKER_RIGHT\"", type=str, help="Default: if unspecified (or set as --sup=GUESS), we will just guess the UMI lengths by looking at the consensus sequence of the first 1000 reads (and the linker wil be assumed to be ZERO bases--it will be included as part of the 'payload' sequence). 'Sup' stands for 'Supplementary' lengths (note: numbers, NOT actual sequence) for the UMI_LEFT, LINKER_LEFT, UMI_RIGHT, and LINKER_RIGHT.\nExample: --sup=\"8,2,2,5\"\n * for a sample with an 8+5 UMI and linkers of length 2.")
parser.add_argument('-v', '--verbose', dest='verbose', action='store_true', help="Verbose debugging")
parser.add_argument('--groups', '-g', type=str, default=None, dest='groups', help="Specify experiment groups (numeric). Must be the same length as the number of files, and must be sequentially numbered with the lowest group being 1. Can be 'NA' or blank for samples that are not associated with a group (like ChIP inputs). OK example: --groups='1,1,2,3,,,4,4' (Note the two blank samples!).")
parser.add_argument('--basedir', '-b', type=str, default=None, dest='basedir', help="Optional. Specify a base directory to prepend to all paths.!")
parser.add_argument('--chip-samples', '--c1', type=str, default=None, dest='chip_samples', help="ChIP-seq only: Specify ChIP-seq sample files, comma-delimited. Must match the order of the corresponding --chip_inputs files!")
parser.add_argument('--chip-inputs' , '--p1', type=str, default=None, dest='chip_inputs', help="ChIP-seq only: Specify ChIP-seq input files, comma-delimited. Must match the order of the corresponding --chip_samples files!")
parser.add_argument('--peak-caller' , '--pc', type=str, default=None, dest='peak_caller', help="ChIP-seq only: Specify which peak caller to use. Must be either 'gem' or 'bcp' (case-insensitive). No default.!")
parser.add_argument('--rna-samples', '--r1', type=str, default=None, dest='rna_samples', help="RNA-seq only: Specify RNA-seq input files, comma-delimited.")
parser.add_argument('--sample-mates', '--c2', '--r2', type=str, default=None, dest='sample_mates', help="The mate pair associatd with either the CHIP or RNA samples. Comma-delimited just like the normal samples. Example: --r1=DrugX.fq,DrugY.fq --r2=DrugXpair2.fq,DrugYpair2.fq")
parser.add_argument('--input-mates', '--p2', type=str, default=None, dest='input_mates', help="The mate pair associated with the second end of a pair in a CHIP sample. Comma-delimited just like the normal samples.")
parser.add_argument('--species', '-s', type=str, default=None, dest='species', help="Required: The short name for the species.")
parser.add_argument('--sample-ids', '--sids', type=str, default=None, dest='sample_ids', help="Required: The short sample ID names, comma-delimited.")
parser.add_argument('--experiment-id', '--eids', type=str, default=None, dest='exper_id', help="Required: The single experiment ID (name).")
parser.add_argument('--delim', type=str, default=',', dest='delim', help="Default file/group delimiter. Normally should be a comma, unless you're doing something very unusual.")
parser.add_argument('--script', type=str, default=None, dest='scriptname', help="The output script file to write, e.g. --script='/path/to/script.sh' ")
parser.add_argument('--outdir', '-o' , type=str, default=None, dest='outdir', help="Optional. Specify a base output directory. Must already exists. Default is the current directory.")
parser.add_argument('--debug-tar-annotations', dest="debug_tar_annotations", action='store_true', help="A debug command only for setting up this software. Generates a tar command that can be used to save the annotation data in one place.")
global opt # <-- critical, since we modify (global variable) "got" below. Do not remove this!
opt = parser.parse_args()
#if (opt.debug_tar_annotations):
# tar_up_that_annotation()
# print("Exiting early since this is just a command for tarring files.")
# return
assay = None
argAssert(opt.species is not None, "A species must be specified! It must be one species for ALL samples currently. Example species: 'mm9' or 'danRer7' or 'hg19'.")
argAssert(opt.rna_samples is None or (opt.chip_samples is None and opt.chip_inputs is None), "You cannot specify BOTH RNA-seq and ALSO ChIP-seq at the same time! Pick either RNA-seq or ChIP-seq")
argAssert(opt.groups is not None, "You must specify the groups that each sample belongs to, or either 'NA' or a blank value for files that are not direclty associated with a group (such as ChIP inputs).")
opt.outdir = DEFAULT_OUTPUT_BASEDIR if opt.outdir is None else opt.outdir
if (opt.rna_samples is not None):
assay = ASSAY_RNA
samp1 = splitFileList(opt.rna_samples , delim=opt.delim, basedir=opt.basedir)
samp2 = splitFileList(opt.sample_mates, delim=opt.delim, basedir=opt.basedir)
inp1 = None # Rna-seq doesn't have 'input' files
inp2 = None # Rna-seq doesn't have 'input' files
elif (opt.chip_samples is not None):
argAssert(opt.chip_inputs is not None, "If you specify --chip-samples, you have to also specify --chip-inputs, but it appears that was NOT specified.")
assay = ASSAY_CHIP
samp1 = splitFileList(opt.chip_samples, delim=opt.delim, basedir=opt.basedir)
samp2 = splitFileList(opt.sample_mates, delim=opt.delim, basedir=opt.basedir)
inp1 = splitFileList(opt.chip_inputs, delim=opt.delim, basedir=opt.basedir)
inp2 = splitFileList(opt.input_mates, delim=opt.delim, basedir=opt.basedir)
# Apparently we're running a ChIP-seq project
argAssert(len(inp1) == len(samp1), "Somehow your --chip-inputs (which is length " + str(len(inp1)) + ") was a different length from the --chip-samples (which is length " + str(len(samp1)) + "! Fix this.")
argAssert(isSameLenOrNone(inp1, inp2), "Forward and reverse MATE PAIRS must have the same number of files!")
else:
dieBadArgs("You have to AT LEAST have an RNA-seq sample to process or a ChIP-seq sample to process. Specify at least one!")
pass
argAssert(opt.sample_ids is not None, "Sample IDs must be specified as a comma-delimited list. For example: --sample-ids='S123,S191,S990'. One per fastq file pair.")
sample_id_list = opt.sample_ids.split(opt.delim)
argAssert(len(samp1) == len(sample_id_list), "The number of sample IDs and the number of input fastq files is DIFFERENT, which indicate a problem")
argAssert(opt.exper_id is not None, "An experiment ID (--experiment-id='Something') is required, but one was not specified")
argAssert(isSameLenOrNone(samp1, samp2), "Forward and reverse MATE PAIRS must have the same number of files!")
groups = opt.groups.split(opt.delim)
are_groups_ok = [(g.isdigit() and int(g) > 0) or g == '' or g is None or g.upper() == 'NA' for g in groups]
argAssert(all(are_groups_ok), "Some of your input groups were not in the valid set of being either a number >= 1, the literal text 'NA', or totally blank. Fix the input groups!")
argAssert(len(groups) == len(samp1), "Somehow your '--groups' (length " + str(len(groups)) + ") was not equal in length to your specified number of sample files (which had a length of " + str(len(samp1)) + ")!")
verbosePrint("Groups: " + str(groups))
# Finally, actually generate the commands...
argAssert(opt.species in global_annot, "Your specified --species (a genome assembly) was not in our recognized list, unfortunately. Check your capitalization and spaces.")
xcmd = OurScript()
base_bam_subdirectory_only = DEFAULT_ALIGN_TOPHAT_DIR if (assay == ASSAY_RNA) else DEFAULT_ALIGN_BOWTIE_DIR # this changes based on whether we're using Tophat or Bowtie
eee = Experiment(expName=opt.exper_id, species=opt.species, sid_list=sample_id_list
, fq1_list=samp1, fq2_list=samp2 # Normal fastq files for the samples
, inp1_list=inp1, inp2_list=inp2 # Inputs for ChIPseq
, base_bam_dir=pathWithBase(opt.outdir, base_bam_subdirectory_only)
, base_count_dir=pathWithBase(opt.outdir, DEFAULT_HTSEQ_COUNT_DIR)
, base_edger_dir=pathWithBase(opt.outdir, DEFAULT_EDGER_DIFF_EXPR_DIR)
, base_bcp_peak_dir=pathWithBase(opt.outdir, DEFAULT_BCP_PEAK_DIR)
, base_gem_peak_dir=pathWithBase(opt.outdir, DEFAULT_GEM_PEAK_DIR)
)
xcmd.append("echo 'First, we are checking to make sure all the FASTQ input files exist...'")
xcmd.appendCheckForRequiredFiles(file_tup=eee.getTupleOfAllFqFiles())
xcmd.append("")
if assay == ASSAY_RNA:
xcmd.append("###### Handling RNA-seq here ")
handleRNA(groups=groups, script_obj=xcmd, exper=eee)
elif assay == ASSAY_CHIP:
xcmd.append("###### Handling ChIP-seq here ")
argAssert(opt.peak_caller is not None and opt.peak_caller.upper() in ("BCP", "GEM"), "If you are running chIP-seq peak calls, then be sure to specify the peak caller as either BCP or GEM, using --peak-caller=BCP or --peak-caller=GEM .")
handleCHIP(groups=groups, script_obj=xcmd, exper=eee, caller=opt.peak_caller)
else:
sys.exit("Something went wrong (programming bug)---this assay type is not recognized!")
pass
xcmd.writeToDisk(scriptname=opt.scriptname)
return # End of command-line-reading function
def assertType(x, oktype, none_ok=False):
if none_ok and x is None:
return # this is ok!
if (not isinstance(oktype, type)):
raise Exception("The 'oktype' argument must be a 'type'.")
# Require that everything in the list_of_things is one of the "oktype"
xAssert(isinstance(x, oktype), "We expected the variable to be of type "+str(oktype)+", but instead it was a "+str(type(x))+"")
def withoutNone(aaa):
if aaa is None: # turn a 'None' input into an empty list
aaa = ()
xAssert(isinstance(aaa, (tuple, list)), "problem! incorrect argument to 'withoutNone', which must be a tuple or list, or None.")
return tuple([x for x in aaa if x is not None]) # remove any 'None' elements from the list
def maybeNoneDict(keys, values):
if values is None:
return dict.fromkeys(keys) # keys are normal, but values are all 'None' if a single "none" was passed in as 'values'
else:
return dict(zip(keys, values)) # just normally initialize a python dictionary with keys & matching values
class Experiment(object):
'''A class that keeps track of all our samples for an experiment.
This is the class that can answer questions like "what are the FASTQ files associated with a specific sample?"
Also this is where we remember all the sample names.
This is basically where ALL THE DATA about an experiment is stored.'''
def __init__(self, expName, species, sid_list, fq1_list, fq2_list, inp1_list, inp2_list, base_bam_dir, base_count_dir, base_edger_dir, base_bcp_peak_dir, base_gem_peak_dir):
xAssert(isinstance(sid_list, (list, tuple)))
xAssert(isinstance(expName, (str)))
self.expName = expName
self.sids = sid_list # list of sample IDs
self.species = {key:species for key in self.sids} # dict
self.f1 = maybeNoneDict(self.sids, fq1_list)
self.f2 = maybeNoneDict(self.sids, fq2_list) # f2 = mate pair corresponding to f1
self.i1 = maybeNoneDict(self.sids, inp1_list) # <-- INPUT (control) for ChIP. Not relevant for RNA-seq
self.i2 = maybeNoneDict(self.sids, inp2_list) # mate pair of i1, if any ("None" if there isn't one)
self.base_bam_dir = base_bam_dir
self.base_count_dir = base_count_dir
self.base_edger_dir = base_edger_dir
self.base_bcp_peak_dir = base_bcp_peak_dir
self.base_gem_peak_dir = base_gem_peak_dir
pass
def sampleHasChIPControl(self, sampName):
return self.i1[sampName] is not None
def getBamDirForSample(self, sampName):
# Example: my_bam_files/EXPERIMENT_27/SAMPLE_18/
return os.path.join(self.base_bam_dir, self.expName, sampName) # Note that this makes TWO subdirectories!
def getBamDirForChIPControlOfSample(self, sampName):
# Example: my_bam_files/EXPERIMENT_27/SAMPLE_18/
CHIP_CONTROL_SUFFIX="_CHIP_CONTROL"
return os.path.join(self.base_bam_dir, self.expName, sampName+CHIP_CONTROL_SUFFIX) # Note that this makes TWO subdirectories!
def getSampleNames(self): # returns: list
return self.sids
def getFqPairForSample(self, sampName): # returns (2-element tuple)
return (self.f1[sampName], self.f2[sampName])
def getFqControlsForSample(self, sampName): # returns (2-element tuple)
return (self.i1[sampName], self.i2[sampName]) # The inputs/controls for ChIPSeq
def getTupleOfAllFqFiles(self): # Just a list of all *unique* fastq files, not in any particularly significant order. No "none"s
return withoutNone(tuple(set(self.f1.values()).union(self.f2.values()).union(self.i1.values()).union(self.i2.values())))
def getSpeciesForSample(self, sampName): # returns: string (single assembly name)
return self.species[sampName]
def getAlignedBamForSample(self, sampName): # returns: string (single file path)
return os.path.join(self.getBamDirForSample(sampName), "accepted_hits.bam")
def getAlignedChIPControlBamForSample(self, sampName): # returns: string (single file path)
# Returns a file path, OR rturns None if this sample doesn't have a corresponding chip input.
return os.path.join(self.getBamDirForChIPControlOfSample(sampName), "accepted_hits.bam") if self.sampleHasChIPControl(sampName) else None
def getCountDirForSample(self, sampName): # returns: string (a single directory full path)
return os.path.join(self.base_count_dir, self.expName, sampName)
def getCountFileForSample(self, sampName): # returns: string (single file path)
return os.path.join(self.getCountDirForSample(sampName), "htseq_count."+self.expName+"."+sampName+".matrix.txt")
def getAllCountFiles(self): # returns: **list** of file paths
return [self.getCountFileForSample(x) for x in self.getSampleNames()] # <-- list comprehension: return a LIST of all count file paths
def getEdgeRDiffExprDir(self): # returns: string (single full file path)
return os.path.join(self.base_edger_dir)
def getEdgeRDiffExprFile(self): # returns: string (single full file path)
return os.path.join(self.getEdgeRDiffExprDir(), "edgeR." + self.expName + ".out.txt")
def getBcpPeakDir(self, sampName): # returns: string (single full file path)
return os.path.join(self.base_bcp_peak_dir, self.expName, sampName)
def getBcpPeakCallBedFile(self, sampName): # returns: string (single full file path)
return os.path.join(self.getBcpPeakDir(sampName), (sampName+"_BCP_peaks.bed"))
def getGemPeakDir(self, sampName): # returns: string (single full file path)
return os.path.join(self.base_gem_peak_dir, self.expName, sampName)
def getGemOutHTMLFile(self, sampName): # returns: string (single full file path)
return os.path.join(self.getGemPeakDir(sampName), sampName + "_result.htm")
class OurScript(object):
"""Script text for writing to a file"""
def __init__(self):
self.lines = ["#!/bin/bash -u"
, "set -e"
, "set -o pipefail"]
self.lines.append("")
self.lines.append("### You can run this script with a command like one of these:")
self.lines.append("### * bash ./this_script_name.sh")
self.lines.append("### Or submit it to a cluster: * qsub --some-options ./this_script_name.sh")
self.lines.append("")
pass
def append(self, text):
self.lines.append(text) # add exactly one line
return
def appendMkDir(self, dirpath):
self.append("mkdir -p " + enquote(dirpath) + "")
return
def appendCheckForRequiredFiles(self, file_tup=()):
# File list can be a list of files or NONE (none just means we don't do anything)
assertType(file_tup, tuple, none_ok=True)
file_tup = withoutNone(file_tup) # make sure it's a [ ] list, and not just like ONE file someone passed in
for f in file_tup:
self.append("")
self.append("# ===== Checking for a required file")
self.append("if [[ ! -e " + enquote(f) + " ]]; then")
self.append(" echo '[ERROR] Cannot find the following required file, which we expect to exist: <" + f + ">" + "'")
self.append(" exit " + str(EXIT_CODE_IF_MISSING_FILE)) # exit with a somewhat arbitrary code number if there is a missing file
self.append("fi")
self.append("")
pass
return
def appendCommandUnlessFilesExist(self, check_files=(), cmd=None, required_prereqs=(), required_output=() ):
# cmd will ONLY be executed if there is one or more filenames in "check_files" that do not exist.
# Note: check_files absolutely CANNOT be empty or None!
check_files = tuple(check_files) if isinstance(check_files, str) else check_files # wrap the string in a tuple just in case...
required_prereqs = tuple(required_prereqs) if isinstance(required_prereqs, str) else required_prereqs # wrap the string in a tuple just in case...
required_output = tuple(required_output) if isinstance(required_output, str) else required_output # wrap the string in a tuple just in case...
assertType(cmd, str)
assertType(check_files, tuple)
assertType(required_prereqs, tuple)
assertType(required_output, tuple)
self.appendCheckForRequiredFiles(required_prereqs) # make sure any prereqs (if any) actually exist
check_files = withoutNone(check_files) # make sure it's a [ ] list, and not just like ONE file someone passed in
xAssert(len(check_files) >= 1)
multiIfString = "if " + " && ".join( [ "[[ -e " + enquote(g) + " ]]" for g in check_files ] ) # <-- python list comprehension that generates results like "if [[ -e myFile ]] && [[ -e file2 ]] && [[ -e file3 ]]"
checked_files_str = ", ".join(check_files) # <-- python list comprehension that generates results like "if [[ -e myFile ]] && [[ -e file2 ]] && [[ -e file3 ]]"
self.append("\n#-----------------------")
self.append(multiIfString)
self.append("then")
self.append(" echo '[SKIPPING] All the specified output files (" + checked_files_str + ") already exist--so we are skipping this step' ")
self.append("else")
self.append(" echo '[RUNNING]...'")
self.append(cmd)
self.append("fi")
self.append("#-----------------------\n")
self.appendCheckForRequiredFiles(required_output) # make sure we generated the required files (if any)
return
def writeToDisk(self, scriptname=None):
if (scriptname is None):
print("\n".join(self.lines)) # just print to stdout
else:
scriptname = scriptname
with open(scriptname, 'w') as fff:
fff.write("\n".join(self.lines) + "\n")
pass
pass
progressPrint("Wrote script data to the following filename: " + scriptname)
return
def handleRNA(groups, script_obj, exper):
xAssert(isinstance(exper, (Experiment)))
# Step 1/3: Alignment....
for sampName in exper.getSampleNames():
(s1, s2) = exper.getFqPairForSample(sampName)
generateAlignmentCmd(fastq1=s1, fastq2=s2, species=exper.getSpeciesForSample(sampName), outdir=exper.getBamDirForSample(sampName), outbam=exper.getAlignedBamForSample(sampName), aligner=TOPHAT_PATH, script_obj=script_obj)
pass
# STEP 2/3: Counts....
for sampName in exper.getSampleNames():
theGtf = getAnnot(exper.getSpeciesForSample(sampName), "gtf")
theBam = exper.getAlignedBamForSample(sampName)
generateHtseqCountCmd(bam=theBam, gtf=theGtf, outdir=exper.getCountDirForSample(sampName), outfile=exper.getCountFileForSample(sampName), script_obj=script_obj)
pass
# STEP 3/3: Differential expression
generateEdgeRCmd(count_files=exper.getAllCountFiles(), groups=groups, outdir=exper.getEdgeRDiffExprDir(), outfile=exper.getEdgeRDiffExprFile(), script_obj=script_obj) # Note: this will call a custom edgeR script, which is also in this same repository.
return
def handleCHIP(groups, script_obj, exper, caller=None):
# Alignment...
assertType(caller, str)
for sampName in exper.getSampleNames():
species = exper.getSpeciesForSample(sampName)
(s1, s2) = exper.getFqPairForSample(sampName)
expr_bam_dir = exper.getBamDirForSample(sampName)
generateAlignmentCmd(fastq1=s1, fastq2=s2, species=species, outdir=expr_bam_dir, outbam=exper.getAlignedBamForSample(sampName), aligner=BOWTIE_PATH, script_obj=script_obj)
(p1, p2) = exper.getFqControlsForSample(sampName)
chip_control_bam_dir = exper.getBamDirForChIPControlOfSample(sampName)
generateAlignmentCmd(fastq1=p1, fastq2=p2, species=species, outdir=chip_control_bam_dir, outbam=exper.getAlignedChIPControlBamForSample(sampName), aligner=BOWTIE_PATH, script_obj=script_obj)
# Note that although we generate redundant alignment commands for controls appearing multiple times, only one of those actually gets run, so things should be A-OK
pass
# Peak calling...
for sampName in exper.getSampleNames():
if caller.upper() == CHIP_GEM_STR.upper(): generateGemPeakCmd(exper=exper, sampName=sampName, script_obj=script_obj)
elif caller.upper() == CHIP_BCP_STR.upper(): generateBcpPeakCmd(exper=exper, sampName=sampName, script_obj=script_obj)
else: raise Exception("Programming bug -- unsupported peak caller")
pass
return
def getAnnot(assembly, key=None):
# Sort of a weird wrapper to an array. Probably not the most elegant way to do this, but should be OK for our low-performance needs. Easy to change in the future!
# 'None' is how you get the whole data structure
xAssert(key in [None, 'gtf', 'bowtie_index', 'fasta', 'fa_by_chrom', 'chr_sizes'], "Invalid annotation requested") # sanity check
vals = dict()
vals['bowtie_index'] = global_annot[assembly]
vals['gtf' ] = global_annot[assembly] + ".gtf"
vals['fasta' ] = global_annot[assembly] + ".fa"
vals['fa_by_chrom' ] = global_annot[assembly] + "_chromosome_fastas"
vals['chr_sizes' ] = global_annot[assembly] + ".chrom.sizes.txt"
return vals if key is None else vals[key]
def generateHtseqCountCmd(bam, gtf, outdir, outfile, script_obj):
sortedbam = bam + ".sorted_by_name.bam"
samtools_sort_by_name_cmd = " ".join([SAMTOOLS_PATH, "sort -n", bam, ">", sortedbam])
ht_params = " --format=bam --order=name --mode=intersection-nonempty --stranded=no --minaqual=10 --type=exon --idattr=gene_id "
ht_cmd = " ".join([HTSEQ_COUNT_PATH, ht_params, sortedbam, gtf, ">", outfile])
script_obj.appendMkDir(outdir) # this directory must exist before we try moving any files to it...
script_obj.appendCommandUnlessFilesExist(check_files=(sortedbam,), cmd=samtools_sort_by_name_cmd, required_prereqs=(bam,) , required_output=(sortedbam,)) # sort by NAME and not position! That's just a thing that HTSeq needs/prefers if it's to handle paired-end files properly
script_obj.appendCommandUnlessFilesExist(check_files=(outfile,) , cmd=ht_cmd , required_prereqs=(sortedbam,), required_output=(outfile,))
return
def generateBcpPeakCmd(exper, sampName, script_obj):
'''BCP is a good all-purpose peak caller. It works especially well for histone-related (broad) peaks. We do not use it for the narrow TF-related peaks, where GEM is preferred.'''
species = exper.getSpeciesForSample(sampName)
expBam = exper.getAlignedBamForSample(sampName)
ctrlBam = exper.getAlignedChIPControlBamForSample(sampName)
outdir = exper.getBcpPeakDir(sampName)
bed_final_out = exper.getBcpPeakCallBedFile(sampName)
bcp_fragment_size = 200 # "The fragment size to which we extend the reads in pre-processing data step. Default:200bp."
bcp_window_size = 200 # "The window size we apply the adjcaent window in pre-processing data step. Default:200bp."
bcp_p_cutoff = 0.001 # The p_value you want to use for remove false positive based on control data. Range:1e-2---1e-6, default is 1e-3.
# bed_out <- paste0(outPrefix, ".bed") # The results data set with 5 columns you want to output.
# BCP_HM docs are accessible with this unusual command: BCP_HM -h 1
# -1 The ChIP-seq data set you want to input.
# -2 The control/input data set you want to input.
# -f The fragment size to which we extend the reads in pre-processing data step. Default:200bp.
# -w The window size we apply the adjacent window in pre-processing data step. Default:200bp.
# -p The p_value you want to use for remove false positive based on control data. Range:1e-2---1e-6, default is 1e-3.
# -3 The results data set with 5 columns you want to output.
# -h Use "-h 1" to display the help (not just -h)
exp_as_bedfile = expBam + ".converted_to.bed"
ctrl_as_bedfile = ctrlBam + ".converted_to.bed"
# BCP requires BED inputs, not BAM inputs. So we need to convert them using the common tool 'bam2bed'
exp_to_bed_cmd = ''.join([BAM2BED_PATH, " --do-not-sort ", " < " + expBam , " | cut -f 1-3 > ", exp_as_bedfile])
ctrl_to_bed_cmd = ''.join([BAM2BED_PATH, " --do-not-sort ", " < " + ctrlBam, " | cut -f 1-3 > ", ctrl_as_bedfile])
# Note that we cut the files down to just columns 1 through 3! We do not need any of the extra columns
bcp_cmd = BCP_HM_PATH
bcp_cmd += " -1 " + str(exp_as_bedfile) # BED file
bcp_cmd += " -2 " + str(ctrl_as_bedfile) # BED file
bcp_cmd += " -f " + str(bcp_fragment_size)
bcp_cmd += " -w " + str(bcp_window_size)
bcp_cmd += " -p " + str(bcp_p_cutoff)
bcp_cmd += " -3 " + str(bed_final_out)
# There are two versions of BCP - BCP_TF and BCP_HM. The latter is for histone marks - what you are interested in. BCP takes bed files as input.
# Note: BCP_HM has a very confusing help method: you specify "BCP_HM -h 1" for help. The "1" is mandatory.
#This is an example command I ran on a sample of H3K4me3 data:
#BCP_HM -1 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1.bed -2 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878ControlStdAlnRep1.bed -f 200 -w 200 -p 0.001 -3 GM12878_H3K4me3_Rep1_EncodeAlign_results001_HM.bed
#Note, there are changes to this command if you have Chip-exo data. You will need to replace "Read_Distribution_default.txt" by "Read_Distribution_ChIP-exo.txt" and will need to add --smooth 3 option to the commas.
# -Reuben
#BCP_HM -1 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1.bed
# -2 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878ControlStdAlnRep1.bed
# -f 200 -w 200 -p 0.001 -3 GM12878_H3K4me3_Rep1_EncodeAlign_results001_HM.bed
script_obj.appendMkDir(outdir) # this directory must exist before we try moving any files to it...
script_obj.appendCheckForRequiredFiles(file_tup=(expBam, ctrlBam))
# convert the EXPERIMENT bam file to a bed file
script_obj.appendCommandUnlessFilesExist(check_files=(exp_as_bedfile,) , cmd=exp_to_bed_cmd , required_output=(exp_as_bedfile,))
# convert the CONTROL bam file to a bed file
script_obj.appendCommandUnlessFilesExist(check_files=(ctrl_as_bedfile,), cmd=ctrl_to_bed_cmd, required_output=(ctrl_as_bedfile,))
# Finally, run the experiment on the input files
script_obj.appendCommandUnlessFilesExist(check_files=(bed_final_out,) , cmd=bcp_cmd , required_output=(bed_final_out,))
# outPrefix <- gsub("[.](bam|sam|bed)$", "", outPrefix, ignore.case=T, perl=T) # remove any '.bam'/sam/bed from the end. Those are not correct suffixes. We will add the suffix ourselves
#
# # There are two versions of BCP - BCP_TF and BCP_HM. The latter is for histone marks - what you are interested in. BCP takes bed files as input.
# # Note: BCP_HM has a very confusing help method: you specify "BCP_HM -h 1" for help. The "1" is mandatory.
# #This is an example command I ran on a sample of H3K4me3 data:
# #BCP_HM -1 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1.bed -2 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878ControlStdAlnRep1.bed -f 200 -w 200 -p 0.001 -3 GM12878_H3K4me3_Rep1_EncodeAlign_results001_HM.bed
# #Note, there are changes to this command if you have Chip-exo data. You will need to replace "Read_Distribution_default.txt" by "Read_Distribution_ChIP-exo.txt" and will need to add --smooth 3 option to the commas.
# # -Reuben
# #BCP_HM -1 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1.bed
# # -2 /home/rthomas/ChipSeqChallenge/RealData/HistoneData/Monkey/Results/04a.mapping/wgEncodeBroadHistoneGm12878ControlStdAlnRep1.bed
# # -f 200 -w 200 -p 0.001 -3 GM12878_H3K4me3_Rep1_EncodeAlign_results001_HM.bed
#
# # These are intermediate files. The problem is that bcp cannot deal with BAM files directly, so we have to convert them.
# bed_exp <- "/04a.mapping/wgEncodeBroadHistoneGm12878H3k4me3StdAlnRep1.bed" # "The ChIP-seq data set you want to input."
# bed_ctl <- "/wgEncodeBroadHistoneGm12878ControlStdAlnRep1.bed" # "The control/input data set you want to input."
#
# bam2bed_exp_cmd <- agw_construct_cmd(get_exe_path("bam2bed"), " --do-not-sort ", " < ", ctlBam, " > ", bed_exp)
# bam2bed_ctl_cmd <- agw_construct_cmd(get_exe_path("bam2bed"), " --do-not-sort ", " < ", expBam, " > ", bed_ctl)
return # end of def generateBcpPeakCmd(...)
def generateGemPeakCmd(exper, sampName, script_obj):
'''Gem is best for TF-related peaks (ones with sequence motifs)'''
species = exper.getSpeciesForSample(sampName)
outdir = exper.getGemPeakDir(sampName)
expBam = exper.getAlignedBamForSample(sampName)
ctrlBam = exper.getAlignedChIPControlBamForSample(sampName)
gemReadDistFile = GEM_READ_DIST_FILE
GEM_THREADS = 1 # gem uses a ton of threads for some reason, so 1 is even conservative
GEM_QVAL = 2 # sets the q-value threshold (-log10, so 2 = 0.01)
GEM_KMIN = 6 # minimum kmer length. From our default UCSF settings used in the 'Monkey' pipeline
GEM_KMAX = 13 # maximum kmer length. From our default UCSF settings used in the 'Monkey' pipeline
genomeByChromFastaDir = getAnnot(species, "fa_by_chrom")
chrSizesFile = getAnnot(species, "chr_sizes")
cmd = JAVA_PATH + " -Xmx" + str(GEM_RAM_GB) + "G" + " -jar " + GEM_JAR_PATH
cmd += " --t " + str(GEM_THREADS)
cmd += " --q " + str(GEM_QVAL)
cmd += " --k_min " + str(GEM_KMIN) + " --k_max " + str(GEM_KMAX)
cmd += " --d " + gemReadDistFile # readdist file / read distribution file
cmd += " --g " + chrSizesFile
cmd += " --genome " + genomeByChromFastaDir
cmd += " --expt " + expBam
cmd += (" --ctrl " + ctrlBam) if ctrlBam is not None else "" # can be omitted if there is NO input file
cmd += " --f SAM " # <-- "SAM" is required in order to accept SAM or BAM files, otherwise BED files are the default
cmd += " --sl --outBED "
cmd += " --out " + outdir # outprefix creates a new folder with this name, respecting existing subdirectories.
expectedOutFile = exper.getGemOutHTMLFile(sampName)
script_obj.appendMkDir(outdir) # this directory must exist before we try moving any files to it...
script_obj.appendCheckForRequiredFiles(file_tup=(expBam, ctrlBam, chrSizesFile, genomeByChromFastaDir, gemReadDistFile))
script_obj.appendCommandUnlessFilesExist(check_files=(expectedOutFile,), cmd=cmd, required_output=(expectedOutFile,))
return # end of def generateGemPeakCmd(...)
def generateEdgeRCmd(count_files, groups, outdir, outfile, script_obj):
cmd = RSCRIPT_PATH + " " + EDGER_SCRIPT
cmd += " --verbose "
cmd += " --groups=" + enquote(DELIM_FOR_ARGS_TO_EDGER_SCRIPT.join(groups))
cmd += " --countfiles=" + enquote(DELIM_FOR_ARGS_TO_EDGER_SCRIPT.join(count_files))
cmd += " --outfile=" + enquote(outfile)
script_obj.appendMkDir(outdir) # this directory must exist before we try moving any files to it...
script_obj.appendCheckForRequiredFiles(file_tup=(count_files,))
script_obj.appendCommandUnlessFilesExist(check_files=(outfile,), cmd=cmd, required_prereqs=(), required_output=(outfile,))
return
def generateAlignmentCmd(fastq1, fastq2, species, outdir, outbam, aligner, script_obj):
'''Note that this is will only generate ONE alignment command per file, even if it's called multiple times.'''
xAssert(outbam is not None, "Somehow the output bam is NONE!")
fqkey = " | ".join([str(fastq1), str(fastq2), str(species), str(aligner)]) # just a unique key for this pair of reads, species, and aligner
if fqkey in global_file_paths_already_aligned:
verbosePrint("[Skipping...]: Already generated an alignment command for the file/pair " + str(fastq1) + " and " + str(fastq2) + " with species " + str(species) + " and the aligner " + str(aligner))
return # exit early
global_file_paths_already_aligned[fqkey] = True # remember that we saw this file...
isPaired = fastq2 is not None
innerDistStr = " --mate-inner-dist=150 " if isPaired else " "
bowtieIndex = getAnnot(species, "bowtie_index")
all_reads_bam = os.path.join(outdir, "all.bam") # an intermediate file
sortbam = outbam # <-- final output is sorted and aligned reads
if aligner == TOPHAT_PATH:
gtf = getAnnot(species, "gtf") # <-- only actually used by tophat
cmd = TOPHAT_PATH + " -o " + outdir
cmd += " " + "--min-anchor=5 --segment-length=25 --no-coverage-search --segment-mismatches=2 --splice-mismatches=2 --microexon-search "
cmd += " " + "--GTF=" + gtf
cmd += " " + "--num-threads=" + str(TOPHAT_N_THREADS)
cmd += " " + innerDistStr
cmd += " " + bowtieIndex
cmd += " " + fastq1 + " "
cmd += " " + fastq2 if isPaired else "" # might or might not have a second file in the pair!
TOPHAT_HARDCODED_BAM_OUT_NAME = "accepted_hits.bam" # tophat always generates files with this output name, no matter what.
tophat_outbam = os.path.join(outdir, TOPHAT_HARDCODED_BAM_OUT_NAME)
cmd += "\n" + "test -e " + sortbam + " || mv -f " + tophat_outbam + " " + sortbam # move it to the FINAL bam location. Which might be the same! "test" is so that we ONLY MOVE IT if the target does not already exist. Otherwise do not move it. So if the filenames are the same, then don't move it, since that generates an error code.
elif aligner == BOWTIE_PATH:
inputFqString = " ".join(["-1", enquote(fastq1), "-2", enquote(fastq2)]) if isPaired else " ".join(["-U", enquote(fastq1)])
cmd = BOWTIE_PATH
cmd += " " + "--threads=" + str(BOWTIE_N_THREADS)
cmd += " " + "-x " + bowtieIndex
cmd += " " + inputFqString + " "
cmd += " | " + SAMTOOLS_PATH + " view -@ " + str(SAMTOOLS_N_THREADS) + " -b - " + " > " + all_reads_bam
cmd += " && " + SAMTOOLS_PATH + " view -@ " + str(SAMTOOLS_N_THREADS) + " -b -F 0x104 " + all_reads_bam + " " # samtools view -b -F 0x104 in.bam > mapped_primary.bam This is how you get ONLY primary mapped reads
cmd += " | samtools sort - " + " > " + sortbam
cmd += " && " + " /bin/rm " + all_reads_bam # delete the intermediate (unsorted) "all_reads" bam file
else:
raise Exception("Unrecognized aligner! We currently only know about tophat and bowtie2. Note that this is case-sensitive. This is a CODING bug and should not be related to a user-specified file. Your specified aligner was: " + aligner)
pass
script_obj.appendMkDir(outdir)
script_obj.appendCommandUnlessFilesExist(check_files=(sortbam,), cmd=cmd, required_prereqs=(fastq1, fastq2), required_output=(sortbam,))
return
# Must come at the VERY END!
if __name__ == "__main__":
handleCmdLineArgs()
pass