-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathisoquant.py
executable file
·838 lines (722 loc) · 43.1 KB
/
isoquant.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
#!/usr/bin/env python3
#
# ############################################################################
# Copyright (c) 2022-2024 University of Helsinki
# Copyright (c) 2019-2022 Saint Petersburg State University
# # All Rights Reserved
# See file LICENSE for details.
############################################################################
import argparse
import glob
import json
import logging
import os.path
import pickle
import shutil
import sys
import time
from collections import namedtuple
from io import StringIO
from traceback import print_exc
import pysam
import gffutils
import pyfaidx
from src.gtf2db import convert_gtf_to_db
from src.read_mapper import (
DATA_TYPE_ALIASES,
SUPPORTED_STRANDEDNESS,
SUPPORTED_ALIGNERS,
ASSEMBLY,
PACBIO_CCS_DATA,
NANOPORE_DATA,
DataSetReadMapper
)
from src.dataset_processor import DatasetProcessor, PolyAUsageStrategies
from src.graph_based_model_construction import StrandnessReportingLevel
from src.long_read_assigner import AmbiguityResolvingMethod
from src.long_read_counter import COUNTING_STRATEGIES, CountingStrategy, NormalizationMethod, GroupedOutputFormat
from src.input_data_storage import InputDataStorage
from src.multimap_resolver import MultimapResolvingStrategy
from src.stats import combine_counts
logger = logging.getLogger('IsoQuant')
def bool_str(s):
s = s.lower()
if s not in {'false', 'true', '0', '1'}:
raise ValueError('Not a valid boolean string')
return s == 'true' or s == '1'
def parse_args(cmd_args=None, namespace=None):
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
ref_args_group = parser.add_argument_group('Reference data')
input_args_group = parser.add_argument_group('Input data')
output_args_group = parser.add_argument_group('Output naming')
pipeline_args_group = parser.add_argument_group('Pipeline options')
algo_args_group = parser.add_argument_group('Algorithm settings')
other_options = parser.add_argument_group("Additional options:")
show_full_help = '--full_help' in cmd_args
def add_additional_option(*args, **kwargs): # show command only with --full-help
if not show_full_help:
kwargs['help'] = argparse.SUPPRESS
other_options.add_argument(*args, **kwargs)
def add_additional_option_to_group(opt_group, *args, **kwargs): # show command only with --full-help
if not show_full_help:
kwargs['help'] = argparse.SUPPRESS
opt_group.add_argument(*args, **kwargs)
def add_hidden_option(*args, **kwargs): # show command only with --full-help
kwargs['help'] = argparse.SUPPRESS
parser.add_argument(*args, **kwargs)
parser.add_argument("--full_help", action='help', help="show full list of options")
add_hidden_option('--debug', action='store_true', default=False,
help='Debug log output.')
output_args_group.add_argument("--output", "-o", help="output folder, will be created automatically "
"[default=isoquant_output]",
type=str, default="isoquant_output")
output_args_group.add_argument('--prefix', '-p', type=str,
help='experiment name; to be used for folder and file naming; default is OUT',
default="OUT")
output_args_group.add_argument('--labels', '-l', nargs='+', type=str,
help='sample/replica labels to be used as column names; input file names are used '
'if not set; must be equal to the number of input files given via --fastq/--bam')
# REFERENCE
ref_args_group.add_argument("--reference", "-r", help="reference genome in FASTA format (can be gzipped)",
type=str)
ref_args_group.add_argument("--genedb", "-g", help="gene database in gffutils DB format or GTF/GFF "
"format (optional)", type=str)
ref_args_group.add_argument('--complete_genedb', action='store_true', default=False,
help="use this flag if gene annotation contains transcript and gene metafeatures, "
"e.g. with official annotations, such as GENCODE; "
"speeds up gene database conversion")
add_additional_option_to_group(ref_args_group, "--index", help="genome index for specified aligner (optional)",
type=str)
# INPUT READS
input_args = input_args_group.add_mutually_exclusive_group()
input_args.add_argument('--bam', nargs='+', type=str,
help='sorted and indexed BAM file(s), each file will be treated as a separate sample')
input_args.add_argument('--fastq', nargs='+', type=str,
help='input FASTQ file(s), each file will be treated as a separate sample; '
'reference genome should be provided when using reads as input')
add_additional_option_to_group(input_args,'--bam_list', type=str,
help='text file with list of BAM files, one file per line, '
'leave empty line between samples')
add_additional_option_to_group(input_args,'--fastq_list', type=str,
help='text file with list of FASTQ files, one file per line, '
'leave empty line between samples')
input_args.add_argument('--yaml', type=str, help='yaml file containing all input files, one entry per sample'
', check readme for format info')
input_args_group.add_argument('--illumina_bam', nargs='+', type=str,
help='sorted and indexed file(s) with Illumina reads from the same sample')
input_args_group.add_argument("--read_group", help="a way to group feature counts (no grouping by default): "
"by BAM file tag (tag:TAG); "
"using additional file (file:FILE:READ_COL:GROUP_COL:DELIM); "
"using read id (read_id:DELIM); "
"by original file name (file_name)", type=str)
# INPUT PROPERTIES
input_args_group.add_argument("--data_type", "-d", type=str, choices=DATA_TYPE_ALIASES.keys(),
help="type of data to process, supported types are: " + ", ".join(DATA_TYPE_ALIASES.keys()))
input_args_group.add_argument('--stranded', type=str, help="reads strandness type, supported values are: " +
", ".join(SUPPORTED_STRANDEDNESS), default="none")
input_args_group.add_argument('--fl_data', action='store_true', default=False,
help="reads represent FL transcripts; both ends of the read are considered to be reliable")
# ALGORITHM
add_additional_option_to_group(algo_args_group, "--report_novel_unspliced", "-u", type=bool_str,
help="report novel monoexonic transcripts (true/false), "
"default: false for ONT, true for other data types")
add_additional_option_to_group(algo_args_group, "--report_canonical", type=str,
choices=[e.name for e in StrandnessReportingLevel],
help="reporting level for novel transcripts based on canonical splice sites;"
" default: " + StrandnessReportingLevel.auto.name,
default=StrandnessReportingLevel.only_stranded.name)
add_additional_option_to_group(algo_args_group, "--polya_requirement", type=str,
choices=[e.name for e in PolyAUsageStrategies],
help="require polyA tails to be present when reporting transcripts; "
"default: auto (requires polyA only when polyA percentage is >= 70%%)",
default=PolyAUsageStrategies.auto.name)
add_additional_option_to_group(algo_args_group, "--transcript_quantification", choices=COUNTING_STRATEGIES,
help="transcript quantification strategy", type=str,
default=CountingStrategy.unique_only.name)
add_additional_option_to_group(algo_args_group, "--gene_quantification", choices=COUNTING_STRATEGIES,
help="gene quantification strategy", type=str,
default=CountingStrategy.unique_splicing_consistent.name)
add_additional_option_to_group(algo_args_group, "--matching_strategy",
choices=["exact", "precise", "default", "loose"],
help="read-to-isoform matching strategy from the most strict to least",
type=str, default=None)
add_additional_option_to_group(algo_args_group, "--splice_correction_strategy",
choices=["none", "default_pacbio", "default_ont",
"conservative_ont", "all", "assembly"],
help="read alignment correction strategy to use", type=str, default=None)
add_additional_option_to_group(algo_args_group, "--model_construction_strategy",
choices=["reliable", "default_pacbio", "sensitive_pacbio", "fl_pacbio",
"default_ont", "sensitive_ont", "all", "assembly"],
help="transcript model construction strategy to use", type=str, default=None)
# OUTPUT PROPERTIES
pipeline_args_group.add_argument("--threads", "-t", help="number of threads to use", type=int,
default="16")
pipeline_args_group.add_argument('--check_canonical', action='store_true', default=False,
help="report whether splice junctions are canonical")
pipeline_args_group.add_argument("--sqanti_output", help="produce SQANTI-like TSV output",
action='store_true', default=False)
pipeline_args_group.add_argument("--count_exons", help="perform exon and intron counting",
action='store_true', default=False)
add_additional_option_to_group(pipeline_args_group,"--bam_tags",
help="comma separated list of BAM tags to be imported to read_assignments.tsv",
type=str)
# PIPELINE STEPS
resume_args = pipeline_args_group.add_mutually_exclusive_group()
resume_args.add_argument("--resume", action="store_true", default=False,
help="resume failed run, specify output folder, input options are not allowed")
resume_args.add_argument("--force", action="store_true", default=False,
help="force to overwrite the previous run")
add_additional_option_to_group(pipeline_args_group, '--clean_start', action='store_true', default=False,
help='Do not use previously generated index, feature db or alignments.')
add_additional_option_to_group(pipeline_args_group, "--no_model_construction", action="store_true",
default=False, help="run only read assignment and quantification")
add_additional_option_to_group(pipeline_args_group, "--run_aligner_only", action="store_true", default=False,
help="align reads to reference without running further analysis")
# ADDITIONAL
add_additional_option("--delta", type=int, default=None,
help="delta for inexact splice junction comparison, chosen automatically based on data type")
add_hidden_option("--graph_clustering_distance", type=int, default=None,
help="intron graph clustering distance, "
"splice junctions less that this number of bp apart will not be differentiated")
add_additional_option("--no_gzip", help="do not gzip large output files", dest="gzipped",
action='store_false', default=True)
add_additional_option("--no_gtf_check", help="do not perform GTF checks", dest="gtf_check",
action='store_false', default=True)
add_additional_option("--high_memory", help="increase RAM consumption (store alignment and the genome in RAM)",
action='store_true', default=False)
add_additional_option("--no_junc_bed", action="store_true", default=False,
help="do NOT use annotation for read mapping")
add_additional_option("--junc_bed_file", type=str,
help="annotation in BED format produced by minimap's paftools.js gff2bed "
"(will be created automatically if not given)")
add_additional_option("--no_secondary", help="ignore secondary alignments (not recommended)", action='store_true',
default=False)
add_additional_option("--min_mapq", help="ignore alignments with MAPQ < this"
"(also filters out secondary alignments, default: None)", type=int)
add_additional_option("--inconsistent_mapq_cutoff", help="ignore inconsistent alignments with MAPQ < this "
"(works only with the reference annotation, default=5)",
type=int, default=5)
add_additional_option("--simple_alignments_mapq_cutoff", help="ignore alignments with 1 or 2 exons and "
"MAPQ < this (works only in annotation-free mode, "
"default=1)", type=int, default=1)
add_additional_option("--normalization_method", type=str, choices=[e.name for e in NormalizationMethod],
help="TPM normalization method: simple - conventional normalization using all counted reads;"
"usable_reads - includes all assigned reads.",
default=NormalizationMethod.simple.name)
add_additional_option("--counts_format", type=str, choices=[e.name for e in GroupedOutputFormat],
help="output format for grouped counts",
default=GroupedOutputFormat.both.name)
add_additional_option_to_group(pipeline_args_group, "--keep_tmp", help="do not remove temporary files "
"in the end", action='store_true',
default=False)
add_additional_option_to_group(input_args_group, "--read_assignments", nargs='+', type=str,
help="reuse read assignments (binary format)", default=None)
add_hidden_option("--aligner", help="force to use this alignment method, can be " + ", ".join(SUPPORTED_ALIGNERS)
+ "; chosen based on data type if not set", type=str)
add_additional_option_to_group(output_args_group, "--genedb_output", help="output folder for converted gene "
"database, will be created automatically "
" (same as output by default)", type=str)
add_hidden_option("--cage", help="bed file with CAGE peaks", type=str, default=None)
add_hidden_option("--cage-shift", type=int, default=50, help="interval before read start to look for CAGE peak")
parser.add_argument("--test", action=TestMode, nargs=0, help="run IsoQuant on toy dataset")
isoquant_version = "3.4.0"
try:
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "VERSION")) as version_f:
isoquant_version = version_f.readline().strip()
except FileNotFoundError:
pass
parser.add_argument('--version', '-v', action='version', version='IsoQuant ' + isoquant_version)
args = parser.parse_args(cmd_args, namespace)
if args.resume:
resume_parser = argparse.ArgumentParser(add_help=False)
resume_parser.add_argument("--resume", action="store_true", default=False,
help="resume failed run, specify only output folder, "
"input options are not allowed")
resume_parser.add_argument("--output", "-o",
help="output folder, will be created automatically [default=isoquant_output]",
type=str, required=True)
resume_parser.add_argument('--debug', action='store_true', default=argparse.SUPPRESS,
help='Debug log output.')
resume_parser.add_argument("--threads", "-t", help="number of threads to use",
type=int, default=argparse.SUPPRESS)
resume_parser.add_argument("--high_memory",
help="increase RAM consumption (store alignment and the genome in RAM)",
action='store_true', default=False)
resume_parser.add_argument("--keep_tmp", help="do not remove temporary files in the end",
action='store_true', default=argparse.SUPPRESS)
args, unknown_args = resume_parser.parse_known_args(cmd_args)
if unknown_args:
logger.error("You cannot specify options other than --output/--threads/--debug/--high_memory "
"with --resume option")
parser.print_usage()
exit(-2)
args._cmd_line = " ".join(sys.argv)
args._version = isoquant_version
args.output_exists = os.path.exists(args.output)
if not args.output_exists:
os.makedirs(args.output)
return args, parser
def check_and_load_args(args, parser):
args.param_file = os.path.join(args.output, ".params")
if args.resume:
if not os.path.exists(args.output) or not os.path.exists(args.param_file):
# logger is not defined yet
logger.error("Previous run config was not detected, cannot resume. "
"Check that output folder is correctly specified.")
exit(-3)
args = load_previous_run(args)
elif args.output_exists:
if os.path.exists(args.param_file):
if args.force:
logger.warning("Output folder already contains a previous run, will be overwritten.")
else:
logger.warning("Output folder already contains a previous run, some files may be overwritten. "
"Use --resume to resume a failed run. Use --force to avoid this message.")
logger.warning("Press Ctrl+C to interrupt the run now.")
delay = 9
for i in range(delay):
countdown = delay - i
sys.stdout.write("Resuming the run in %d second%s\r" % (countdown, "s" if countdown > 1 else ""))
time.sleep(1)
logger.info("Overwriting the previous run")
time.sleep(1)
else:
logger.warning("Output folder already exists, some files may be overwritten.")
if args.genedb_output is None:
args.genedb_output = args.output
elif not os.path.exists(args.genedb_output):
os.makedirs(args.genedb_output)
if not args.genedb:
args.genedb_filename = None
elif args.genedb.lower().endswith("db"):
args.genedb_filename = args.genedb
else:
args.genedb_filename = os.path.join(args.output, os.path.splitext(os.path.basename(args.genedb))[0] + ".db")
if not check_input_params(args):
parser.print_usage()
exit(-1)
save_params(args)
return args
def load_previous_run(args):
logger.info("Loading parameters of the previous run, all arguments will be ignored")
unpickler = pickle.Unpickler(open(args.param_file, "rb"), fix_imports=False)
loaded_args = unpickler.load()
for option in args.__dict__:
loaded_args.__dict__[option] = args.__dict__[option]
if loaded_args.debug:
logger.setLevel(logging.DEBUG)
logger.handlers[0].setLevel(logging.DEBUG)
return loaded_args
def save_params(args):
for file_opt in ["genedb", "reference", "index", "bam", "fastq", "bam_list", "fastq_list", "junc_bed_file",
"cage", "genedb_output", "read_assignments"]:
if file_opt in args.__dict__ and args.__dict__[file_opt]:
if isinstance(args.__dict__[file_opt], list):
args.__dict__[file_opt] = list(map(os.path.abspath, args.__dict__[file_opt]))
else:
args.__dict__[file_opt] = os.path.abspath(args.__dict__[file_opt])
if "read_group" in args.__dict__ and args.__dict__["read_group"]:
vals = args.read_group.split(":")
if len(vals) > 1 and vals[0] == 'file':
vals[1] = os.path.abspath(vals[1])
args.read_group = ":".join(vals)
pickler = pickle.Pickler(open(args.param_file, "wb"), -1)
pickler.dump(args)
pass
# Check user's params
def check_input_params(args):
if not args.reference:
logger.error("Reference genome was not provided")
return False
if not args.data_type:
logger.error("Data type is not provided, choose one of " + " ".join(DATA_TYPE_ALIASES.keys()))
return False
elif args.data_type not in DATA_TYPE_ALIASES.keys():
logger.error("Unsupported data type " + args.data_type + ", choose one of: " + " ".join(DATA_TYPE_ALIASES.keys()))
return False
args.data_type = DATA_TYPE_ALIASES[args.data_type]
if not args.fastq and not args.fastq_list and not args.bam and not args.bam_list and not args.read_assignments and not args.yaml:
logger.error("No input data was provided")
return False
if args.yaml and args.illumina_bam:
logger.error("When providing a yaml file it should include all input files, including the illumina bam file.")
return False
if args.illumina_bam and (args.fastq_list or args.bam_list):
logger.error("Unsupported combination of list of input files and Illumina bam file."
"To combine multiple experiments with short read correction please use yaml input.")
return False
args.input_data = InputDataStorage(args)
if args.aligner is not None and args.aligner not in SUPPORTED_ALIGNERS:
logger.error(" Unsupported aligner " + args.aligner + ", choose one of: " + " ".join(SUPPORTED_ALIGNERS))
return False
if args.run_aligner_only and args.input_data.input_type == "bam":
logger.error("Do not use BAM files with --run_aligner_only option.")
return False
if args.stranded not in SUPPORTED_STRANDEDNESS:
logger.error("Unsupported strandness " + args.stranded + ", choose one of: " + " ".join(SUPPORTED_STRANDEDNESS))
return False
if not args.genedb:
if args.count_exons:
logger.warning("--count_exons option has no effect without gene annotation")
if args.sqanti_output:
args.sqanti_output = False
logger.warning("--sqanti_output option has no effect without gene annotation")
if args.no_model_construction:
logger.warning("Setting --no_model_construction without providing a gene "
"annotation will not produce any meaningful results")
if args.no_model_construction and args.sqanti_output:
args.sqanti_output = False
logger.warning("--sqanti_output option has no effect without model construction")
check_input_files(args)
return True
def check_input_files(args):
for sample in args.input_data.samples:
for lib in sample.file_list:
for in_file in lib:
if args.input_data.input_type == "save":
saves = glob.glob(in_file + "*")
if not saves:
logger.critical("Input files " + in_file + "* do not exist")
continue
if not os.path.isfile(in_file):
logger.critical("Input file " + in_file + " does not exist")
exit(-1)
if args.input_data.input_type == "bam":
bamfile_in = pysam.AlignmentFile(in_file, "rb")
if not bamfile_in.has_index():
logger.critical("BAM file " + in_file + " is not indexed, run samtools sort and samtools index")
exit(-1)
bamfile_in.close()
if sample.illumina_bam is not None:
for illumina in sample.illumina_bam:
bamfile_in = pysam.AlignmentFile(illumina, "rb")
if not bamfile_in.has_index():
logger.critical("BAM file " + illumina + " is not indexed, run samtools sort and samtools index")
exit(-1)
bamfile_in.close()
if args.cage is not None:
logger.critical("CAGE data is not supported yet")
exit(-1)
if not os.path.isfile(args.cage):
logger.critical("Bed file with CAGE peaks " + args.cage + " does not exist")
exit(-1)
if args.genedb is not None:
if not os.path.isfile(args.genedb):
logger.critical("Gene database " + args.genedb + " does not exist")
exit(-1)
else:
args.no_junc_bed = True
if args.read_assignments is not None:
for r in args.read_assignments:
if not glob.glob(r + "*"):
logger.critical("No files found with prefix " + str(r))
exit(-1)
def create_output_dirs(args):
for sample in args.input_data.samples:
sample_dir = sample.out_dir
if os.path.exists(sample_dir):
if not args.resume:
logger.warning(sample_dir + " folder already exists, some files may be overwritten")
else:
os.makedirs(sample_dir)
sample_aux_dir = sample.aux_dir
if os.path.exists(sample_aux_dir):
if not args.resume:
logger.warning(sample_aux_dir + " folder already exists, some files may be overwritten")
else:
os.makedirs(sample_aux_dir)
def set_logger(args, logger_instance):
if "debug" not in args.__dict__ or not args.debug:
output_level = logging.INFO
else:
output_level = logging.DEBUG
logger_instance.setLevel(output_level)
log_file = os.path.join(args.output, "isoquant.log")
if os.path.exists(log_file):
old_log_file = os.path.join(args.output, "isoquant.log.old")
with open(old_log_file, "a") as olf:
olf.write("\n")
shutil.copyfileobj(open(log_file, "r"), olf)
f = open(log_file, "w")
f.write("Command line: " + args._cmd_line + '\n')
f.close()
fh = logging.FileHandler(log_file)
fh.set_name("isoquant_file_log")
fh.setLevel(output_level)
ch = logging.StreamHandler(sys.stdout)
ch.set_name("isoquant_screen_log")
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
if all(fh.get_name() != h.get_name() for h in logger_instance.handlers):
logger_instance.addHandler(fh)
if all(ch.get_name() != h.get_name() for h in logger_instance.handlers):
logger_instance.addHandler(ch)
logger.info("Running IsoQuant version " + args._version)
def set_data_dependent_options(args):
matching_strategies = {ASSEMBLY: "precise", PACBIO_CCS_DATA: "precise", NANOPORE_DATA: "default"}
if args.matching_strategy is None:
args.matching_strategy = matching_strategies[args.data_type]
model_construction_strategies = {ASSEMBLY: "assembly", PACBIO_CCS_DATA: "default_pacbio", NANOPORE_DATA: "default_ont"}
if args.model_construction_strategy is None:
args.model_construction_strategy = model_construction_strategies[args.data_type]
if args.fl_data and args.model_construction_strategy == "default_pacbio":
args.model_construction_strategy = "fl_pacbio"
splice_correction_strategies = {ASSEMBLY: "assembly", PACBIO_CCS_DATA: "default_pacbio", NANOPORE_DATA: "default_ont"}
if args.splice_correction_strategy is None:
args.splice_correction_strategy = splice_correction_strategies[args.data_type]
args.resolve_ambiguous = 'monoexon_and_fsm' if args.fl_data else 'default'
args.requires_polya_for_construction = False
if args.read_group is None and args.input_data.has_replicas():
args.read_group = "file_name"
args.use_technical_replicas = args.read_group == "file_name"
def set_matching_options(args):
MatchingStrategy = namedtuple('MatchingStrategy',
('delta', 'max_intron_shift', 'max_missed_exon_len', 'max_fake_terminal_exon_len',
'max_suspicious_intron_abs_len', 'max_suspicious_intron_rel_len',
'resolve_ambiguous', 'correct_minor_errors'))
strategies = {
'exact': MatchingStrategy(0, 0, 0, 0, 0, 0.0, 'monoexon_only', False),
'precise': MatchingStrategy(4, 30, 50, 20, 0, 0.0, 'monoexon_and_fsm', True),
'default': MatchingStrategy(6, 60, 100, 40, 60, 1.0, 'monoexon_and_fsm', True),
'loose': MatchingStrategy(12, 60, 100, 40, 60, 1.0, 'all', True),
}
strategy = strategies[args.matching_strategy]
if args.delta is None:
args.delta = strategy.delta
elif args.delta < 0:
logger.error("--delta can not be negative")
exit(-3)
args.minor_exon_extension = 50
args.major_exon_extension = 300
args.max_intron_shift = strategy.max_intron_shift
args.max_missed_exon_len = strategy.max_missed_exon_len
args.max_fake_terminal_exon_len = strategy.max_fake_terminal_exon_len
# short introns that are actually long deletions, fix minimaps logic
args.max_suspicious_intron_abs_len = strategy.max_suspicious_intron_abs_len
args.max_suspicious_intron_rel_len = strategy.max_suspicious_intron_rel_len
args.min_abs_exon_overlap = 10
args.min_rel_exon_overlap = 0.2
args.micro_intron_length = 50
args.max_intron_abs_diff = min(30, args.max_intron_shift)
args.max_intron_rel_diff = 0.2
args.apa_delta = args.minor_exon_extension
args.minimal_exon_overlap = 5
args.minimal_intron_absence_overlap = 20
args.polya_window = 16
args.polya_fraction = 0.75
if args.resolve_ambiguous == 'default':
args.resolve_ambiguous = strategy.resolve_ambiguous
if args.resolve_ambiguous not in AmbiguityResolvingMethod.__dict__:
logger.error("Incorrect resolving ambiguity method: " + args.resolve_ambiguous + ", default will be used")
args.resolve_ambiguous = strategy.resolve_ambiguous
args.resolve_ambiguous = AmbiguityResolvingMethod[args.resolve_ambiguous]
args.correct_minor_errors = strategy.correct_minor_errors
updated_strategy = MatchingStrategy(args.delta, args.max_intron_shift, args.max_missed_exon_len,
args.max_fake_terminal_exon_len,
args.max_suspicious_intron_abs_len, args.max_suspicious_intron_rel_len,
args.resolve_ambiguous, args.correct_minor_errors)
logger.debug('Using %s strategy. Updated strategy: %s.' % (args.matching_strategy, updated_strategy))
def set_splice_correction_options(args):
SplicSiteCorrectionStrategy = namedtuple('SplicSiteCorrectionStrategy',
('fuzzy_junctions', 'intron_shifts', 'skipped_exons',
'terminal_exons', 'fake_terminal_exons', 'microintron_retention'))
strategies = {
'none': SplicSiteCorrectionStrategy(False, False, False, False, False, False),
'default_pacbio': SplicSiteCorrectionStrategy(True, False, True, False, False, True),
'conservative_ont': SplicSiteCorrectionStrategy(True, False, True, False, False, False),
'default_ont': SplicSiteCorrectionStrategy(True, False, True, False, True, True),
'all': SplicSiteCorrectionStrategy(True, True, True, True, True, True),
'assembly': SplicSiteCorrectionStrategy(False, False, True, False, False, False)
}
strategy = strategies[args.splice_correction_strategy]
args.correct_fuzzy_junctions = strategy.fuzzy_junctions
args.correct_intron_shifts = strategy.intron_shifts
args.correct_skipped_exons = strategy.skipped_exons
args.correct_terminal_exons = strategy.terminal_exons
args.correct_fake_terminal_exons = strategy.fake_terminal_exons
args.correct_microintron_retention = strategy.microintron_retention
def set_model_construction_options(args):
ModelConstructionStrategy = namedtuple('ModelConstructionStrategy',
('min_novel_intron_count',
'graph_clustering_ratio', 'graph_clustering_distance',
'min_novel_isolated_intron_abs', 'min_novel_isolated_intron_rel',
'terminal_position_abs', 'terminal_position_rel',
'terminal_internal_position_rel',
'min_known_count', 'min_nonfl_count',
'min_novel_count', 'min_novel_count_rel',
'min_mono_count_rel', 'singleton_adjacent_cov',
'fl_only', 'novel_monoexonic',
'require_monointronic_polya', 'require_monoexonic_polya',
'report_canonical'))
strategies = {
'reliable': ModelConstructionStrategy(2, 0.5, 20, 5, 0.05, 1, 0.1, 0.1, 2, 4, 8, 0.05, 0.05, 50,
True, False, True, True, StrandnessReportingLevel.only_canonical),
'default_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.05, 1, 2, 2, 0.02, 0.005, 100,
False, True, False, True, StrandnessReportingLevel.only_canonical),
'sensitive_pacbio':ModelConstructionStrategy(1, 0.5, 5, 2, 0.005, 1, 0.01, 0.02, 1, 2, 2, 0.005, 0.001, 100,
False, True, False, False, StrandnessReportingLevel.only_stranded),
'default_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.02, 1, 0.05, 0.05, 1, 3, 3, 0.02, 0.02, 10,
False, False, True, True, StrandnessReportingLevel.only_canonical),
'sensitive_ont': ModelConstructionStrategy(1, 0.5, 20, 3, 0.005, 1, 0.01, 0.02, 1, 2, 3, 0.005, 0.005, 10,
False, True, False, False, StrandnessReportingLevel.only_stranded),
'fl_pacbio': ModelConstructionStrategy(1, 0.5, 10, 2, 0.02, 1, 0.05, 0.01, 1, 2, 3, 0.02, 0.005, 100,
True, True, False, False, StrandnessReportingLevel.only_canonical),
'all': ModelConstructionStrategy(0, 0.3, 5, 1, 0.002, 1, 0.01, 0.01, 1, 1, 1, 0.002, 0.001, 500,
False, True, False, False, StrandnessReportingLevel.all),
'assembly': ModelConstructionStrategy(0, 0.3, 5, 1, 0.05, 1, 0.01, 0.02, 1, 1, 1, 0.05, 0.01, 50,
False, True, False, False, StrandnessReportingLevel.only_stranded)
}
strategy = strategies[args.model_construction_strategy]
args.min_novel_intron_count = strategy.min_novel_intron_count
args.graph_clustering_ratio = strategy.graph_clustering_ratio
if args.graph_clustering_distance is None:
args.graph_clustering_distance = strategy.graph_clustering_distance
elif args.graph_clustering_distance < 0:
logger.error("--graph_clustering_distance can not be negative")
exit(-3)
args.min_novel_isolated_intron_abs = strategy.min_novel_isolated_intron_abs
args.min_novel_isolated_intron_rel = strategy.min_novel_isolated_intron_rel
args.terminal_position_abs = strategy.terminal_position_abs
args.terminal_position_rel = strategy.terminal_position_rel
args.terminal_internal_position_rel = strategy.terminal_internal_position_rel
args.min_known_count = strategy.min_known_count
args.min_nonfl_count = strategy.min_nonfl_count
args.min_novel_count = strategy.min_novel_count
args.min_mono_count_rel = strategy.min_mono_count_rel
args.min_novel_count_rel = strategy.min_novel_count_rel
args.singleton_adjacent_cov = strategy.singleton_adjacent_cov
args.fl_only = strategy.fl_only
args.min_mono_exon_coverage = 0.75
if args.report_novel_unspliced is None:
args.report_novel_unspliced = strategy.novel_monoexonic
if not args.report_novel_unspliced and not args.no_model_construction:
logger.info("Novel unspliced transcripts will not be reported, "
"set --report_novel_unspliced true to discover them")
args.require_monointronic_polya = strategy.require_monointronic_polya
args.require_monoexonic_polya = strategy.require_monoexonic_polya
args.polya_requirement_strategy = PolyAUsageStrategies[args.polya_requirement]
args.report_canonical_strategy = StrandnessReportingLevel[args.report_canonical]
if args.report_canonical_strategy == StrandnessReportingLevel.auto:
args.report_canonical_strategy = strategy.report_canonical
def set_configs_directory(args):
config_dir = os.path.join(os.environ['HOME'], '.config', 'IsoQuant')
os.makedirs(config_dir, exist_ok=True)
args.db_config_path = os.path.join(config_dir, 'db_config.json')
args.index_config_path = os.path.join(config_dir, 'index_config.json')
args.bed_config_path = os.path.join(config_dir, 'bed_config.json')
args.alignment_config_path = os.path.join(config_dir, 'alignment_config.json')
for config_path in (args.db_config_path, args.index_config_path, args.bed_config_path, args.alignment_config_path):
if not os.path.exists(config_path):
with open(config_path, 'w') as f_out:
json.dump({}, f_out)
def set_additional_params(args):
set_configs_directory(args)
set_data_dependent_options(args)
set_matching_options(args)
set_model_construction_options(args)
set_splice_correction_options(args)
args.print_additional_info = True
args.indel_near_splice_site_dist = 10
args.upstream_region_len = 20
args.multimap_strategy = "take_best"
multimap_strategies = {}
for e in MultimapResolvingStrategy:
multimap_strategies[e.name] = e.value
args.multimap_strategy = MultimapResolvingStrategy(multimap_strategies[args.multimap_strategy])
args.needs_reference = True
if args.needs_reference and not args.reference:
logger.warning("Reference genome is not provided! This may affect quality of the results!")
args.needs_reference = False
args.simple_models_mapq_cutoff = 30
args.polya_percentage_threshold = 0.7
args.low_polya_percentage_threshold = 0.1
if args.bam_tags:
args.bam_tags = args.bam_tags.split(",")
else:
args.bam_tags = []
def run_pipeline(args):
logger.info(" === IsoQuant pipeline started === ")
logger.info("gffutils version: %s" % gffutils.__version__)
logger.info("pysam version: %s" % pysam.__version__)
logger.info("pyfaidx version: %s" % pyfaidx.__version__)
# convert GTF/GFF if needed
if args.genedb and not args.genedb.lower().endswith('db'):
args.genedb = convert_gtf_to_db(args)
# map reads if fastqs are provided
if args.input_data.input_type == "fastq":
# substitute input reads with bams
dataset_mapper = DataSetReadMapper(args)
args.index = dataset_mapper.index_fname
args.input_data = dataset_mapper.map_reads(args)
if args.run_aligner_only:
logger.info("Isoform assignment step is skipped because --run-aligner-only option was used")
else:
# run isoform assignment
dataset_processor = DatasetProcessor(args)
dataset_processor.process_all_samples(args.input_data)
# aggregate counts for all samples
if len(args.input_data.samples) > 1 and args.genedb:
combine_counts(args.input_data, args.output)
logger.info(" === IsoQuant pipeline finished === ")
# Test mode is triggered by --test option
class TestMode(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
out_dir = 'isoquant_test'
if os.path.exists(out_dir):
shutil.rmtree(out_dir)
source_dir = os.path.dirname(os.path.realpath(__file__))
options = ['--output', out_dir, '--threads', '2',
'--fastq', os.path.join(source_dir, 'tests/simple_data/chr9.4M.ont.sim.fq.gz'),
'--reference', os.path.join(source_dir, 'tests/simple_data/chr9.4M.fa.gz'),
'--genedb', os.path.join(source_dir, 'tests/simple_data/chr9.4M.gtf.gz'),
'--clean_start', '--data_type', 'nanopore', '--complete_genedb', '--force', '-p', 'TEST_DATA']
print('=== Running in test mode === ')
print('Any other option is ignored ')
main(options)
if self._check_log():
logger.info(' === TEST PASSED CORRECTLY === ')
else:
logger.error(' === TEST FAILED ===')
exit(-1)
parser.exit()
@staticmethod
def _check_log():
with open('isoquant_test/isoquant.log', 'r') as f:
log = f.read()
correct_results = ['total assignments 4', 'polyA tail detected in 2', 'unique: 1', 'known: 2', 'Processed 1 experiment']
return all([result in log for result in correct_results])
def main(cmd_args):
args, parser = parse_args(cmd_args)
if not cmd_args:
parser.print_usage()
exit(0)
set_logger(args, logger)
args = check_and_load_args(args, parser)
create_output_dirs(args)
set_additional_params(args)
run_pipeline(args)
if __name__ == "__main__":
# stuff only to run when not called via 'import' here
try:
main(sys.argv[1:])
except SystemExit:
raise
except KeyboardInterrupt:
raise
except:
if logger.handlers:
strout = StringIO()
print_exc(file=strout)
s = strout.getvalue()
if s:
logger.critical("IsoQuant failed with the following error, please, submit this issue to "
"https://github.com/ablab/IsoQuant/issues" + s)
else:
print_exc()
else:
sys.stderr.write("IsoQuant failed with the following error, please, submit this issue to "
"https://github.com/ablab/IsoQuant/issues")
print_exc()
sys.exit(-1)