forked from melbournebioinformatics/variant_calling_pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_flagstat_wgs.py
107 lines (88 loc) · 3.54 KB
/
count_flagstat_wgs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
Read all output flagstat files and store values in a text table.
Creates two tables in the output directory: readcounts.txt and readcounts_fractions.txt, which contains useful human-readable percentage statistics.
Each row will be a sample.
Values to store (columns) are:
Total reads from SAMPLE.bam.flagstat
Mapped reads from SAMPLE.bam.flagstat
Mapped reads from SAMPLE.dedup.bam.flagstat
There is an assumption that the bams were mapped with something like bwa, which includes unmapped reads.
Usage: python count_flagstat_wgs.py flagstat_directory output_directory
"""
import sys
import os
import re
import optparse
from collections import defaultdict
class FlagstatParseException (Exception):
pass
def read_flagstat(filename):
"""
Given a filename, parse the flagstat values and return as a hash.
Relying on flagstat contents being in usual order.
"""
numbers = re.compile(r'^(\d+)\s+\+\s+(\d+)\s+')
values = {}
f = open(filename)
for field in ['total',
'duplicates',
'mapped',
'paired',
'read1',
'read2',
'properly_paired',
'both_mapped',
'singletons',
'mate_distant',
'mate_distant_goodqual']:
line = f.readline().strip()
match = numbers.match(line)
if not match:
raise FlagstatParseException
values[field] = int(match.group(1))
values[field+'_QCfailed'] = int(match.group(2))
f.close()
return values
# -----
# Get arguments and input filenames
parser = optparse.OptionParser(usage=__doc__)
#parser.add_option()
(options, args) = parser.parse_args()
if len(args) != 2:
parser.error("Wrong number of arguments - see usage info")
in_dir = args[0]
out_dir = args[1]
if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
sys.exit("There does not seem to be a directory %s , exiting" % out_dir)
filenames = os.listdir(in_dir)
#print ', '.join(filenames)
alignedname = re.compile('^([^_\.]+).bam.flagstat')
dedupname = re.compile('^([^_\.]+).dedup.bam.flagstat')
samples = defaultdict(dict)
for filename in filenames:
if dedupname.match(filename):
match = dedupname.match(filename)
name = match.group(1)
values = read_flagstat( os.path.join(in_dir, filename) )
samples[name]['deduped'] = values['mapped']
elif alignedname.match(filename):
match = alignedname.match(filename)
name = match.group(1)
values = read_flagstat( os.path.join(in_dir, filename) )
samples[name]['mapped'] = values['mapped']
samples[name]['total'] = values['total']
#print ', '.join(samples.keys())
tablefile = os.path.join(out_dir, "readcounts.txt")
tablefile_plus = os.path.join(out_dir, "readcounts_fractions.txt")
OUT_TABLE = open(tablefile, 'w')
OUT_TABLEPLUS = open(tablefile_plus, 'w')
OUT_TABLE.write("Sample\tTotal\tMapped\tDeduped\n")
OUT_TABLEPLUS.write("Sample\tTotal\tMapped\t%\tDeduped\t%\n")
for sample in sorted(samples.keys()):
values = samples[sample]
fraction_mapped = float(values['mapped'])/values['total']
fraction_deduped = float(values['deduped'])/values['mapped']
OUT_TABLE.write( "%s\t%d\t%d\t%d\n" % (sample, values['total'], values['mapped'], values['deduped']) )
OUT_TABLEPLUS.write( "%s\t%d\t%d\t%.3f\t%d\t%.3f\n" % (sample, values['total'], values['mapped'], fraction_mapped, values['deduped'], fraction_deduped) )
OUT_TABLE.close()
OUT_TABLEPLUS.close()