-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathannotator.py
145 lines (135 loc) · 5.86 KB
/
annotator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import csv
import pickle
import argparse
## Define imported arguments
input_parser = argparse.ArgumentParser(
description="Annotate a 1-based, 5-column annotated variant file (MGI-annotation style)",
)
input_parser.add_argument(
'gnomad_type',
choices=['exomes', 'genomes'],
help="gnomAD data type to use. Whole genome or exome.",
)
input_parser.add_argument(
'input_file',
help="5 column, 1-based, tab-separated input file with header to be annotated with gnomAD allele frequencies.",
)
input_parser.add_argument(
'output_file_prefix',
help="Final, gnomAD annotated output file location and name without file extension.",
)
input_parser.add_argument(
'--build',
choices=['GRCH37', 'GRCH38'],
default='GRCH37',
help="Genome build to be used. Default = GRCH37"
)
input_parser.add_argument(
'--gnomad_version',
choices=['2.0.1'],
default='2.0.1',
help="Version of gnomAD to be used. Default = 2.0.1"
)
input_parser.add_argument(
'--cutoff',
type=float,
help="Allele frequency cutoff to use to filter the file. Unless cutoff is defined, all variants are printed to one file."
)
input_parser.add_argument(
'--add_allele_count',
action='store_true',
help="Provide allele count and total allele number as well as allele frequency."
)
args = input_parser.parse_args()
# Choose the correct gnomAD version
vcf_key = "_".join([args.build, args.gnomad_version, args.gnomad_type])
# Hash of versions and their VCF locations
vcf_loc = {
'GRCH38_2.0.1_exomes': '/gscmnt/gc2602/griffithlab/kcotto/GRCH38_2.0.1_exomes.trie.pickle',
'GRCH38_2.0.1_genomes': '/gscmnt/gc2602/griffithlab/kcotto/GRCH38_2.0.1_genomes.trie.pickle',
'GRCH37_2.0.1_exomes': '/gscmnt/gc2602/griffithlab/kcotto/GRCH37_2.0.1_exomes.trie.pickle',
'GRCH37_2.0.1_genomes': '/gscmnt/gc2602/griffithlab/kcotto/GRCH37_2.0.1_genomes.trie.pickle'
}
print("\nUsing gnomAD input: ", vcf_loc[vcf_key])
if args.cutoff is not None:
print("Using allele frequency cutoff: ", args.cutoff)
def tsv_header(gnomad_type,add_allele_count):
if add_allele_count:
tsv = ["AC","AN","AF"]
else:
tsv = ["AF"]
headers = []
for t in tsv:
headers.append("gnomAD_{}_{}".format(t,gnomad_type))
return headers
def trie_header(gnomad_type):
trie = ["AF","AC","AN"]
headers = []
for r in trie:
headers.append("gnomAD_{}_{}".format(r,gnomad_type))
return headers
# Read in variant file and print new annotated file
def annotate(**kwargs):
mutation_filename = kwargs.pop('mutation_filename')
file_out = kwargs.pop('file_out')
gnomad_annotations = kwargs.pop('gnomad_annotations')
file_fail_out = kwargs.pop('file_fail_out', None)
with open(mutation_filename, "r") as mgi_tsv, open(file_out, "w") as outfile:
mgi_tsv_reader = csv.DictReader(mgi_tsv, delimiter="\t")
header = mgi_tsv_reader.fieldnames
header_new = header + tsv_header(args.gnomad_type, args.add_allele_count)
mgi_tsv_writer = csv.DictWriter(outfile, fieldnames=header_new, delimiter="\t", extrasaction='ignore', restval="NA")
mgi_tsv_writer.writeheader()
if args.cutoff is not None:
fail_file = open(file_fail_out, "w")
mgi_tsv_fail_writer = csv.DictWriter(fail_file, fieldnames=header_new, delimiter="\t", extrasaction='ignore', restval="NA")
mgi_tsv_fail_writer.writeheader()
fail_counter = 0
pass_counter = 1
not_found_counter = 0
match_counter = 0
line_count = 0
for line in mgi_tsv_reader:
line_count += 1
if line_count % 50000 == 0:
print('Processing line {} from input file'.format(line_count))
mgi_key = "_".join([str(line["chromosome_name"]), str(line["start"]), line["reference"], line["variant"]])
if mgi_key not in gnomad_annotations:
mgi_key = mgi_key.replace("-", "0")
if mgi_key in gnomad_annotations:
gnomad_record = gnomad_annotations[mgi_key]
new_line = line.copy()
for index, elem in enumerate(trie_header(args.gnomad_type)):
new_line[elem] = gnomad_record[0][index]
match_counter += 1
if args.cutoff is None:
mgi_tsv_writer.writerow(new_line)
else:
if float(new_line[gnomad_record[0][0]]) >= args.cutoff:
mgi_tsv_fail_writer.writerow(new_line)
fail_counter += 1
else:
mgi_tsv_writer.writerow(new_line)
pass_counter += 1
else:
new_line = line.copy()
mgi_tsv_writer.writerow(new_line)
not_found_counter += 1
print("\nTotal variants processed: ", line_count)
print("Total variants not found in gnomAD", args.gnomad_type, ": ", not_found_counter)
print("Total variants matched gnomAD", args.gnomad_type, ": ", match_counter)
if args.cutoff is not None:
print("Total variants failed allele frequency cutoff of", args.cutoff, ": ", fail_counter)
print("Total variants passed allele frequency cutoff of", args.cutoff, ": ", pass_counter)
# fail_file.close()
print("\n")
records = pickle.load(open(vcf_loc[vcf_key], 'rb'))
print('Finished reading records')
#Get the correct file suffix based on whether filtering is happening or not
if args.cutoff is None:
file_out = str(args.output_file_prefix) + '.tsv'
annotate(mutation_filename=args.input_file, file_out=file_out, gnomad_annotations=records)
else:
file_out = str(args.output_file_prefix) + 'pass.tsv'
file_fail_out = str(args.output_file_prefix) + 'fail.tsv'
annotate(mutation_filename=args.input_file, file_out=file_out, gnomad_annotations=records, file_fail_out=file_fail_out)