-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnormalize_vcf.py
54 lines (46 loc) · 1.75 KB
/
normalize_vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Copyright 2022 - Barcelona Supercomputing Center
# Author: Rodrigo Martin
# BSC Dual License
'''
Generates a normalized VCF file from a VCF
Expected usage:
$ python normalize.py <vcf_file> <output_vcf_file>
Use --help for more information.
'''
import sys
from argparse import ArgumentParser
import pysam
def _contig_to_int(contig):
contig = contig.lower().replace('chr', '')
if contig.isdigit():
return int(contig)
else:
return 22 + ord(contig[0])
if __name__ == '__main__':
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)) + '/../src/')
from variant_extractor import VariantExtractor
from variant_extractor.variants import VariantType
# Parse arguments
parser = ArgumentParser(description='Generate normalized VCF file from a VCF file')
parser.add_argument('vcf_file', help='VCF file')
parser.add_argument('output_vcf_file', help='Output VCF file')
args = parser.parse_args()
# Extract header from original file
with open(args.vcf_file, 'r') as f:
with pysam.VariantFile(f) as input_file:
input_file.header.add_meta('cmdline', ' '.join(sys.argv))
header_str = str(input_file.header)
# Open output file, write as stream
with open(args.output_vcf_file, 'w') as output_vcf:
# Write header
output_vcf.write(header_str)
print(f'Reading {args.vcf_file}...')
# Open input file, read with variant_extractor
extractor = VariantExtractor(args.vcf_file)
records = list(extractor)
# Sort record by chromosome and position
records.sort(key=lambda x: (_contig_to_int(x.contig), x.pos))
for variant_record in records:
output_vcf.write(str(variant_record)+'\n')