From 17fa9c4ca72351fb6152a0e6cadc56ab57cafc5c Mon Sep 17 00:00:00 2001 From: Cory McLean Date: Wed, 7 Mar 2018 17:38:33 -0500 Subject: [PATCH] Fix gVCF reference base issue and update timing --- deepvariant/BUILD | 1 + deepvariant/core/genomics_io.py | 12 ++- deepvariant/core/genomics_io_test.py | 8 +- deepvariant/core/python/reference_fai.clif | 5 +- deepvariant/postprocess_variants.py | 47 ++++++--- deepvariant/postprocess_variants_test.py | 96 +++++++++++-------- .../golden.postprocess_gvcf_output.g.vcf | 2 +- docs/deepvariant-case-study.md | 4 +- docs/deepvariant-exome-case-study.md | 4 +- 9 files changed, 119 insertions(+), 60 deletions(-) diff --git a/deepvariant/BUILD b/deepvariant/BUILD index 646ee203..c9842c16 100644 --- a/deepvariant/BUILD +++ b/deepvariant/BUILD @@ -419,6 +419,7 @@ py_library( "//deepvariant/core:genomics_math", "//deepvariant/core:io_utils", "//deepvariant/core:proto_utils", + "//deepvariant/core:ranges", "//deepvariant/core:variantutils", "//deepvariant/core/genomics:struct_py_pb2", "//deepvariant/core/genomics:variants_py_pb2", diff --git a/deepvariant/core/genomics_io.py b/deepvariant/core/genomics_io.py index 3bdb9925..49d66b46 100644 --- a/deepvariant/core/genomics_io.py +++ b/deepvariant/core/genomics_io.py @@ -49,11 +49,15 @@ SAM_EXTENSIONS = frozenset(['.sam', '.bam']) -def make_ref_reader(reference_filename): +def make_ref_reader(reference_filename, cache_size=None): """Creates an indexed GenomeReference for reference_filename.""" - return reference_fai.GenomeReferenceFai.from_file( - reference_filename.encode('utf8'), - reference_filename.encode('utf8') + '.fai') + fasta_path = reference_filename.encode('utf8') + fai_path = fasta_path + '.fai' + if cache_size is None: + return reference_fai.GenomeReferenceFai.from_file(fasta_path, fai_path) + else: + return reference_fai.GenomeReferenceFai.from_file(fasta_path, fai_path, + cache_size) def make_sam_reader(reads_source, diff --git a/deepvariant/core/genomics_io_test.py b/deepvariant/core/genomics_io_test.py index 89e4d4cc..81e065b6 100644 --- a/deepvariant/core/genomics_io_test.py +++ b/deepvariant/core/genomics_io_test.py @@ -51,11 +51,17 @@ class RefReaderTests(parameterized.TestCase): @parameterized.parameters('test.fasta', 'test.fasta.gz') - def test_make_ref_reader(self, fasta_filename): + def test_make_ref_reader_default(self, fasta_filename): fasta_path = test_utils.genomics_core_testdata(fasta_filename) with genomics_io.make_ref_reader(fasta_path) as reader: self.assertEqual(reader.bases(ranges.make_range('chrM', 1, 6)), 'ATCAC') + @parameterized.parameters('test.fasta', 'test.fasta.gz') + def test_make_ref_reader_cache_specified(self, fasta_filename): + fasta_path = test_utils.genomics_core_testdata(fasta_filename) + with genomics_io.make_ref_reader(fasta_path, cache_size=10) as reader: + self.assertEqual(reader.bases(ranges.make_range('chrM', 1, 5)), 'ATCA') + class SamReaderTests(parameterized.TestCase): """Test the iteration functionality provided by io.SamReader.""" diff --git a/deepvariant/core/python/reference_fai.clif b/deepvariant/core/python/reference_fai.clif index cc824acb..983918ef 100644 --- a/deepvariant/core/python/reference_fai.clif +++ b/deepvariant/core/python/reference_fai.clif @@ -49,7 +49,10 @@ from "deepvariant/core/reference_fai.h": contigs: list = property(`Contigs`) @classmethod - def `FromFile` as from_file(cls, fasta_path: str, fai_path: str) + def `FromFile` as from_file(cls, + fasta_path: str, + fai_path: str, + cache_size_bases: int = default) -> StatusOr @__enter__ diff --git a/deepvariant/postprocess_variants.py b/deepvariant/postprocess_variants.py index ea8d7f3f..b52ce11c 100644 --- a/deepvariant/postprocess_variants.py +++ b/deepvariant/postprocess_variants.py @@ -33,6 +33,7 @@ from __future__ import print_function import collections +import copy import itertools import tempfile @@ -52,6 +53,7 @@ from deepvariant.core import genomics_math from deepvariant.core import io_utils from deepvariant.core import proto_utils +from deepvariant.core import ranges from deepvariant.core import variantutils from deepvariant.core.protos import core_pb2 from deepvariant.protos import deepvariant_pb2 @@ -116,6 +118,9 @@ # The genotype likelihood of the gVCF alternate allele for variant calls. _GVCF_ALT_ALLELE_GL = -99 +# FASTA cache size. Span 300 Mb so that we query each chromosome at most once. +_FASTA_CACHE_SIZE = 300000000 + def _extract_single_sample_name(record): """Returns the name of the single sample within the CallVariantsOutput file. @@ -713,12 +718,29 @@ def lessthanfn(variant1, variant2): return lessthanfn -def _create_record_from_template(template, start, end): - """Returns a copy of the template variant with the new start and end.""" - retval = variants_pb2.Variant() - retval.CopyFrom(template) +def _create_record_from_template(template, start, end, fasta_reader): + """Returns a copy of the template variant with the new start and end. + + Updates to the start position cause a different reference base to be set. + + Args: + template: third_party.nucleus.protos.Variant. The template variant whose + non-location and reference base information to use. + start: int. The desired new start location. + end: int. The desired new end location. + fasta_reader: GenomeReferenceFai object. The reader used to determine the + correct start base to use for the updated variant. + + Returns: + An updated third_party.nucleus.protos.Variant with the proper start, end, + and reference base set and all other fields inherited from the template. + """ + retval = copy.deepcopy(template) retval.start = start retval.end = end + if start != template.start: + retval.reference_bases = fasta_reader.bases( + ranges.make_range(retval.reference_name, start, start + 1)) return retval @@ -751,7 +773,7 @@ def _transform_to_gvcf_record(variant): def merge_variants_and_nonvariants(variant_iterable, nonvariant_iterable, - lessthan): + lessthan, fasta_reader): """Yields records consisting of the merging of variant and non-variant sites. The merging strategy used for single-sample records is to emit variants @@ -768,6 +790,8 @@ def merge_variants_and_nonvariants(variant_iterable, nonvariant_iterable, lessthan: Callable. A function that takes two Variant protos as input and returns True iff the first argument is located "before" the second and the variants do not overlap. + fasta_reader: GenomeReferenceFai object. The reference genome reader used to + ensure gVCF records have the correct reference base. Yields: Variant protos representing both variant and non-variant sites in the sorted @@ -799,12 +823,12 @@ def next_or_none(iterable): if nonvariant.start < variant.start: # Emit a non-variant region up to the start of the variant. yield _create_record_from_template(nonvariant, nonvariant.start, - variant.start) + variant.start, fasta_reader) if nonvariant.end > variant.end: # There is an overhang of the non-variant site after the variant is # finished, so update the non-variant to point to that. nonvariant = _create_record_from_template(nonvariant, variant.end, - nonvariant.end) + nonvariant.end, fasta_reader) else: # This non-variant site is subsumed by a Variant. Ignore it. nonvariant = next_or_none(nonvariant_iterable) @@ -828,8 +852,9 @@ def main(argv=()): logging_level.set_from_flag() - with genomics_io.make_ref_reader(FLAGS.ref) as reader: - contigs = reader.contigs + fasta_reader = genomics_io.make_ref_reader(FLAGS.ref, + cache_size=_FASTA_CACHE_SIZE) + contigs = fasta_reader.contigs paths = io_utils.maybe_generate_sharded_filenames(FLAGS.infile) with tempfile.NamedTemporaryFile() as temp: postprocess_variants_lib.process_single_sites_tfrecords( @@ -865,12 +890,12 @@ def main(argv=()): with genomics_io.make_vcf_reader( FLAGS.outfile, use_index=False, include_likelihoods=True) as variant_reader: - lessthanfn = _get_contig_based_lessthan(variant_reader.contigs) + lessthanfn = _get_contig_based_lessthan(contigs) gvcf_variants = ( _transform_to_gvcf_record(variant) for variant in variant_reader.iterate()) merged_variants = merge_variants_and_nonvariants( - gvcf_variants, nonvariant_generator, lessthanfn) + gvcf_variants, nonvariant_generator, lessthanfn, fasta_reader) write_variants_to_vcf( contigs=contigs, variant_generator=merged_variants, diff --git a/deepvariant/postprocess_variants_test.py b/deepvariant/postprocess_variants_test.py index 6c64e86a..9fbf9b1d 100644 --- a/deepvariant/postprocess_variants_test.py +++ b/deepvariant/postprocess_variants_test.py @@ -68,6 +68,21 @@ ] +class DummyReferenceReader(object): + + def __init__(self): + self._bases = { + '1': 'AACCGGTTACGTTCGATTTTAAAACCCCGGGG', + '2': 'GCAGTGACGTAGCGATGACGTAGACGCTTACG' + } + + def bases(self, region): + chrom = region.reference_name + if chrom not in self._bases or region.end > len(self._bases[chrom]): + raise ValueError('Invalid region for dummy reader: {}'.format(region)) + return self._bases[chrom][region.start:region.end] + + def setUpModule(): test_utils.init() @@ -145,13 +160,14 @@ def _simple_variant(ref_name, start, ref_base): alleles=[ref_base, 'A' if ref_base != 'A' else 'C']) -def _create_nonvariant(ref_name, start, end): +def _create_nonvariant(ref_name, start, end, ref_base): """Creates a non-variant Variant record for testing. Args: ref_name: str. Reference name for this variant. start: int. start position on the contig [0-based, half open). end: int. end position on the contig [0-based, half open). + ref_base: str. reference base at the start position. Returns: A non-variant Variant record created with the specified arguments. @@ -160,7 +176,7 @@ def _create_nonvariant(ref_name, start, end): chrom=ref_name, start=start, end=end, - alleles=['A', variantutils.GVCF_ALT_ALLELE]) + alleles=[ref_base, variantutils.GVCF_ALT_ALLELE]) def make_golden_dataset(compressed_inputs=False): @@ -934,21 +950,23 @@ class MergeVcfAndGvcfTest(parameterized.TestCase): (('1', 6, 9), ('1', 3, 4), False), ) def test_lessthan_comparison(self, v1, v2, expected): - variant1 = _create_nonvariant(*v1) - variant2 = _create_nonvariant(*v2) + variant1 = _create_nonvariant(*v1, ref_base='A') + variant2 = _create_nonvariant(*v2, ref_base='C') lessthan = postprocess_variants._get_contig_based_lessthan(_CONTIGS) self.assertEqual(lessthan(variant1, variant2), expected) self.assertEqual(lessthan(variant1, None), True) self.assertEqual(lessthan(None, variant1), False) @parameterized.parameters( - (_create_nonvariant('1', 10, 15), 10, 12, _create_nonvariant('1', 10, - 12)), - (_create_nonvariant('2', 1, 9), 3, 4, _create_nonvariant('2', 3, 4)), + (_create_nonvariant('1', 10, 15, 'G'), 10, 12, + _create_nonvariant('1', 10, 12, 'G')), + (_create_nonvariant('2', 1, 9, 'C'), 3, 4, + _create_nonvariant('2', 3, 4, 'G')), ) def test_create_record_from_template(self, template, start, end, expected): + reader = DummyReferenceReader() actual = postprocess_variants._create_record_from_template( - template, start, end) + template, start, end, reader) self.assertEqual(actual, expected) @parameterized.parameters( @@ -1035,43 +1053,44 @@ def test_transform_to_gvcf_no_allele_addition(self, alts, gls, vaf): ([], [], []), # Non-overlapping records. ([('1', 1, 'A')], [], [_simple_variant('1', 1, 'A')]), - ([('1', 3, 'A'), ('1', 7, 'C'), - ('2', 6, 'G')], [('2', 3, 6), ('2', 7, 9)], [ - _simple_variant('1', 3, 'A'), - _simple_variant('1', 7, 'C'), - _create_nonvariant('2', 3, 6), - _simple_variant('2', 6, 'G'), - _create_nonvariant('2', 7, 9) + ([('1', 3, 'C'), ('1', 7, 'T'), + ('2', 6, 'A')], [('2', 3, 6, 'G'), ('2', 7, 9, 'C')], [ + _simple_variant('1', 3, 'C'), + _simple_variant('1', 7, 'T'), + _create_nonvariant('2', 3, 6, 'G'), + _simple_variant('2', 6, 'A'), + _create_nonvariant('2', 7, 9, 'C') ]), # Non-variant record overlaps a variant from the left. - ([('1', 5, 'CACGTG')], [('1', 2, 8)], - [_create_nonvariant('1', 2, 5), - _simple_variant('1', 5, 'CACGTG')]), + ([('1', 5, 'GTTACG')], [('1', 2, 8, 'C')], + [_create_nonvariant('1', 2, 5, 'C'), + _simple_variant('1', 5, 'GTTACG')]), # Non-variant record overlaps a variant from the right. - ([('1', 5, 'CACGTG')], [('1', 8, 15)], - [_simple_variant('1', 5, 'CACGTG'), - _create_nonvariant('1', 11, 15)]), + ([('1', 5, 'GTTACG')], [('1', 8, 15, 'A')], [ + _simple_variant('1', 5, 'GTTACG'), + _create_nonvariant('1', 11, 15, 'T') + ]), # Non-variant record is subsumed by a variant. - ([('1', 5, 'CACGTG')], [('1', 5, 11)], - [_simple_variant('1', 5, 'CACGTG')]), + ([('1', 5, 'GTTACG')], [('1', 5, 11, 'G')], + [_simple_variant('1', 5, 'GTTACG')]), # Non-variant record subsumes a variant. - ([('1', 5, 'CACGTG')], [('1', 4, 12)], [ - _create_nonvariant('1', 4, 5), - _simple_variant('1', 5, 'CACGTG'), - _create_nonvariant('1', 11, 12) + ([('1', 5, 'GTTACG')], [('1', 4, 12, 'G')], [ + _create_nonvariant('1', 4, 5, 'G'), + _simple_variant('1', 5, 'GTTACG'), + _create_nonvariant('1', 11, 12, 'T') ]), # Non-variant record subsumes multiple overlapping variants. - ([('1', 3, 'AAAAAAA'), ('1', 5, 'A')], [('1', 1, 15)], [ - _create_nonvariant('1', 1, 3), - _simple_variant('1', 3, 'AAAAAAA'), - _simple_variant('1', 5, 'A'), - _create_nonvariant('1', 10, 15) + ([('1', 3, 'CGGTTAC'), ('1', 5, 'G')], [('1', 1, 15, 'A')], [ + _create_nonvariant('1', 1, 3, 'A'), + _simple_variant('1', 3, 'CGGTTAC'), + _simple_variant('1', 5, 'G'), + _create_nonvariant('1', 10, 15, 'G') ]), - ([('1', 3, 'AAAAAAA'), ('1', 5, 'AAAAAAA')], [('1', 1, 15)], [ - _create_nonvariant('1', 1, 3), - _simple_variant('1', 3, 'AAAAAAA'), - _simple_variant('1', 5, 'AAAAAAA'), - _create_nonvariant('1', 12, 15) + ([('1', 3, 'CGGTTAC'), ('1', 5, 'GTTACGT')], [('1', 1, 15, 'A')], [ + _create_nonvariant('1', 1, 3, 'A'), + _simple_variant('1', 3, 'CGGTTAC'), + _simple_variant('1', 5, 'GTTACGT'), + _create_nonvariant('1', 12, 15, 'T') ]), ) def test_merge_variants_and_nonvariants(self, variants, nonvariants, @@ -1079,8 +1098,9 @@ def test_merge_variants_and_nonvariants(self, variants, nonvariants, viter = (_simple_variant(*v) for v in variants) nonviter = (_create_nonvariant(*nv) for nv in nonvariants) lessthan = postprocess_variants._get_contig_based_lessthan(_CONTIGS) + reader = DummyReferenceReader() actual = postprocess_variants.merge_variants_and_nonvariants( - viter, nonviter, lessthan) + viter, nonviter, lessthan, reader) self.assertEqual(list(actual), expected) diff --git a/deepvariant/testdata/golden.postprocess_gvcf_output.g.vcf b/deepvariant/testdata/golden.postprocess_gvcf_output.g.vcf index 957084e2..0780a9c0 100644 --- a/deepvariant/testdata/golden.postprocess_gvcf_output.g.vcf +++ b/deepvariant/testdata/golden.postprocess_gvcf_output.g.vcf @@ -210,7 +210,7 @@ chr20 10008936 . C <*> 0 . END=10008947 GT:GQ:MIN_DP:PL 0/0:50:30:0,90,899 chr20 10008948 . TA T,<*> 36.8 PASS . GT:GQ:DP:AD:VAF:PL 1/1:24:39:0,39,0:1,0:36,24,0,990,990,990 chr20 10008950 . A <*> 0 . END=10008951 GT:GQ:MIN_DP:PL 0/0:50:35:0,105,1049 chr20 10008952 . CACACACACACA C,CCACACACACA,<*> 31 PASS . GT:GQ:DP:AD:VAF:PL 1/2:11:41:1,14,25,0:0.341463,0.609756,0:30,20,44,20,0,11,990,990,990,990 -chr20 10008964 . A <*> 0 . END=10008999 GT:GQ:MIN_DP:PL 0/0:50:23:0,69,689 +chr20 10008964 . C <*> 0 . END=10008999 GT:GQ:MIN_DP:PL 0/0:50:23:0,69,689 chr20 10009000 . T <*> 0 . END=10009226 GT:GQ:MIN_DP:PL 0/0:50:45:0,144,1439 chr20 10009227 . A G,<*> 43.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:41:66:34,32,0:0.484848,0:43,0,45,990,990,990 chr20 10009228 . G <*> 0 . END=10009245 GT:GQ:MIN_DP:PL 0/0:50:67:0,201,2009 diff --git a/docs/deepvariant-case-study.md b/docs/deepvariant-case-study.md index 3f4096d8..40248122 100644 --- a/docs/deepvariant-case-study.md +++ b/docs/deepvariant-case-study.md @@ -263,8 +263,8 @@ Step | wall time `make_examples` | 5h 37m 42s `call_variants` | 11h 0m 29s `postprocess_variants` (no gVCF) | 21m 54s -`postprocess_variants` (with gVCF) | 58m 24s -total time (single machine) | 17h - 17h 36m +`postprocess_variants` (with gVCF) | 59m 13s +total time (single machine) | 17h - 17h 37m ## Variant call quality diff --git a/docs/deepvariant-exome-case-study.md b/docs/deepvariant-exome-case-study.md index a80b79c8..352bfa6f 100644 --- a/docs/deepvariant-exome-case-study.md +++ b/docs/deepvariant-exome-case-study.md @@ -216,8 +216,8 @@ Step | wall time `make_examples` | 69m 22s `call_variants` | 6m 32s `postprocess_variants` (no gVCF) | 0m 13s -`postprocess_variants` (with gVCF) | 0m 41s -total time (single machine) | ~ 1h 16m +`postprocess_variants` (with gVCF) | 1m 29s +total time (single machine) | ~ 1h 17m ## Variant call quality