From e7cda057968907a9e774b59c320037cfdc6471ad Mon Sep 17 00:00:00 2001
From: Yossi Farjoun
Date: Tue, 13 Jul 2021 23:35:13 -0400
Subject: [PATCH] fix: underflow in HaplotypeProbabilitiesFromContaminatorSequence (#1697)

* fix: underflow in HaplotypeProbabilitiesFromContaminatorSequence, which
affects both IdentifyContamination and ExtractFingerprint in deep coverage.
This was leading to non-informative sites in RNA and similar sequencing data.
---
 .../picard/fingerprint/FingerprintUtils.java  |  4 +-
 ...ProbabilitiesFromContaminatorSequence.java | 16 +++----
 .../fingerprint/FingerprintCheckerTest.java   | 43 +++++++++++++++++--
 src/test/java/picard/util/TestNGUtil.java     | 17 +++++++-
 4 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/src/main/java/picard/fingerprint/FingerprintUtils.java b/src/main/java/picard/fingerprint/FingerprintUtils.java
index e3695f22b5..6cc7e98067 100644
--- a/src/main/java/picard/fingerprint/FingerprintUtils.java
+++ b/src/main/java/picard/fingerprint/FingerprintUtils.java
@@ -24,6 +24,7 @@
 
 package picard.fingerprint;
 
+import com.google.common.annotations.VisibleForTesting;
 import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.samtools.reference.ReferenceSequenceFile;
 import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
@@ -143,7 +144,8 @@ public static VariantContextSet createVCSetFromFingerprint(final Fingerprint fin
         return variantContexts;
     }
 
-    private static VariantContext getVariantContext(final ReferenceSequenceFile reference,
+    @VisibleForTesting
+    static VariantContext getVariantContext(final ReferenceSequenceFile reference,
                                                     final String sample,
                                                     final HaplotypeProbabilities haplotypeProbabilities) {
         final Snp snp = haplotypeProbabilities.getRepresentativeSnp();
diff --git a/src/main/java/picard/fingerprint/HaplotypeProbabilitiesFromContaminatorSequence.java b/src/main/java/picard/fingerprint/HaplotypeProbabilitiesFromContaminatorSequence.java
index a4e9de3b2b..b08838622e 100644
--- a/src/main/java/picard/fingerprint/HaplotypeProbabilitiesFromContaminatorSequence.java
+++ b/src/main/java/picard/fingerprint/HaplotypeProbabilitiesFromContaminatorSequence.java
@@ -39,9 +39,9 @@ public class HaplotypeProbabilitiesFromContaminatorSequence extends HaplotypePro
 
     public double contamination;
 
-    // for each model (contGenotype, mainGenotype) there's a likelihood of the data. These need to be collected separately
+    // for each model (contGenotype, mainGenotype) there's a LogLikelihood of the data. These need to be collected separately
     // and only collated once all the data is in.
-    private final double[][] likelihoodMap = {{1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+    private final double[][] logLikelihoodMap = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
     private boolean valuesNeedUpdating = true;
 
     public HaplotypeProbabilitiesFromContaminatorSequence(final HaplotypeBlock haplotypeBlock, final double contamination) {
@@ -58,7 +58,7 @@ public HaplotypeProbabilitiesFromContaminatorSequence(HaplotypeProbabilitiesFrom
         contamination = other.contamination;
 
         for (final Genotype g : Genotype.values()) {
-            System.arraycopy(other.likelihoodMap[g.v], 0, likelihoodMap[g.v], 0, NUM_GENOTYPES);
+            System.arraycopy(other.logLikelihoodMap[g.v], 0, logLikelihoodMap[g.v], 0, NUM_GENOTYPES);
         }
     }
 
@@ -92,9 +92,9 @@ public void addToProbs(final Snp snp, final byte base, final byte qual) {
             for (final Genotype mainGeno : Genotype.values()) {
                 // theta is the expected frequency of the alternate allele
                 final double theta = 0.5 * ((1 - contamination) * mainGeno.v + contamination * contGeno.v);
-                likelihoodMap[contGeno.v][mainGeno.v] *=
-                        (( altAllele ? theta : (1 - theta)) * (1 - pErr) +
-                        (!altAllele ? theta : (1 - theta)) * pErr );
+                logLikelihoodMap[contGeno.v][mainGeno.v] +=
+                        Math.log10((( altAllele ? theta : (1 - theta)) * (1 - pErr) +
+                        (!altAllele ? theta : (1 - theta)) * pErr ));
             }
         }
     }
@@ -108,7 +108,7 @@ private void updateLikelihoods() {
         final double[] ll = new double[NUM_GENOTYPES];
         for (final Genotype contGeno : Genotype.values()) {
             // p(a | g_c) = \sum_g_m { P(g_m) \prod_i P(a_i| g_m, g_c)}
-            ll[contGeno.v] = Math.log10(Double.MIN_VALUE + MathUtil.sum(MathUtil.multiply(this.getPriorProbablities(), likelihoodMap[contGeno.v])));
+            ll[contGeno.v] = MathUtil.LOG_10_MATH.sum( MathUtil.sum(MathUtil.LOG_10_MATH.getLogValue(this.getPriorProbablities()), logLikelihoodMap[contGeno.v]));
         }
         setLogLikelihoods(ll);
     }
@@ -137,7 +137,7 @@ public HaplotypeProbabilitiesFromContaminatorSequence merge(final HaplotypeProba
         }
 
         for (final Genotype contGeno : Genotype.values()) {
-            this.likelihoodMap[contGeno.v] = MathUtil.multiply(this.likelihoodMap[contGeno.v], o.likelihoodMap[contGeno.v]);
+            this.logLikelihoodMap[contGeno.v] = MathUtil.sum(this.logLikelihoodMap[contGeno.v], o.logLikelihoodMap[contGeno.v]);
         }
         valuesNeedUpdating = true;
         return this;
diff --git a/src/test/java/picard/fingerprint/FingerprintCheckerTest.java b/src/test/java/picard/fingerprint/FingerprintCheckerTest.java
index 303667e9a0..1c1252a205 100644
--- a/src/test/java/picard/fingerprint/FingerprintCheckerTest.java
+++ b/src/test/java/picard/fingerprint/FingerprintCheckerTest.java
@@ -1,13 +1,17 @@
 package picard.fingerprint;
 
 import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
 import htsjdk.samtools.util.CollectionUtil;
+import htsjdk.variant.variantcontext.VariantContext;
 import htsjdk.variant.vcf.VCFFileReader;
 import org.testng.Assert;
 import org.testng.annotations.BeforeClass;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 import picard.PicardException;
+import picard.util.MathUtil;
 import picard.vcf.VcfTestUtils;
 
 import java.io.File;
@@ -27,6 +31,7 @@
 import java.util.stream.Collectors;
 
 import static picard.util.TestNGUtil.compareDoubleWithAccuracy;
+import static picard.util.TestNGUtil.interaction;
 
 /**
  * Created by farjoun on 8/27/15.
@@ -34,7 +39,7 @@
 public class FingerprintCheckerTest {
 
     private final double maf = 0.4;
-    private final Snp snp = new Snp("test", "chr1", 1, (byte) 'A', (byte) 'C', maf, Collections.singletonList("dummy"));
+    private final Snp snp = new Snp("test", "chr1", 1, (byte) 'A', (byte) 'T', maf, Collections.singletonList("dummy"));
     private final HaplotypeBlock hb = new HaplotypeBlock(maf);
 
     private static final double DELTA = 1e-6;
@@ -316,8 +321,38 @@ public void testQueryable(final File vcf, boolean expectedQueryable) {
         }
     }
 
+    Object[][] deepData() {
+        return new Object[][]{{10}, {50}, {100}, {500}, {1000}, {5000}, {10000}, {50000}};
+    }
+
+    Object[][] haplotypeProbabilities() {
+        return new Object[][]{{new HaplotypeProbabilitiesFromSequence(hb)}, {new HaplotypeProbabilitiesFromContaminatorSequence(hb, 0.999)}};
+    }
+
+    @DataProvider
+    Iterator<Object[]> deepDataProvider(){
+        return interaction(haplotypeProbabilities(), deepData());
+    }
+
+    @Test(dataProvider = "deepDataProvider")
+    void testCanHandleDeepData(final HaplotypeProbabilitiesFromSequence hp, final int counts) throws IOException {
+
+        addObservation(hp, hb, counts, hb.getFirstSnp().getAllele1());
+        addObservation(hp, hb, counts, hb.getFirstSnp().getAllele2());
+
+        final double[] logLikelihoods = hp.getLogLikelihoods();
+        Assert.assertTrue( MathUtil.min(logLikelihoods) < 0 );
+
+        final File fasta = new File(TEST_DATA_DIR, "reference.fasta");
+
+        try (final ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(fasta)) {
+            final VariantContext vc = FingerprintUtils.getVariantContext(ref, "test", hp);
+            Assert.assertTrue(MathUtil.max(MathUtil.promote(vc.getGenotype(0).getPL())) > 0);
+        }
+    }
+
     @DataProvider()
-    Object[][] mergeIsDafeProvider() {
+    Object[][] mergeIsSafeProvider() {
         final HaplotypeProbabilitiesFromSequence hp1 = new HaplotypeProbabilitiesFromSequence(hb);
         final HaplotypeProbabilitiesFromSequence hp2 = new HaplotypeProbabilitiesFromSequence(hb);
 
@@ -353,7 +388,7 @@ private static void addObservation(final HaplotypeProbabilitiesFromSequence hapl
         }
     }
 
-    @Test(dataProvider = "mergeIsDafeProvider")
+    @Test(dataProvider = "mergeIsSafeProvider")
     public void testMergeHaplotypeProbabilitiesIsSafe(final HaplotypeProbabilities hp1,
                                                       final HaplotypeProbabilities hp2) {
 
@@ -363,7 +398,7 @@ public void testMergeHaplotypeProbabilitiesIsSafe(final HaplotypeProbabilities h
         Assert.assertEquals(merged1.getLikelihoods(), merged2.getLikelihoods());
     }
 
-    @Test(dataProvider = "mergeIsDafeProvider")
+    @Test(dataProvider = "mergeIsSafeProvider")
     public void testMergeFingerprintIsSafe(final HaplotypeProbabilities hp1,
                                            final HaplotypeProbabilities hp2) {
 
         final Fingerprint fpA = new Fingerprint("test2", null, "none");
diff --git a/src/test/java/picard/util/TestNGUtil.java b/src/test/java/picard/util/TestNGUtil.java
index 2d22b21475..12e68e4ab2 100644
--- a/src/test/java/picard/util/TestNGUtil.java
+++ b/src/test/java/picard/util/TestNGUtil.java
@@ -22,11 +22,24 @@
  */
 public class TestNGUtil {
 
+    // Not actually the smallest possible value on purpose, since that would be indistinguishable from 0 and then useless.
+    // This is small enough to be meaningless, but representable.
     static final double EPSILON = 1e-300; //a constant near the smallest possible positive value representable in double,
-    // not actually the smallest possible value on purpose, since that would be indistinguishable from 0 and then useless.
-    // This is small enough to be meaningless, but representable.
 
+    // A method that takes two Object[][] (normally from two data providers) and creates a third of all possible combinations
+    public static Iterator<Object[]> interaction(final Object[][] dataprovider1, final Object[][] dataprovider2) {
+        final List<Object[]> testcases = new ArrayList<>(dataprovider1.length * dataprovider2.length);
+        for (final Object[] objects1 : dataprovider1) {
+            for (final Object[] objects2 : dataprovider2) {
+                final Object[] objects = new Object[objects1.length + objects2.length];
+                System.arraycopy(objects1, 0, objects, 0, objects1.length);
+                System.arraycopy(objects2, 0, objects, objects1.length, objects2.length);
+                testcases.add(objects);
+            }
+        }
+        return testcases.iterator();
+    }
 
     public static abstract class SingleTestUnitTest {
 
         @DataProvider(name = "testcases")
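
Background note (illustrative, not part of the patch): the underflow being fixed comes from accumulating per-base likelihoods by repeated multiplication. At depths in the tens of thousands the running product drops below the smallest representable double and becomes 0.0, so every genotype hypothesis ends up looking equally (un)informative. Accumulating the same evidence as a sum of log10 likelihoods, as the patch does, keeps the value finite. Below is a minimal, self-contained sketch of the two behaviours; the class and variable names are invented for the example, and 0.5 is just a stand-in for a typical per-base likelihood at a het site.

    public class UnderflowDemo {
        public static void main(String[] args) {
            final double perBaseLikelihood = 0.5; // stand-in for one base's likelihood under some genotype model
            final int depth = 50_000;             // deep, RNA-seq-like coverage

            double product = 1.0;   // pre-fix style accumulator (cf. likelihoodMap)
            double log10Sum = 0.0;  // post-fix style accumulator (cf. logLikelihoodMap)
            for (int i = 0; i < depth; i++) {
                product *= perBaseLikelihood;              // underflows to 0.0 after roughly 1075 factors of 0.5
                log10Sum += Math.log10(perBaseLikelihood); // stays finite no matter the depth
            }

            System.out.println("product  = " + product);   // 0.0 -> a later log10 of it is -Infinity
            System.out.println("log10Sum = " + log10Sum);  // about -15051.5, still comparable across genotypes
        }
    }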
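A related detail in updateLikelihoods(): after switching to log10 accumulation, combining the per-genotype terms with the priors still requires a sum of probabilities, i.e. log10(sum_g 10^(log10 P(g) + log10 L(a|g))). Exponentiating naively would reintroduce the underflow; the standard remedy is to factor out the largest exponent first (the log-sum-exp trick). The sketch below shows that idea only and is not claimed to be the actual implementation behind MathUtil.LOG_10_MATH.sum.

    public final class Log10Sum {
        // log10(sum_i 10^x[i]) computed without leaving log space for the dominant term.
        public static double sum(final double[] log10Values) {
            double max = Double.NEGATIVE_INFINITY;
            for (final double v : log10Values) {
                max = Math.max(max, v);
            }
            if (max == Double.NEGATIVE_INFINITY) {
                return max; // every term is zero in linear space
            }
            double total = 0.0;
            for (final double v : log10Values) {
                total += Math.pow(10.0, v - max); // each term is <= 1; the largest is exactly 1, so no underflow to 0
            }
            return max + Math.log10(total);
        }

        public static void main(String[] args) {
            // Two log10 likelihoods whose linear-space values are far below Double.MIN_VALUE:
            System.out.println(sum(new double[]{-15051.5, -15060.2})); // about -15051.5, still finite
        }
    }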