Skip to content

Commit

Permalink
bugfix and nio access in Fingerprinting code (no need to panic) (#1694)
Browse files Browse the repository at this point in the history
* - Convert CalculateFingerprintMetrics, ExtractFingerprint, and ExtractContamination to use NIO and thus be able to to their work on a non-localized input file.
- Fixed a bug in the AD output of extractFingerprint and CalculateContamination (Ref and Alt were swapped...)
- Renamed PL variable to be GL since that's what is actually was.

* -removed unneeded mathUtil method.
  • Loading branch information
Yossi Farjoun authored Jul 7, 2021
1 parent 77b9159 commit 7172f0b
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 17 deletions.
13 changes: 10 additions & 3 deletions src/main/java/picard/fingerprint/CalculateFingerprintMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.apache.commons.math3.stat.inference.ChiSquareTest;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.barclay.help.DocumentedFeature;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
Expand Down Expand Up @@ -70,7 +71,7 @@ public class CalculateFingerprintMetrics extends CommandLineProgram {

static final String USAGE_DETAILS =
"This tools collects various statistics that pertain to a single fingerprint (<b>not</b> the comparison, or " +
"\'fingerprinting\' of two distinct samples) and reports the results in a metrics file. " +
"'fingerprinting' of two distinct samples) and reports the results in a metrics file. " +
"<p>" +
"The statistics collected are p-values, where the null-hypothesis is that the fingerprint is collected from " +
"a non-contaminated, diploid human, whose genotypes are modelled by the probabilities given in the " +
Expand Down Expand Up @@ -112,6 +113,10 @@ public class CalculateFingerprintMetrics extends CommandLineProgram {
@Argument(doc="Number of randomization trials for calculating the DISCRIMINATORY_POWER metric.")
public final int NUMBER_OF_SAMPLING = 100;

@Hidden
@Argument(doc = "When true code will check for readability on input files (this can be slow on cloud access)")
public boolean TEST_INPUT_READABILITY = true;

// a fixed random seed for reproducibility;
private static final int RANDOM_SEED = 42;
private static final ChiSquareTest chiSquareTest = new ChiSquareTest();
Expand All @@ -120,7 +125,9 @@ public class CalculateFingerprintMetrics extends CommandLineProgram {
protected int doWork() {

final List<Path> inputPaths = IOUtil.getPaths(INPUT);
IOUtil.assertPathsAreReadable(inputPaths);
if (TEST_INPUT_READABILITY) {
IOUtil.assertPathsAreReadable(inputPaths);
}
IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
IOUtil.assertFileIsWritable(OUTPUT);

Expand Down Expand Up @@ -197,7 +204,7 @@ public FingerprintMetrics getFingerprintMetrics(final Fingerprint fingerprint) {
fingerprintMetrics.LOG10_HOM_CHI_SQUARED_PVALUE = Math.log10(homsChiSquaredTest);

// calculate LOD (cross-entropy)
fingerprintMetrics.HOM_CROSS_ENTROPY_LOD = MathUtil.klDivergance(homAllele1VsAllele2Counts, homAllele1VsAllele2Expect);;
fingerprintMetrics.HOM_CROSS_ENTROPY_LOD = MathUtil.klDivergance(homAllele1VsAllele2Counts, homAllele1VsAllele2Expect);

final MathUtils.RunningStat randomTrials = new MathUtils.RunningStat();

Expand Down
23 changes: 20 additions & 3 deletions src/main/java/picard/fingerprint/ExtractFingerprint.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,14 @@
import htsjdk.samtools.util.Log;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Map;

/**
Expand All @@ -50,7 +53,7 @@
public class ExtractFingerprint extends CommandLineProgram {

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM/BAM/CRAM file.")
public File INPUT;
public String INPUT;

@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output fingerprint file (VCF).")
public File OUTPUT;
Expand All @@ -75,6 +78,10 @@ public class ExtractFingerprint extends CommandLineProgram {
"It names the sample in the VCF <SAMPLE>-contaminant, using the SM value from the SAM header.")
public boolean EXTRACT_CONTAMINATION = false;

@Hidden
@Argument(doc = "When true code will check for readability on input files (this can be slow on cloud access)")
public boolean TEST_INPUT_READABILITY = true;

@Override
protected boolean requiresReference() {
return true;
Expand All @@ -84,7 +91,17 @@ protected boolean requiresReference() {

@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
// IOUtil.assertFileIsReadable(INPUT);
final Path inputPath;
try {
inputPath = IOUtil.getPath(INPUT);
} catch (IOException e) {
throw new RuntimeException(e);
}
if (TEST_INPUT_READABILITY) {
IOUtil.assertFileIsReadable(inputPath);
}

IOUtil.assertFileIsReadable(HAPLOTYPE_MAP);
IOUtil.assertFileIsWritable(OUTPUT);
IOUtil.assertFileIsReadable(referenceSequence.getReferenceFile());
Expand All @@ -104,7 +121,7 @@ protected int doWork() {
checker.setDefaultSampleID(SAMPLE_ALIAS);
}

final Map<String, Fingerprint> fingerprintMap = checker.identifyContaminant(INPUT.toPath(), CONTAMINATION);
final Map<String, Fingerprint> fingerprintMap = checker.identifyContaminant(inputPath, CONTAMINATION);

if (fingerprintMap.size() != 1) {
log.error("Expected exactly 1 fingerprint, found " + fingerprintMap.size());
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/picard/fingerprint/FingerprintChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -481,9 +481,9 @@ public Map<FingerprintIdDetails, Fingerprint> fingerprintSamFile(final Path samF
checkDictionaryGoodForFingerprinting(in.getFileHeader().getSequenceDictionary());

if (!in.hasIndex()) {
log.warn(String.format("Operating without an index! We could be here for a while. (%s)", samFile));
log.warn(String.format("Operating without an index! We could be here for a while. (%s)", samFile.toUri().toString()));
} else {
log.info(String.format("Reading an indexed file (%s)", samFile));
log.info(String.format("Reading an indexed file (%s)", samFile.toUri().toString()));
}

final SamLocusIterator iterator = new SamLocusIterator(in, this.haplotypes.getIntervalList(), in.hasIndex());
Expand Down
11 changes: 6 additions & 5 deletions src/main/java/picard/fingerprint/FingerprintUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -173,18 +173,19 @@ private static VariantContext getVariantContext(final ReferenceSequenceFile refe
obsAlt = haplotypeProbabilities.getObsAllele2();
}

final double[] origPLs = haplotypeProbabilities.getLogLikelihoods();
final double[] PLs = Arrays.copyOf(origPLs,origPLs.length);
final double[] origGLs = haplotypeProbabilities.getLogLikelihoods();
final double[] GLs = Arrays.copyOf(origGLs, origGLs.length);

if (swap12) {
ArrayUtils.reverse(PLs);
ArrayUtils.reverse(GLs);
}
final List<Allele> alleles = Arrays.asList(alleleRef, alleleAlt);

final Genotype gt = new GenotypeBuilder()
.DP(haplotypeProbabilities.getTotalObs())
.noAttributes()
.PL(PLs)
.AD(new int[]{obsAlt, obsRef})
.PL(GLs)
.AD(new int[]{obsRef, obsAlt})
.name(sample)
.make();
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,11 @@ public void addToProbs(final Snp snp, final byte base, final byte qual) {

for (final Genotype contGeno : Genotype.values()) {
for (final Genotype mainGeno : Genotype.values()) {
//theta is the expected frequency of the alternate allele
// theta is the expected frequency of the alternate allele
final double theta = 0.5 * ((1 - contamination) * mainGeno.v + contamination * contGeno.v);
likelihoodMap[contGeno.v][mainGeno.v] *= ((altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr);
likelihoodMap[contGeno.v][mainGeno.v] *=
(( altAllele ? theta : (1 - theta)) * (1 - pErr) +
(!altAllele ? theta : (1 - theta)) * pErr );
}
}
}
Expand Down
8 changes: 7 additions & 1 deletion src/main/java/picard/fingerprint/IdentifyContaminant.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.Hidden;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;
Expand All @@ -48,7 +49,7 @@
public class IdentifyContaminant extends CommandLineProgram {

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
public File INPUT;
public String INPUT;

@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output fingerprint file (VCF).")
public File OUTPUT;
Expand All @@ -71,6 +72,10 @@ public class IdentifyContaminant extends CommandLineProgram {
"It names the sample in the VCF <SAMPLE>, using the SM value from the SAM header.")
public boolean EXTRACT_CONTAMINATED = false;

@Hidden
@Argument(doc = "When true code will check for readability on input files (this can be slow on cloud access)")
public boolean TEST_INPUT_READABILITY = true;

@Override
protected boolean requiresReference() {
return true;
Expand All @@ -90,6 +95,7 @@ protected int doWork() {
extractFingerprint.SAMPLE_ALIAS = SAMPLE_ALIAS;
extractFingerprint.VALIDATION_STRINGENCY = VALIDATION_STRINGENCY;
extractFingerprint.VERBOSITY = VERBOSITY;
extractFingerprint.TEST_INPUT_READABILITY = TEST_INPUT_READABILITY;
extractFingerprint.referenceSequence = referenceSequence;

extractFingerprint.doWork();
Expand Down

0 comments on commit 7172f0b

Please sign in to comment.