Skip to content

Commit

Permalink
Properly handle info fields with embedded spaces by VCF version.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Jun 1, 2023
1 parent 5700958 commit d995e73
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/main/java/htsjdk/variant/vcf/VCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ protected void reportDuplicateInfoKeyValue(final String duplicateKey, final Stri
* @return a mapping of keys to objects
*/
protected Map<String, Object> parseInfo(String infoField) {
if (infoField.indexOf(' ') != -1) {
if ((infoField.indexOf(' ') != -1) && !version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) {
generateException(
String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s",
version == null ?
Expand Down
19 changes: 19 additions & 0 deletions src/test/java/htsjdk/variant/vcf/VCFCodec42FeaturesTest.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
package htsjdk.variant.vcf;

import htsjdk.HtsjdkTest;
import htsjdk.samtools.util.Tuple;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.variantcontext.VariantContext;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;

public class VCFCodec42FeaturesTest extends HtsjdkTest {
private static final Path TEST_PATH = Paths.get("src/test/resources/htsjdk/variant/");
Expand All @@ -21,4 +25,19 @@ public void testV42PedigreeParsing() {
Assert.assertEquals(vcf42PedigreeLine.getClass(), VCFHeaderLine.class);
Assert.assertEquals(vcf42PedigreeLine.getValue(), "<Derived=NA12891, Original=NA12878>");
}

/**
 * Verifies that reading a VCF 4.2 file whose INFO column contains an embedded space
 * fails with a {@link TribbleException} that mentions the disallowed whitespace.
 *
 * The exception is caught only to check its message, and is then rethrown so that
 * the {@code expectedExceptions} contract on the annotation is still satisfied.
 */
@Test(expectedExceptions = TribbleException.class)
public void testVCF42RejectsInfoFieldWithSpaces() {
    // the record in this file carries an INFO value with an embedded space ("set=filteredIn Both")
    final Path vcf42WithInfoSpace = TEST_PATH.resolve("infoSpace42.vcf");

    try (final VCFFileReader reader = new VCFFileReader(vcf42WithInfoSpace, false)) {
        // walk every record; the values themselves are irrelevant, we only want the reader to be consumed
        reader.forEach(ignoredRecord -> { });
    } catch (final TribbleException whitespaceFailure) {
        final String failureMessage = whitespaceFailure.getMessage();
        Assert.assertTrue(failureMessage.contains("Whitespace is not allowed"));
        throw whitespaceFailure;
    }
}
}
43 changes: 40 additions & 3 deletions src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package htsjdk.variant.vcf;

import htsjdk.beta.io.IOPathUtils;
import htsjdk.io.IOPath;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.util.FileExtensions;
import htsjdk.samtools.util.Interval;
Expand All @@ -19,8 +21,7 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -146,7 +147,7 @@ public void testVCF43PercentEncoding(final Path testFile, int ignored) {
Assert.assertEquals(vc.getContig(), "1");
Assert.assertEquals(vc.getStart(), 327);
// set=fil%3AteredInBoth
Assert.assertEquals(vc.getCommonInfo().getAttribute("set"), "fil:teredInBoth");
Assert.assertEquals(vc.getCommonInfo().getAttribute("set"), "fil:teredIn Both");
}

@Test(dataProvider="all43Files")
Expand Down Expand Up @@ -203,6 +204,42 @@ public void testVCF43IndexRoundTripQuery(final Path testFile) throws IOException
}
}

/**
 * Verifies that a VCF 4.3 file whose INFO column contains an embedded space is read
 * successfully and that the space survives in the decoded attribute value.
 *
 * The complementary case (a VCF 4.2 file with the same content must be rejected) is
 * covered by {@code VCFCodec42FeaturesTest#testVCF42RejectsInfoFieldWithSpaces}.
 */
@Test
public void testVCF43AcceptsInfoFieldWithSpaces() {
    // 1st variant has an info field ("set") with a value containing an embedded space
    final Path infoSpaceFile = TEST_PATH.resolve("infoSpace43.vcf");
    final Tuple<VCFHeader, List<VariantContext>> infoSpace43 = readEntireVCFIntoMemory(infoSpaceFile);

    // fail with a readable message, rather than an IndexOutOfBounds/NPE, if the fixture changes shape
    final List<VariantContext> variants = infoSpace43.b;
    Assert.assertFalse(variants.isEmpty(), "Expected at least one variant in " + infoSpaceFile);

    final Object setAttribute = variants.get(0).getAttribute("set");
    Assert.assertNotNull(setAttribute, "Expected the first variant to have a \"set\" INFO attribute");
    Assert.assertTrue(
            setAttribute.toString().contains(" "),
            "Expected the \"set\" INFO value to retain its embedded space but was: " + setAttribute);
}

//
// UTF8-specific tests
//
Expand Down
62 changes: 62 additions & 0 deletions src/test/resources/htsjdk/variant/infoSpace42.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
##fileformat=VCFv4.2
##ALT=<ID=DEL,Description="Deletion",ExtraAltField="extra alt">
##ALT=<ID=DUP,Description="Duplication">
##ALT=<ID=INS,Description="Insertion">
##ALT=<ID=INV,Description="Inversion">
##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
##ALT=<ID=TRA,Description="Translocation">
##FILTER=<ID=GATK_STANDARD,Description="Standard GATK filter",ExtraFilterField="extra filter field">
##FILTER=<ID=HARD_TO_VALIDATE,Description="Hard to validate">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed",ExtraFormatField="extra format">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=AB,Number=1,Type=Float,Description="Allele Balance for hets (ref/(ref+alt))",ExtraInfoField="extra info">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
##INFO=<ID=BaseQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
##INFO=<ID=MQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
##INFO=<ID=ReadPosRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
##contig=<ID=1,length=249250621,assembly=b37,extraContigField="extra contig field">
##contig=<ID=10,length=135534747,assembly=b37>
##contig=<ID=11,length=135006516,assembly=b37>
##contig=<ID=12,length=133851895,assembly=b37>
##contig=<ID=13,length=115169878,assembly=b37>
##contig=<ID=14,length=107349540,assembly=b37>
##contig=<ID=15,length=102531392,assembly=b37>
##contig=<ID=16,length=90354753,assembly=b37>
##contig=<ID=17,length=81195210,assembly=b37>
##contig=<ID=18,length=78077248,assembly=b37>
##contig=<ID=19,length=59128983,assembly=b37>
##contig=<ID=2,length=243199373,assembly=b37>
##contig=<ID=20,length=63025520,assembly=b37>
##contig=<ID=21,length=48129895,assembly=b37>
##contig=<ID=22,length=51304566,assembly=b37>
##contig=<ID=3,length=198022430,assembly=b37>
##contig=<ID=4,length=191154276,assembly=b37>
##contig=<ID=5,length=180915260,assembly=b37>
##contig=<ID=6,length=171115067,assembly=b37>
##contig=<ID=7,length=159138663,assembly=b37>
##contig=<ID=8,length=146364022,assembly=b37>
##contig=<ID=9,length=141213431,assembly=b37>
##contig=<ID=X,length=155270560,assembly=b37>
##contig=<ID=Y,length=59373566,assembly=b37>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240
1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredIn Both GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99
62 changes: 62 additions & 0 deletions src/test/resources/htsjdk/variant/vcf43/infoSpace43.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
##fileformat=VCFv4.3
##ALT=<ID=DEL,Description="Deletion",ExtraAltField="extra alt">
##ALT=<ID=DUP,Description="Duplication">
##ALT=<ID=INS,Description="Insertion">
##ALT=<ID=INV,Description="Inversion">
##ALT=<ID=INVDUP,Description="InvertedDUP with unknown boundaries">
##ALT=<ID=TRA,Description="Translocation">
##FILTER=<ID=GATK_STANDARD,Description="Standard GATK filter",ExtraFilterField="extra filter field">
##FILTER=<ID=HARD_TO_VALIDATE,Description="Hard to validate">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed",ExtraFormatField="extra format">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth (only filtered reads used for calling)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##INFO=<ID=AB,Number=1,Type=Float,Description="Allele Balance for hets (ref/(ref+alt))",ExtraInfoField="extra info">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
##INFO=<ID=BaseQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
##INFO=<ID=MQRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read mapping qualities">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Phred-scaled p-value From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
##INFO=<ID=ReadPosRankSumZ,Number=1,Type=Float,Description="Z-score From Wilcoxon Rank Sum Test of Alt Vs. Ref read position bias">
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
##INFO=<ID=set,Number=1,Type=String,Description="Source VCF for the merged record in CombineVariants">
##contig=<ID=1,length=249250621,assembly=b37,extraContigField="extra contig field">
##contig=<ID=10,length=135534747,assembly=b37>
##contig=<ID=11,length=135006516,assembly=b37>
##contig=<ID=12,length=133851895,assembly=b37>
##contig=<ID=13,length=115169878,assembly=b37>
##contig=<ID=14,length=107349540,assembly=b37>
##contig=<ID=15,length=102531392,assembly=b37>
##contig=<ID=16,length=90354753,assembly=b37>
##contig=<ID=17,length=81195210,assembly=b37>
##contig=<ID=18,length=78077248,assembly=b37>
##contig=<ID=19,length=59128983,assembly=b37>
##contig=<ID=2,length=243199373,assembly=b37>
##contig=<ID=20,length=63025520,assembly=b37>
##contig=<ID=21,length=48129895,assembly=b37>
##contig=<ID=22,length=51304566,assembly=b37>
##contig=<ID=3,length=198022430,assembly=b37>
##contig=<ID=4,length=191154276,assembly=b37>
##contig=<ID=5,length=180915260,assembly=b37>
##contig=<ID=6,length=171115067,assembly=b37>
##contig=<ID=7,length=159138663,assembly=b37>
##contig=<ID=8,length=146364022,assembly=b37>
##contig=<ID=9,length=141213431,assembly=b37>
##contig=<ID=X,length=155270560,assembly=b37>
##contig=<ID=Y,length=59373566,assembly=b37>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240
1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredIn Both GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99

0 comments on commit d995e73

Please sign in to comment.