From ce7cdcf609d6ab9e3bba97bfd0b4d504f6d5a2f4 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 8 Nov 2021 17:11:25 -0500 Subject: [PATCH 01/12] Commit with raw GATK SequenceDictionaryUtils and SequenceDictionaryUtilsTest. --- .../samtools/SAMSequenceDictionaryUtils.java | 505 ++++++++++++++++++ .../SAMSequenceDictionaryUtilsTest.java | 357 +++++++++++++ 2 files changed, 862 insertions(+) create mode 100644 src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java create mode 100644 src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java new file mode 100644 index 0000000000..7f1db9fd94 --- /dev/null +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java @@ -0,0 +1,505 @@ +package org.broadinstitute.hellbender.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; + +import java.util.*; +import java.util.stream.Collectors; + +/** + * + * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, + * from BAMs, or from feature sources -- for consistency. The system supports two basic modes: get an enum state that + * describes at a high level the consistency between two dictionaries, or call validateDictionaries, which will + * blow up with a UserException if the dicts are too incompatible. + * + * Dictionaries are tested for contig name overlaps, consistency of ordering within the overlapping set, and length, + * if available. 
+ */ +public final class SequenceDictionaryUtils { + + private SequenceDictionaryUtils(){} + + /** + * Compares sequence records by their order + */ + private static final Comparator SEQUENCE_INDEX_ORDER = Comparator.comparing(SAMSequenceRecord::getSequenceIndex); + + // The following sets of contig records are used to perform the non-canonical human ordering check. + // This check ensures that the order is 1,2,3... instead of 1, 10, 11, 12...2, 20, 21... + + // hg18 + protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); + protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); + protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); + + // hg19 + protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); + protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); + protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); + + // b36 + protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719); + protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149); + protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737); + + // b37 + protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621); + protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373); + protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747); + + + public enum SequenceDictionaryCompatibility { + IDENTICAL, // the dictionaries are identical + COMMON_SUBSET, // there exists a common subset of equivalent contigs + SUPERSET, // the first dict's set of contigs supersets the second dict's set + NO_COMMON_CONTIGS, // no overlap between dictionaries + UNEQUAL_COMMON_CONTIGS, // common 
subset has contigs that have the same name but different lengths + NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example) + OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different + // orders with respect to each other + DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same + // order with respect to each other, but one or more of them have different + // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } + } + + /** + * Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate + * for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages. + * + * The standard validation settings used by this method are: + * + * -Require the dictionaries to share a common subset of equivalent contigs + * + * -Do not require dict1 to be a superset of dict2. + * + * -Do not perform checks related to contig ordering: don't throw if the common contigs are in + * different orders with respect to each other, occur at different absolute indices, or are + * lexicographically sorted human dictionaries. GATK uses contig names rather than contig + * indices, and so should not be sensitive to contig ordering issues. + * + * For comparing a CRAM dictionary against a reference dictionary, call + * {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead. 
+ * + * @param name1 name associated with dict1 + * @param dict1 the sequence dictionary dict1 + * @param name2 name associated with dict2 + * @param dict2 the sequence dictionary dict2 + */ + public static void validateDictionaries( final String name1, + final SAMSequenceDictionary dict1, + final String name2, + final SAMSequenceDictionary dict2) { + final boolean requireSuperset = false; + final boolean checkContigOrdering = false; + + validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering); + } + + /** + * Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate + * validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with + * detailed error messages. + * + * The standard validation settings used by this method are: + * + * -Require the reference dictionary to be a superset of the cram dictionary + * + * -Do not perform checks related to contig ordering: don't throw if the common contigs are in + * different orders with respect to each other, occur at different absolute indices, or are + * lexicographically sorted human dictionaries. GATK uses contig names rather than contig + * indices, and so should not be sensitive to contig ordering issues. + * + * @param referenceDictionary the sequence dictionary for the reference + * @param cramDictionary sequence dictionary from a CRAM file + */ + public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary, + final SAMSequenceDictionary cramDictionary ) { + // For CRAM, we require the reference dictionary to be a superset of the reads dictionary + final boolean requireSuperset = true; + final boolean checkContigOrdering = false; + + validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering); + } + + + /** + * Tests for compatibility between two sequence dictionaries. 
If the dictionaries are incompatible, then + * UserExceptions are thrown with detailed error messages. + * + * Two sequence dictionaries are compatible if they share a common subset of equivalent contigs, + * where equivalent contigs are defined as having the same name and length. + * + * @param name1 name associated with dict1 + * @param dict1 the sequence dictionary dict1 + * @param name2 name associated with dict2 + * @param dict2 the sequence dictionary dict2 + * @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset + * @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other + * and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries + */ + public static void validateDictionaries( final String name1, + final SAMSequenceDictionary dict1, + final String name2, + final SAMSequenceDictionary dict2, + final boolean requireSuperset, + final boolean checkContigOrdering ) { + Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary"); + Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence dictionary"); + + final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering); + + switch ( type ) { + case IDENTICAL: + return; + case SUPERSET: + return; + case COMMON_SUBSET: + if ( requireSuperset ) { + final Set contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet()); + final List missingContigs = dict2.getSequences().stream() + .map(SAMSequenceRecord::getSequenceName) + .filter(contig -> !contigs1.contains(contig)) + .collect(Collectors.toList()); + throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. 
Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2); + } + return; + case NO_COMMON_CONTIGS: + throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); + + case UNEQUAL_COMMON_CONTIGS: { + final List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); + final SAMSequenceRecord elt1 = x.get(0); + final SAMSequenceRecord elt2 = x.get(1); + throw new UserException.IncompatibleSequenceDictionaries( + String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", + name1, elt1.getSequenceName(), elt1.getSequenceLength(), + name2, elt2.getSequenceName(), elt2.getSequenceLength()), + name1, dict1, name2, dict2 + ); + } + + case NON_CANONICAL_HUMAN_ORDER: { + // We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. + final UserException ex; + if ( nonCanonicalHumanContigOrder(dict1) ) { + ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); + } + else { + ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); + } + + throw ex; + } + + case OUT_OF_ORDER: { + // We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. + throw new UserException.IncompatibleSequenceDictionaries( + "The relative ordering of the common contigs in " + name1 + " and " + name2 + + " is not the same; to fix this please see: " + + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), " + + " which describes reordering contigs in BAM and VCF files.", + name1, dict1, name2, dict2); + } + + case DIFFERENT_INDICES: { + // We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering, + // so we should always throw when we see it. 
+ final String msg = "One or more contigs common to both dictionaries have " + + "different indices (ie., absolute positions) in each dictionary. Code " + + "that is sensitive to contig ordering can fail when this is the case. " + + "You should fix the sequence dictionaries so that all shared contigs " + + "occur at the same absolute positions in both dictionaries."; + throw new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); + } + default: + throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type); + } + } + + /** + * Workhorse routine that takes two dictionaries and returns their compatibility. + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @param checkContigOrdering if true, perform checks related to contig ordering: forbid lexicographically-sorted + * dictionaries, and require common contigs to be in the same relative order and at the + * same absolute indices + * @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries + */ + public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final boolean checkContigOrdering ) { + if ( checkContigOrdering && (nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2)) ) { + return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; + } + + final Set commonContigs = getCommonContigsByName(dict1, dict2); + + if (commonContigs.isEmpty()) { + return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; + } + else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; + } + + final boolean commonContigsAreInSameRelativeOrder = commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2); + + if ( checkContigOrdering && ! 
commonContigsAreInSameRelativeOrder ) { + return SequenceDictionaryCompatibility.OUT_OF_ORDER; + } + else if ( commonContigsAreInSameRelativeOrder && commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) { + return SequenceDictionaryCompatibility.IDENTICAL; + } + else if ( checkContigOrdering && ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) { + return SequenceDictionaryCompatibility.DIFFERENT_INDICES; + } + else if ( supersets(dict1, dict2) ) { + return SequenceDictionaryCompatibility.SUPERSET; + } + else { + return SequenceDictionaryCompatibility.COMMON_SUBSET; + } + } + + + /** + * Utility function that tests whether dict1's set of contigs is a superset of dict2's + * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if dict1's set of contigs supersets dict2's + */ + private static boolean supersets( SAMSequenceDictionary dict1, SAMSequenceDictionary dict2 ) { + // Cannot rely on SAMSequenceRecord.equals() as it's too strict (takes extended attributes into account). + for ( final SAMSequenceRecord dict2Record : dict2.getSequences() ) { + final SAMSequenceRecord dict1Record = dict1.getSequence(dict2Record.getSequenceName()); + if ( dict1Record == null || ! sequenceRecordsAreEquivalent(dict2Record, dict1Record) ) { + return false; + } + } + + return true; + } + + + + /** + * Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means + * that the seq records have the same length, if both are non-zero. 
+ * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return true if all of the common contigs are equivalent + */ + private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; + } + + /** + * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. Returns + * null if all common contigs are equivalent + * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return + */ + private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + for ( String name : commonContigs ) { + SAMSequenceRecord elt1 = dict1.getSequence(name); + SAMSequenceRecord elt2 = dict2.getSequence(name); + if ( ! sequenceRecordsAreEquivalent(elt1, elt2) ) + return Arrays.asList(elt1,elt2); + } + + return null; + } + + /** + * Helper routine that returns whether two sequence records are equivalent, defined as having the same name and + * lengths. + * + * NOTE: we allow the lengths to differ if one or both are UNKNOWN_SEQUENCE_LENGTH + * + * @param first first sequence record to compare + * @param second second sequence record to compare + * @return true if first and second have the same names and lengths, otherwise false + */ + public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first, final SAMSequenceRecord second) { + if ( first == second ) { + return true; + } + if ( first == null || second == null ) { + return false; + } + final int length1 = first.getSequenceLength(); + final int length2 = second.getSequenceLength(); + + if (length1 != length2 && length1 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH && length2 != SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH){ + return false; + } + if (! 
first.getSequenceName().equals(second.getSequenceName())){ + return false; + } + return true; + } + + /** + * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18, hg19, b36, or b37) and (2) if it's + * lexicographically sorted. Works by matching the names and lengths of the static chr1, chr10, and chr2 records, and then if these + * are all matched, requiring that the order be chr1, chr2, chr10. + * + * @param dict the sequence dictionary to examine + * @return true if chr1, chr2 and chr10 were all recognized and do not occur in the order chr1, chr2, chr10; otherwise false + */ + private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; + for ( SAMSequenceRecord elt : dict.getSequences() ) { + if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19, CHR1_B36, CHR1_B37) ) chr1 = elt; + if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19, CHR2_B36, CHR2_B37) ) chr2 = elt; + if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19, CHR10_B36, CHR10_B37) ) chr10 = elt; + } + if ( chr1 != null && chr2 != null && chr10 != null) { + return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); + } + + return false; + } + + /** + * Trivial helper that returns true if elt has the same name and length as any of the given recs + * @param elt record to test + * @param recs the list of records to check for name and length equivalence + * @return true if elt has the same name and length as any of the recs + */ + private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord... recs) { + for (SAMSequenceRecord rec : recs) { + if (elt.getSequenceLength() == rec.getSequenceLength() && elt.getSequenceName().equals(rec.getSequenceName())) { + return true; + } + } + return false; + } + + /** + * Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to + * absolute index position. 
This is accomplished by getting the common contigs in both dictionaries, sorting + * these according to their indices, and then walking through the sorted list to ensure that each ordered contig + * is equivalent + * + * @param commonContigs names of the contigs common to both dictionaries + * @param dict1 first SAMSequenceDictionary + * @param dict2 second SAMSequenceDictionary + * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false + */ + private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + final List list1 = getSequencesOfName(commonContigs, dict1); + final List list2 = getSequencesOfName(commonContigs, dict2); + list1.sort(SEQUENCE_INDEX_ORDER); + list2.sort(SEQUENCE_INDEX_ORDER); + + for ( int i = 0; i < list1.size(); i++ ) { + SAMSequenceRecord elt1 = list1.get(i); + SAMSequenceRecord elt2 = list2.get(i); + if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) ) + return false; + } + + return true; + } + + /** + * Gets the subset of SAMSequenceRecords in commonContigs in dict + * + * @param commonContigs + * @param dict + * @return + */ + private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { + List l = new ArrayList<>(commonContigs.size()); + for ( String name : commonContigs ) { + l.add(dict.getSequence(name) ); + } + + return l; + } + + /** + * Checks whether the common contigs in the given sequence dictionaries occur at the same indices + * in both dictionaries + * + * @param commonContigs Set of names of the contigs that occur in both dictionaries + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries, + * otherwise false + */ + private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary 
dict1, final SAMSequenceDictionary dict2 ) { + for ( String commonContig : commonContigs ) { + SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); + SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); + + // Each common contig must have the same index in both dictionaries + if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { + return false; + } + } + + return true; + } + + /** + * Returns the set of contig names found in both dicts. + * @param dict1 + * @param dict2 + * @return + */ + public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + Set intersectingSequenceNames = getContigNames(dict1); + intersectingSequenceNames.retainAll(getContigNames(dict2)); + return intersectingSequenceNames; + } + + public static Set getContigNames(SAMSequenceDictionary dict) { + Set contigNames = new LinkedHashSet(Utils.optimumHashSize(dict.size())); + for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) + contigNames.add(dictionaryEntry.getSequenceName()); + return contigNames; + } + + public static List getContigNamesList(final SAMSequenceDictionary refSeqDict) { + Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); + return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()); + } + + /** + * Returns a compact String representation of the sequence dictionary it's passed + * + * The format of the returned String is: + * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... 
] + * + * @param dict a non-null SAMSequenceDictionary + * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed + */ + public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { + Utils.nonNull(dict, "Sequence dictionary must be non-null"); + + StringBuilder s = new StringBuilder("[ "); + + for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { + s.append(dictionaryEntry.getSequenceName()); + s.append("(length:"); + s.append(dictionaryEntry.getSequenceLength()); + s.append(") "); + } + + s.append("]"); + + return s.toString(); + } + +} diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java new file mode 100644 index 0000000000..37842f8a9a --- /dev/null +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java @@ -0,0 +1,357 @@ +package org.broadinstitute.hellbender.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.*; +import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*; + +public final class SequenceDictionaryUtilsUnitTest extends GATKBaseTest { + + private static Logger logger = LogManager.getLogger(SequenceDictionaryUtilsUnitTest.class); + + @DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" ) + public Object[][] testSequenceRecordsAreEquivalentDataProvider() { + final SAMSequenceRecord 
CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + return new Object[][]{ + {CHR1_HG19, CHR1_HG19, true}, + {CHR1_HG19, CHRM_HG19, false}, + {CHR1_HG19, CHR_NONSTANDARD1, false}, + {null, null, true}, + {CHR1_HG19, null, false}, + {null, CHR1_HG19, false}, + {CHR1_HG19, CHR1_HG19_WITH_UNKNOWN_LENGTH, true}, + {CHR1_HG19, CHR1_HG19_WITH_DIFFERENT_LENGTH, false}, + {CHR1_HG19_WITH_UNKNOWN_LENGTH, CHR1_HG19, true}, + {CHR1_HG19_WITH_DIFFERENT_LENGTH, CHR1_HG19, false}, + }; + } + + @Test(dataProvider = "testSequenceRecordsAreEquivalentDataProvider") + public void testSequenceRecordsAreEquivalent(final SAMSequenceRecord one, final SAMSequenceRecord two, final boolean expected){ + final boolean actual = SequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); + Assert.assertEquals(actual, expected); + } + + @DataProvider( name = "SequenceDictionaryDataProvider" ) + public Object[][] generateSequenceDictionaryTestData() { + final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308); + final SAMSequenceRecord CHR1_HG19_WITH_UNKNOWN_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + final SAMSequenceRecord CHR1_HG19_WITH_DIFFERENT_LENGTH = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), 123456); + + final SAMSequenceRecord CHR1_HG19_WITH_ATTRIBUTES = new SAMSequenceRecord(CHR1_HG19.getSequenceName(), CHR1_HG19.getSequenceLength()); + 
CHR1_HG19_WITH_ATTRIBUTES.setAttribute("M5", "0dec9660ec1efaaf33281c0d5ea2560f"); + CHR1_HG19_WITH_ATTRIBUTES.setAttribute("UR", "file:/foo/bar"); + + final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class; + final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + + final List hg19AllContigsIntervalSet = Arrays.asList( + new SimpleInterval("chrM", 1, 1), + new SimpleInterval("chr1", 1, 1), + new SimpleInterval("chr2", 1, 1), + new SimpleInterval("chr10", 1, 1)); + final List hg19PartialContigsIntervalSet = Arrays.asList( + new SimpleInterval("chrM", 1, 1), + new SimpleInterval("chr1", 1, 1)); + + return new Object[][] { + // Identical dictionaries: + {Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), 
Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + + // Dictionaries with a common subset: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, true}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + // If checkContigOrdering == false, ordering of the common contigs should not matter: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + + // Dictionaries with no common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + + // Dictionaries with unequal common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { 
Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, true}, + { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + + // One or both dictionaries in non-canonical human order: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, + { 
Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + // If checkContigOrdering == false, we should not get NON_CANONICAL_HUMAN_ORDER: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + + // Dictionaries with a common subset, but different relative ordering within that subset + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19, 
CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + // If checkContigOrdering == false, we should not get OUT_OF_ORDER: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + + // Dictionaries with a common subset in the same relative order, but with different indices. 
+ // This will only throw an exception during validation if checkContigOrdering is true + + // These have checkContigOrdering == true, so we expect DIFFERENT_INDICES and an exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + // Setting requireSuperset == true should make no difference here (we should still get DIFFERENT_INDICES and an exception): + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, 
false, true}, + + // Same test cases as above, but these have checkContigOrdering == false, so we expect SUPERSET or COMMON_SUBSET instead of DIFFERENT_INDICES, and no exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + + // tests for SUPERSET + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, 
true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + // Extended attributes should be ignored when determining whether a superset exists: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, null, false, false} + }; + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryValidation( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, //not needed by this test + final Class expectedExceptionUponValidation, + final boolean requireSuperset, + final boolean checkContigOrdering) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + Exception exceptionThrown = null; + try { + SequenceDictionaryUtils.validateDictionaries( + "firstDictionary", + firstDictionary, + "secondDictionary", + secondDictionary, + requireSuperset, + checkContigOrdering); + } + catch ( Exception e ) { 
+ exceptionThrown = e; + } + if ( expectedExceptionUponValidation != null ) { + Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown), + String.format("Expected exception %s but saw %s instead. %s", + expectedExceptionUponValidation.getSimpleName(), + exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(), + testDescription)); + } + else { + Assert.assertTrue(exceptionThrown == null, + String.format("Expected no exception but saw exception %s instead. %s", + exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none", + testDescription)); + } + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryComparison( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, + final Class expectedExceptionUponValidation, + final boolean requireSuperset, + final boolean checkContigOrdering) { + + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + + final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = + SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); + + Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, + String.format("Dictionary comparison should have returned %s but instead returned %s. 
%s", + dictionaryCompatibility, reportedCompatibility, testDescription)); + } + + @DataProvider(name = "StandardValidationIgnoresContigOrderData") + public Object[][] getStandardValidationIgnoresContigOrderData() { + return new Object[][] { + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19) }, + + }; + } + + @Test(dataProvider = "StandardValidationIgnoresContigOrderData") + public void testStandardValidationIgnoresContigOrder( final List firstDictionaryContigs, final List secondDictionaryContigs ) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + + // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) + // should ignore differences in ordering of common contigs, so we shouldn't get an exception here + SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); + } + + @DataProvider(name = "NonSupersetData") + public Object[][] getNonSupersetData() { + return new Object[][] { + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) } + }; + } + + @Test(dataProvider = "NonSupersetData") + public void testStandardValidationDoesNotRequireSuperset( final List firstDictionaryContigs, final List secondDictionaryContigs ) { + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + + // Standard 
validation (the overload of validateDictionaries() that doesn't take any boolean args) + // should not require a superset relationship, so we shouldn't get an exception here + SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); + } + + @Test(dataProvider = "NonSupersetData", expectedExceptions = UserException.IncompatibleSequenceDictionaries.class) + public void testCRAMValidationDoesRequireSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { + final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); + final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); + + // CRAM validation against the reference SHOULD require a superset relationship, so we should + // get an exception here + SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); + } + + @DataProvider(name = "SupersetData") + public Object[][] getSupersetData() { + return new Object[][] { + { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19)}, //exactly same + { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19) }, + { Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19) } + }; + } + + @Test(dataProvider = "SupersetData") + public void testCRAMValidationDoesAcceptSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { + final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); + final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); + + //In these inputs , cram contigs are subsets of ref contigs and so it should be accepted + SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); + } + + private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { + final List 
clonedContigs = new ArrayList(contigs.size()); + + // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects + // across multiple dictionaries in tests + for ( SAMSequenceRecord contig : contigs ) { + clonedContigs.add(contig.clone()); + } + + return new SAMSequenceDictionary(clonedContigs); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetContigNamesListExpectingException() { + getContigNamesList(null); + } + + @Test + public void testGetContigNamesList() { + + final SAMSequenceDictionary samSequenceDictionary = new SAMSequenceDictionary(Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37)); + + Assert.assertEquals(getContigNamesList(samSequenceDictionary), Arrays.asList("1", "2", "10")); + } +} \ No newline at end of file From 4d08d5d6a79b9dd75ffa4ec0e2d2c6b4ff753686 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 8 Nov 2021 17:09:17 -0500 Subject: [PATCH 02/12] VCFHeader and VCFHeaderLine refactoring to enable support for VCF4.3/BCF2.2 and bug fixes. 
--- src/main/java/htsjdk/samtools/Defaults.java | 6 + .../samtools/SAMSequenceDictionary.java | 15 + .../samtools/SAMSequenceDictionaryUtils.java | 181 +--- .../java/htsjdk/tribble/TribbleException.java | 6 + .../java/htsjdk/variant/bcf2/BCF2Utils.java | 29 +- .../variantcontext/writer/VCFWriter.java | 54 +- .../htsjdk/variant/vcf/AbstractVCFCodec.java | 548 +++++++---- .../java/htsjdk/variant/vcf/VCF3Codec.java | 69 +- .../htsjdk/variant/vcf/VCFAltHeaderLine.java | 40 +- .../java/htsjdk/variant/vcf/VCFCodec.java | 127 +-- .../variant/vcf/VCFCompoundHeaderLine.java | 580 ++++++------ .../java/htsjdk/variant/vcf/VCFConstants.java | 26 +- .../variant/vcf/VCFContigHeaderLine.java | 165 +++- .../variant/vcf/VCFFilterHeaderLine.java | 48 +- .../variant/vcf/VCFFormatHeaderLine.java | 61 +- .../java/htsjdk/variant/vcf/VCFHeader.java | 643 +++++++------ .../htsjdk/variant/vcf/VCFHeaderLine.java | 125 ++- .../variant/vcf/VCFHeaderLineCount.java | 69 ++ .../variant/vcf/VCFHeaderLineTranslator.java | 127 +-- .../htsjdk/variant/vcf/VCFHeaderLineType.java | 30 +- .../htsjdk/variant/vcf/VCFHeaderMerger.java | 286 ++++++ .../htsjdk/variant/vcf/VCFHeaderVersion.java | 43 +- .../htsjdk/variant/vcf/VCFInfoHeaderLine.java | 72 +- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 525 +++++++++++ .../htsjdk/variant/vcf/VCFMetaHeaderLine.java | 32 +- .../variant/vcf/VCFPedigreeHeaderLine.java | 42 +- .../htsjdk/variant/vcf/VCFRecordCodec.java | 3 +- .../variant/vcf/VCFSampleHeaderLine.java | 33 +- .../variant/vcf/VCFSimpleHeaderLine.java | 216 +++-- .../variant/vcf/VCFStandardHeaderLines.java | 50 +- .../java/htsjdk/variant/vcf/VCFUtils.java | 150 +-- .../variant/vcf/VCFValidationFailure.java | 63 ++ .../SAMSequenceDictionaryUtilsTest.java | 345 +++---- .../variant/bcf2/BCF2UtilsUnitTest.java | 26 +- .../variant/bcf2/BCF2WriterUnitTest.java | 1 + .../VariantContextTestProvider.java | 1 + .../AsyncVariantContextWriterUnitTest.java | 3 +- .../writer/VCFWriterUnitTest.java | 11 +- 
.../variant/vcf/AbstractVCFCodecTest.java | 69 +- .../variant/vcf/VCFAltHeaderLineUnitTest.java | 43 + .../variant/vcf/VCFCodec43FeaturesTest.java | 34 +- .../vcf/VCFCompoundHeaderLineUnitTest.java | 237 ++++- .../vcf/VCFContigHeaderLineUnitTest.java | 184 ++++ .../htsjdk/variant/vcf/VCFEncoderTest.java | 1 + .../vcf/VCFFormatHeaderLineUnitTest.java | 19 + .../vcf/VCFHeaderLineTranslatorUnitTest.java | 25 +- .../variant/vcf/VCFHeaderLineUnitTest.java | 123 ++- .../variant/vcf/VCFHeaderMergerUnitTest.java | 554 +++++++++++ .../htsjdk/variant/vcf/VCFHeaderUnitTest.java | 875 +++++++++++------- .../variant/vcf/VCFHeaderUnitTestData.java | 203 ++++ .../vcf/VCFInfoHeaderLineUnitTest.java | 86 ++ .../variant/vcf/VCFMetaDataLinesUnitTest.java | 354 +++++++ .../vcf/VCFMetaHeaderLineUnitTest.java | 44 + .../vcf/VCFPedigreeHeaderLineUnitTest.java | 50 + .../vcf/VCFSampleHeaderLineUnitTest.java | 43 + .../vcf/VCFSimpleHeaderLineUnitTest.java | 151 +++ .../vcf/VCFStandardHeaderLinesUnitTest.java | 9 +- .../java/htsjdk/variant/vcf/VCFUtilsTest.java | 55 +- .../resources/htsjdk/variant/HiSeq.10000.vcf | 1 - .../htsjdk/variant/VCF4HeaderTest.vcf | 1 - 60 files changed, 5905 insertions(+), 2107 deletions(-) create mode 100644 src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java create mode 100644 src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java create mode 100644 src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java create mode 100644 
src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java create mode 100644 src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java diff --git a/src/main/java/htsjdk/samtools/Defaults.java b/src/main/java/htsjdk/samtools/Defaults.java index 0cfff05d5d..9d4c64bdda 100644 --- a/src/main/java/htsjdk/samtools/Defaults.java +++ b/src/main/java/htsjdk/samtools/Defaults.java @@ -110,6 +110,11 @@ public class Defaults { */ public static final boolean DISABLE_SNAPPY_COMPRESSOR; + /** + * Strict VCF version validation. Default = true. + */ + public static final boolean STRICT_VCF_VERSION_VALIDATION; + public static final String SAMJDK_PREFIX = "samjdk."; static { @@ -134,6 +139,7 @@ public class Defaults { SAM_FLAG_FIELD_FORMAT = SamFlagField.valueOf(getStringProperty("sam_flag_field_format", SamFlagField.DECIMAL.name())); SRA_LIBRARIES_DOWNLOAD = getBooleanProperty("sra_libraries_download", false); DISABLE_SNAPPY_COMPRESSOR = getBooleanProperty(DISABLE_SNAPPY_PROPERTY_NAME, false); + STRICT_VCF_VERSION_VALIDATION = getBooleanProperty("strict_version_validation", true); } /** diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index cf40fe6532..1e6cb764e0 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -53,6 +53,13 @@ public SAMSequenceDictionary(final List list) { setSequences(list); } + //TODO: this returns sequences in the internal list order instead of + // honoring each sequence's contigIndex + /** + * Get a list of sequences for this dictionary. 
+ * @return the list of sequences for this dictionary in internal order (the order in which the sequences + * were added to this dictionary) + */ public List getSequences() { return Collections.unmodifiableList(mSequences); } @@ -75,6 +82,14 @@ public void setSequences(final List list) { list.forEach(this::addSequence); } + /** + * Add a sequence to the dictionary. + * @param sequenceRecord the sequence record to add - note that this method mutates the contig + * index of the sequenceRecord to match the newly added record's relative + * order in the list + */ + //TODO: this method ignores (and actually mutates) the sequenceRecord's contig index to make it match + // the record's relative placement in the dictionary's internal list public void addSequence(final SAMSequenceRecord sequenceRecord) { if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) { throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java index 7f1db9fd94..0d5073a0ba 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java @@ -1,16 +1,13 @@ -package org.broadinstitute.hellbender.utils; +package htsjdk.samtools; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.exceptions.UserException; +import htsjdk.utils.ValidationUtils; import java.util.*; import java.util.stream.Collectors; /** * - * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, + * A series of utility functions that enable comparison of two sequence dictionaries -- from the reference, * from BAMs, or from feature sources -- for consistency. 
The system supports two basic modes: get an enum state that * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will * blow up with a UserException if the dicts are too incompatible. @@ -18,9 +15,9 @@ * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length, * if available. */ -public final class SequenceDictionaryUtils { +public final class SAMSequenceDictionaryUtils { - private SequenceDictionaryUtils(){} + private SAMSequenceDictionaryUtils(){} /** * Compares sequence records by their order @@ -59,166 +56,10 @@ public enum SequenceDictionaryCompatibility { UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for example) OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different - // orders with respect to each other + // orders with respect to each other DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same - // order with respect to each other, but one or more of them have different - // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } - } - - /** - * Tests for compatibility between two sequence dictionaries, using standard validation settings appropriate - * for the GATK. If the dictionaries are incompatible, then UserExceptions are thrown with detailed error messages. - * - * The standard validation settings used by this method are: - * - * -Require the dictionaries to share a common subset of equivalent contigs - * - * -Do not require dict1 to be a superset of dict2. 
- * - * -Do not perform checks related to contig ordering: don't throw if the common contigs are in - * different orders with respect to each other, occur at different absolute indices, or are - * lexicographically sorted human dictionaries. GATK uses contig names rather than contig - * indices, and so should not be sensitive to contig ordering issues. - * - * For comparing a CRAM dictionary against a reference dictionary, call - * {@link #validateCRAMDictionaryAgainstReference(SAMSequenceDictionary, SAMSequenceDictionary)} instead. - * - * @param name1 name associated with dict1 - * @param dict1 the sequence dictionary dict1 - * @param name2 name associated with dict2 - * @param dict2 the sequence dictionary dict2 - */ - public static void validateDictionaries( final String name1, - final SAMSequenceDictionary dict1, - final String name2, - final SAMSequenceDictionary dict2) { - final boolean requireSuperset = false; - final boolean checkContigOrdering = false; - - validateDictionaries(name1, dict1, name2, dict2, requireSuperset, checkContigOrdering); - } - - /** - * Tests for compatibility between a reference dictionary and a CRAM dictionary, using appropriate - * validation settings. If the dictionaries are incompatible, then UserExceptions are thrown with - * detailed error messages. - * - * The standard validation settings used by this method are: - * - * -Require the reference dictionary to be a superset of the cram dictionary - * - * -Do not perform checks related to contig ordering: don't throw if the common contigs are in - * different orders with respect to each other, occur at different absolute indices, or are - * lexicographically sorted human dictionaries. GATK uses contig names rather than contig - * indices, and so should not be sensitive to contig ordering issues. 
- * - * @param referenceDictionary the sequence dictionary for the reference - * @param cramDictionary sequence dictionary from a CRAM file - */ - public static void validateCRAMDictionaryAgainstReference( final SAMSequenceDictionary referenceDictionary, - final SAMSequenceDictionary cramDictionary ) { - // For CRAM, we require the reference dictionary to be a superset of the reads dictionary - final boolean requireSuperset = true; - final boolean checkContigOrdering = false; - - validateDictionaries("reference", referenceDictionary, "reads", cramDictionary, requireSuperset, checkContigOrdering); - } - - - /** - * Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then - * UserExceptions are thrown with detailed error messages. - * - * Two sequence dictionaries are compatible if they share a common subset of equivalent contigs, - * where equivalent contigs are defined as having the same name and length. - * - * @param name1 name associated with dict1 - * @param dict1 the sequence dictionary dict1 - * @param name2 name associated with dict2 - * @param dict2 the sequence dictionary dict2 - * @param requireSuperset if true, require that dict1 be a superset of dict2, rather than dict1 and dict2 sharing a common subset - * @param checkContigOrdering if true, require common contigs to be in the same relative order with respect to each other - * and occur at the same absolute indices, and forbid lexicographically-sorted human dictionaries - */ - public static void validateDictionaries( final String name1, - final SAMSequenceDictionary dict1, - final String name2, - final SAMSequenceDictionary dict2, - final boolean requireSuperset, - final boolean checkContigOrdering ) { - Utils.nonNull(dict1, "Something went wrong with sequence dictionary detection, check that "+name1+" has a valid sequence dictionary"); - Utils.nonNull(dict2, "Something went wrong with sequence dictionary detection, check that "+name2+" has a valid sequence 
dictionary"); - - final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2, checkContigOrdering); - - switch ( type ) { - case IDENTICAL: - return; - case SUPERSET: - return; - case COMMON_SUBSET: - if ( requireSuperset ) { - final Set contigs1 = dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toSet()); - final List missingContigs = dict2.getSequences().stream() - .map(SAMSequenceRecord::getSequenceName) - .filter(contig -> !contigs1.contains(contig)) - .collect(Collectors.toList()); - throw new UserException.IncompatibleSequenceDictionaries(String.format("Dictionary %s is missing contigs found in dictionary %s. Missing contigs: \n %s \n", name1, name2, String.join(", ", missingContigs)), name1, dict1, name2, dict2); - } - return; - case NO_COMMON_CONTIGS: - throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); - - case UNEQUAL_COMMON_CONTIGS: { - final List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); - final SAMSequenceRecord elt1 = x.get(0); - final SAMSequenceRecord elt2 = x.get(1); - throw new UserException.IncompatibleSequenceDictionaries( - String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", - name1, elt1.getSequenceName(), elt1.getSequenceLength(), - name2, elt2.getSequenceName(), elt2.getSequenceLength()), - name1, dict1, name2, dict2 - ); - } - - case NON_CANONICAL_HUMAN_ORDER: { - // We only get NON_CANONICAL_HUMAN_ORDER if the caller explicitly requested that we check contig ordering, - // so we should always throw when we see it. 
- final UserException ex; - if ( nonCanonicalHumanContigOrder(dict1) ) { - ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); - } - else { - ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); - } - - throw ex; - } - - case OUT_OF_ORDER: { - // We only get OUT_OF_ORDER if the caller explicitly requested that we check contig ordering, - // so we should always throw when we see it. - throw new UserException.IncompatibleSequenceDictionaries( - "The relative ordering of the common contigs in " + name1 + " and " + name2 + - " is not the same; to fix this please see: " - + "(https://www.broadinstitute.org/gatk/guide/article?id=1328), " - + " which describes reordering contigs in BAM and VCF files.", - name1, dict1, name2, dict2); - } - - case DIFFERENT_INDICES: { - // We only get DIFFERENT_INDICES if the caller explicitly requested that we check contig ordering, - // so we should always throw when we see it. - final String msg = "One or more contigs common to both dictionaries have " + - "different indices (ie., absolute positions) in each dictionary. Code " + - "that is sensitive to contig ordering can fail when this is the case. " + - "You should fix the sequence dictionaries so that all shared contigs " + - "occur at the same absolute positions in both dictionaries."; - throw new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); - } - default: - throw new GATKException("Unexpected SequenceDictionaryComparison type: " + type); - } + // order with respect to each other, but one or more of them have different + // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. 
{ chr1, chr2 } } /** @@ -465,14 +306,14 @@ public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SA } public static Set getContigNames(SAMSequenceDictionary dict) { - Set contigNames = new LinkedHashSet(Utils.optimumHashSize(dict.size())); + Set contigNames = new LinkedHashSet(dict.size()); for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) contigNames.add(dictionaryEntry.getSequenceName()); return contigNames; } public static List getContigNamesList(final SAMSequenceDictionary refSeqDict) { - Utils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); + ValidationUtils.nonNull(refSeqDict, "provided reference sequence ditionary is null"); return refSeqDict.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()); } @@ -486,7 +327,7 @@ public static List getContigNamesList(final SAMSequenceDictionary refSeq * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed */ public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { - Utils.nonNull(dict, "Sequence dictionary must be non-null"); + ValidationUtils.nonNull(dict, "Sequence dictionary must be non-null"); StringBuilder s = new StringBuilder("[ "); diff --git a/src/main/java/htsjdk/tribble/TribbleException.java b/src/main/java/htsjdk/tribble/TribbleException.java index abcbc25ca0..4e2651640b 100644 --- a/src/main/java/htsjdk/tribble/TribbleException.java +++ b/src/main/java/htsjdk/tribble/TribbleException.java @@ -86,6 +86,12 @@ public static class InternalCodecException extends TribbleException { public InternalCodecException(String message) { super (message); } } + public static class VersionValidationFailure extends TribbleException { + public VersionValidationFailure(final String message) { + super(String.format("Version validation failure: %s", message)); + } + } + // ////////////////////////////////////////////////////////////////////// // Index exceptions // 
////////////////////////////////////////////////////////////////////// diff --git a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java index 39478bf069..545ede7497 100644 --- a/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java +++ b/src/main/java/htsjdk/variant/bcf2/BCF2Utils.java @@ -27,7 +27,11 @@ import htsjdk.samtools.util.FileExtensions; import htsjdk.tribble.TribbleException; -import htsjdk.variant.vcf.*; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFIDHeaderLine; +import htsjdk.variant.vcf.VCFSimpleHeaderLine; import java.io.File; import java.io.FileNotFoundException; @@ -93,10 +97,15 @@ public static ArrayList makeDictionary(final VCFHeader header) { // set up the strings dictionary for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { if ( line.shouldBeAddedToDictionary() ) { - final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; - if ( ! seen.contains(idLine.getID())) { - dict.add(idLine.getID()); - seen.add(idLine.getID()); + if (!line.isIDHeaderLine()) { + //is there a better way to ensure that shouldBeAddedToDictionary==true only when isIDHeaderLine==true + throw new TribbleException(String.format( + "The header line %s cannot be added to the BCF dictionary since its not an ID header line", + line)); + } + if ( ! seen.contains(line.getID())) { + dict.add(line.getID()); + seen.add(line.getID()); } } } @@ -291,7 +300,7 @@ else if ( o.getClass().isArray() ) { * Are the elements and their order in the output and input headers consistent so that * we can write out the raw genotypes block without decoding and recoding it? 
* - * If the order of INFO, FILTER, or contrig elements in the output header is different than + * If the order of INFO, FILTER, or contig elements in the output header is different than * in the input header we must decode the blocks using the input header and then recode them * based on the new output order. * @@ -308,15 +317,15 @@ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHe if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) return false; - final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); - final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); + final Iterator outputLinesIt = outputHeader.getIDHeaderLines().iterator(); + final Iterator inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); while ( inputLinesIt.hasNext() ) { if ( ! outputLinesIt.hasNext() ) // missing lines in output return false; - final VCFIDHeaderLine outputLine = outputLinesIt.next(); - final VCFIDHeaderLine inputLine = inputLinesIt.next(); + final VCFSimpleHeaderLine outputLine = outputLinesIt.next(); + final VCFSimpleHeaderLine inputLine = inputLinesIt.next(); if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! 
inputLine.getID().equals(outputLine.getID()) ) return false; diff --git a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java index 21f1453fbb..1b6edae1d8 100644 --- a/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java +++ b/src/main/java/htsjdk/variant/variantcontext/writer/VCFWriter.java @@ -27,8 +27,11 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.IndexCreator; +import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFConstants; @@ -36,6 +39,7 @@ import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.VCFUtils; import java.io.BufferedWriter; import java.io.ByteArrayOutputStream; @@ -45,14 +49,15 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.file.Path; +import java.util.stream.Collectors; /** * this class writes VCF files */ class VCFWriter extends IndexingVariantContextWriter { + protected final static Log logger = Log.getInstance(VCFWriter.class); - private static final String VERSION_LINE = - VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_2.getFormatString() + "=" + VCFHeaderVersion.VCF4_2.getVersionString(); + private static final String DEFAULT_VERSION_LINE = VCFHeader.DEFAULT_VCF_VERSION.toHeaderVersionLine(); // Initialized when the header is written to the output stream private VCFEncoder vcfEncoder = null; @@ -164,7 +169,7 @@ public void writeHeader(final VCFHeader header) { } public static String getVersionLine() { - return VERSION_LINE; + return DEFAULT_VERSION_LINE; } public static VCFHeader writeHeader(VCFHeader header, 
@@ -175,12 +180,18 @@ public static VCFHeader writeHeader(VCFHeader header, try { rejectVCFV43Headers(header); - // the file format field needs to be written first + // Validate that the file version we're writing is version-compatible this header's version. + validateHeaderVersion(header, versionLine); + + // The file format field needs to be written first; below any file format lines + // embedded in the header will be removed writer.write(versionLine + "\n"); for (final VCFHeaderLine line : header.getMetaDataInSortedOrder() ) { - if ( VCFHeaderVersion.isFormatString(line.getKey()) ) + // Remove the fileformat header lines + if ( VCFHeaderVersion.isFormatString(line.getKey()) ) { continue; + } writer.write(VCFHeader.METADATA_INDICATOR); writer.write(line.toString()); @@ -189,14 +200,9 @@ public static VCFHeader writeHeader(VCFHeader header, // write out the column line writer.write(VCFHeader.HEADER_INDICATOR); - boolean isFirst = true; - for (final VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { - if ( isFirst ) - isFirst = false; // don't write out a field separator - else - writer.write(VCFConstants.FIELD_SEPARATOR); - writer.write(field.toString()); - } + writer.write(header.getHeaderFields().stream() + .map(f -> f.name()) + .collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR)).toString()); if ( header.hasGenotypingData() ) { writer.write(VCFConstants.FIELD_SEPARATOR); @@ -274,6 +280,28 @@ private static void rejectVCFV43Headers(final VCFHeader targetHeader) { if (targetHeader.getVCFHeaderVersion() != null && targetHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { throw new IllegalArgumentException(String.format("Writing VCF version %s is not implemented", targetHeader.getVCFHeaderVersion())); } + } + // Given a header and a requested target output version, see if the header's version is compatible with the + // requested version (where compatible means its ok to just declare that the header has the requested + // 
version). + private static void validateHeaderVersion(final VCFHeader header, final String requestedVersionLine) { + ValidationUtils.nonNull(header); + ValidationUtils.nonNull(requestedVersionLine); + + final VCFHeaderVersion vcfCurrentVersion = header.getVCFHeaderVersion(); // may be null: rejectVCFV43Headers also tolerates a header with no version + final VCFHeaderVersion vcfRequestedVersion = VCFHeaderVersion.fromHeaderVersionLine(requestedVersionLine); + if (vcfCurrentVersion != null && !vcfCurrentVersion.equals(vcfRequestedVersion)) { + if (!VCFHeaderVersion.versionsAreCompatible(vcfRequestedVersion, vcfCurrentVersion)) { + final String message = String.format("Attempting to write a %s VCF header to a %s VCFWriter", + vcfCurrentVersion.getVersionString(), // first placeholder: version of the header being written + vcfRequestedVersion.getVersionString()); // second placeholder: version line the writer emits + if (VCFUtils.isStrictVCFVersionValidation()) { + throw new TribbleException(message); + } + logger.warn(message); + } + } } + } diff --git a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index bfa718453e..1a1267e5c8 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -26,12 +26,14 @@ package htsjdk.variant.vcf; import htsjdk.samtools.util.BlockCompressedInputStream; +import htsjdk.samtools.util.Log; import htsjdk.samtools.util.IOUtil; import htsjdk.tribble.AsciiFeatureCodec; import htsjdk.tribble.Feature; import htsjdk.tribble.NameAwareCodec; import htsjdk.tribble.TribbleException; import htsjdk.tribble.index.tabix.TabixFormat; +import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.util.ParsingUtils; import htsjdk.utils.ValidationUtils; import htsjdk.variant.utils.GeneralUtils; @@ -46,6 +48,8 @@ import java.util.zip.GZIPInputStream; public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { + protected final static Log logger = Log.getInstance(AbstractVCFCodec.class); + public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); protected
final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th @@ -60,26 +64,22 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec private VCFTextTransformer vcfTextTransformer = passThruTextTransformer; // a mapping of the allele - protected Map> alleleMap = new HashMap>(3); - - // for performance testing purposes - public static boolean validate = true; + protected final Map> alleleMap = new HashMap<>(3); // a key optimization -- we need a per thread string parts array, so we don't allocate a big array over and over // todo: make this thread safe? protected String[] parts = null; protected String[] genotypeParts = null; - protected final String[] locParts = new String[6]; // for performance we cache the hashmap of filter encodings for quick lookup - protected HashMap> filterHash = new HashMap>(); + protected final HashMap> filterHash = new HashMap<>(); // we store a name to give to each of the variant contexts we emit protected String name = "Unknown"; protected int lineNo = 0; - protected Map stringCache = new HashMap(); + protected final Map stringCache = new HashMap<>(); protected boolean warnedAboutNoEqualsForNonFlag = false; @@ -117,17 +117,72 @@ class LazyVCFGenotypesParser implements LazyGenotypesContext.LazyParser { @Override public LazyGenotypesContext.LazyData parse(final Object data) { - //System.out.printf("Loading genotypes... 
%s:%d%n", contig, start); return createGenotypeMap((String) data, alleles, contig, start); } } /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * @param filterString the string to parse - * @return a set of the filters applied + * Return true if this codec can decode files with the target version + * @param targetVersion the target version to consider + * @return true if this codec can handle targetVersion + */ + public abstract boolean canDecodeVersion(final VCFHeaderVersion targetVersion); + + /** + * Reads all of the header from the provided iterator, but reads no further. + * @param lineIterator the line reader to take header lines from + * @return The parsed header */ - protected abstract List parseFilters(String filterString); + @Override + public Object readActualHeader(final LineIterator lineIterator) { + final List headerStrings = new ArrayList<>(); + + // Extract one line and retrieve the file format and version, which must be the first line, + // and then add it back into the headerLines. 
+ final VCFHeaderVersion fileFormatVersion = readFormatVersionLine(lineIterator); + headerStrings.add(fileFormatVersion.toHeaderVersionLine()); + + // collect metadata lines until we hit the required header line, or a non-metadata line, + // in which case throw since there was no header line + while (lineIterator.hasNext()) { + final String line = lineIterator.peek(); + if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { + lineNo++; + headerStrings.add(lineIterator.next()); + } else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { + lineNo++; + headerStrings.add(lineIterator.next()); + this.header = parseHeaderFromLines(headerStrings, fileFormatVersion); + return this.header; + } else { break; } // neither a metadata line nor the column header line: peek() never advances, so leave the loop and fail below + } + throw new TribbleException.InvalidHeader( + "The required header line (starting with one #) is missing in the input VCF file"); + } + + /** + * Read ahead one line to obtain and return the vcf header version for this file + * + * @param headerLineIterator iterator over the lines of the file; its first line is consumed + * @return VCFHeaderVersion for this file + * @throws TribbleException if no file format header line is found in the first line, or the version can't + * be handled by this codec + */ + protected VCFHeaderVersion readFormatVersionLine(final LineIterator headerLineIterator) { + if (headerLineIterator.hasNext()) { + final String headerVersionLine = headerLineIterator.next(); + if (headerVersionLine.startsWith(VCFHeader.METADATA_INDICATOR)) { + final VCFHeaderVersion vcfFileVersion = VCFHeaderVersion.fromHeaderVersionLine(headerVersionLine); + if (!canDecodeVersion(vcfFileVersion)) { + throw new TribbleException.InvalidHeader( + String.format("The \"(%s)\" codec does not support VCF version: %s", getName(), vcfFileVersion)); + } else { + return vcfFileVersion; + } + } + } + throw new TribbleException.InvalidHeader("The VCF version header line is missing"); + } /** * create a VCF header from a set of header record lines @@ -135,180 +190,306 @@ public LazyGenotypesContext.LazyData parse(final Object data) { * @param headerStrings a
list of strings that represent all the ## and # entries * @return a VCFHeader object */ - protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion version ) { - this.version = version; + protected VCFHeader parseHeaderFromLines( final List headerStrings, final VCFHeaderVersion sourceVersion ) { + this.version = sourceVersion; - Set metaData = new LinkedHashSet(); - Set sampleNames = new LinkedHashSet(); + final Set metaData = new LinkedHashSet<>(); + Set sampleNames = new LinkedHashSet<>(); int contigCounter = 0; - // iterate over all the passed in strings - for ( String str : headerStrings ) { - if ( !str.startsWith(VCFHeader.METADATA_INDICATOR) ) { - String[] strings = str.substring(1).split(VCFConstants.FIELD_SEPARATOR); - if ( strings.length < VCFHeader.HEADER_FIELDS.values().length ) - throw new TribbleException.InvalidHeader("there are not enough columns present in the header line: " + str); - - int arrayIndex = 0; - for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { - try { - if (field != VCFHeader.HEADER_FIELDS.valueOf(strings[arrayIndex])) - throw new TribbleException.InvalidHeader("we were expecting column name '" + field + "' but we saw '" + strings[arrayIndex] + "'"); - } catch (IllegalArgumentException e) { - throw new TribbleException.InvalidHeader("unknown column name '" + strings[arrayIndex] + "'; it does not match a legal column header name."); - } - arrayIndex++; - } - - boolean sawFormatTag = false; - if ( arrayIndex < strings.length ) { - if ( !strings[arrayIndex].equals("FORMAT") ) - throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'"); - sawFormatTag = true; - arrayIndex++; - } - - while ( arrayIndex < strings.length ) - sampleNames.add(strings[arrayIndex++]); - - if ( sawFormatTag && sampleNames.isEmpty()) - throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); 
- - // If we're performing sample name remapping and there is exactly one sample specified in the header, replace - // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested - // for this file. - if ( remappedSampleName != null ) { - // We currently only support on-the-fly sample name remapping for single-sample VCFs - if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { - throw new TribbleException(String.format("Cannot remap sample name to %s because %s samples are specified in the VCF header, and on-the-fly sample name remapping is only supported for single-sample VCFs", - remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); - } - - sampleNames.clear(); - sampleNames.add(remappedSampleName); - } + for ( String headerLine : headerStrings ) { + if ( !headerLine.startsWith(VCFHeader.METADATA_INDICATOR) ) { + sampleNames = parsePrimaryHeaderLine(headerLine); } else { - if ( str.startsWith(VCFConstants.INFO_HEADER_START) ) { - final VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7), version); - metaData.add(info); - } else if ( str.startsWith(VCFConstants.FILTER_HEADER_START) ) { - final VCFFilterHeaderLine filter = new VCFFilterHeaderLine(str.substring(9), version); - metaData.add(filter); - } else if ( str.startsWith(VCFConstants.FORMAT_HEADER_START) ) { - final VCFFormatHeaderLine format = new VCFFormatHeaderLine(str.substring(9), version); - metaData.add(format); - } else if ( str.startsWith(VCFConstants.CONTIG_HEADER_START) ) { - final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); - metaData.add(contig); - } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { - metaData.add(getAltHeaderLine(str.substring(VCFConstants.ALT_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.PEDIGREE_HEADER_START) && version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // 
only model pedigree header lines as structured header lines starting with v4.3 - metaData.add(getPedigreeHeaderLine(str.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.META_HEADER_START) ) { - metaData.add(getMetaHeaderLine(str.substring(VCFConstants.META_HEADER_OFFSET), version)); - } else if ( str.startsWith(VCFConstants.SAMPLE_HEADER_START) ) { - metaData.add(getSampleHeaderLine(str.substring(VCFConstants.SAMPLE_HEADER_OFFSET), version)); + if ( headerLine.startsWith(VCFConstants.INFO_HEADER_START) ) { + metaData.add(getInfoHeaderLine(headerLine.substring(VCFConstants.INFO_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.FILTER_HEADER_START) ) { + metaData.add(getFilterHeaderLine(headerLine.substring(VCFConstants.FILTER_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.FORMAT_HEADER_START) ) { + metaData.add(getFormatHeaderLine(headerLine.substring(VCFConstants.FORMAT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.CONTIG_HEADER_START) ) { + metaData.add(getContigHeaderLine(headerLine.substring(VCFConstants.CONTIG_HEADER_OFFSET), sourceVersion, contigCounter++)); + } else if ( headerLine.startsWith(VCFConstants.ALT_HEADER_START) ) { + metaData.add(getAltHeaderLine(headerLine.substring(VCFConstants.ALT_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.PEDIGREE_HEADER_START) ) { + metaData.add(getPedigreeHeaderLine(headerLine.substring(VCFConstants.PEDIGREE_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.META_HEADER_START) ) { + metaData.add(getMetaHeaderLine(headerLine.substring(VCFConstants.META_HEADER_OFFSET), sourceVersion)); + } else if ( headerLine.startsWith(VCFConstants.SAMPLE_HEADER_START) ) { + metaData.add(getSampleHeaderLine(headerLine.substring(VCFConstants.SAMPLE_HEADER_OFFSET), sourceVersion)); } else { - int equals = str.indexOf('='); 
- if ( equals != -1 ) - metaData.add(new VCFHeaderLine(str.substring(2, equals), str.substring(equals+1))); + final VCFHeaderLine otherHeaderLine = getOtherHeaderLine( + headerLine.substring(VCFHeader.METADATA_INDICATOR.length()), + sourceVersion); + if (otherHeaderLine != null) + metaData.add(otherHeaderLine); } } } - - setVCFHeader(new VCFHeader(version, metaData, sampleNames), version); - return this.header; + // return the header that is returned by setVCFHeader, since it may be different than the + // one we create here since setVCFHeader calls + // {@link VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}, which can create an + // entirely new "repaired" header. + final VCFHeader vcfHeader = new VCFHeader(metaData, sampleNames); + return setVCFHeader(vcfHeader); } /** - * @return the header that was either explicitly set on this codec, or read from the file. May be null. - * The returned value should not be modified. + * Create and return a VCFInfoHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFInfoHeaderLine object */ - public VCFHeader getHeader() { - return header; + protected VCFInfoHeaderLine getInfoHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFInfoHeaderLine(headerLineString, sourceVersion); } /** - * @return the version number that was either explicitly set on this codec, or read from the file. May be null. 
+ * Create and return a VCFFormatHeader object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. + * @return a VCFFormatHeaderLine object */ - public VCFHeaderVersion getVersion() { - return version; + protected VCFFormatHeaderLine getFormatHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFFormatHeaderLine(headerLineString, sourceVersion); } /** - * Explicitly set the VCFHeader on this codec. This will overwrite the header read from the file - * and the version state stored in this instance; conversely, reading the header from a file will - * overwrite whatever is set here. - * - * @param newHeader - * @param newVersion - * @return the actual header for this codec. The returned header may not be identical to the header - * argument since the header lines may be "repaired" (i.e., rewritten) if doOnTheFlyModifications is set. - * @throws TribbleException if the requested header version is not compatible with the existing version + * Create and return a VCFFilterHeaderLine object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be validate for this header version. 
+ * @return a VCFFilterHeaderLine object */ - public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { - validateHeaderVersionTransition(newHeader, newVersion); - if (this.doOnTheFlyModifications) { - final VCFHeader repairedHeader = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); - // validate the new header after repair to ensure the resulting header version is - // still compatible with the current version - validateHeaderVersionTransition(repairedHeader, newVersion); - this.header = repairedHeader; - } else { - this.header = newHeader; - } - - this.version = newVersion; - this.vcfTextTransformer = getTextTransformerForVCFVersion(newVersion); + protected VCFFilterHeaderLine getFilterHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + return new VCFFilterHeaderLine(headerLineString, sourceVersion); + } - return this.header; + /** + * Create and return a VCFContigHeaderLine object from a header line string that conforms to the {@code sourceVersion} + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header + * line object should be valid for this header version. + * @return a VCFContigHeaderLine object + */ + protected VCFContigHeaderLine getContigHeaderLine( + final String headerLineString, + final VCFHeaderVersion sourceVersion, + final int contigIndex) { + return new VCFContigHeaderLine(headerLineString, sourceVersion, contigIndex); } /** * Create and return a VCFAltHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##ALT=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. 
The resulting header * line object should be validate for this header version. * @return a VCFAltHeaderLine object */ - public VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFAltHeaderLine getAltHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFAltHeaderLine(headerLineString, sourceVersion); } /** * Create and return a VCFPedigreeHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##PEDIGREE=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFPedigreeHeaderLine object + * + * NOTE:this can't return a VCFPedigreeHeaderLine since for pre-v4.3 PEDIGREE lines must be modeled as + * VCFHeaderLine due to the lack of a requirement for an ID field */ - public VCFPedigreeHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { - return new VCFPedigreeHeaderLine(headerLineString, sourceVersion); + protected VCFHeaderLine getPedigreeHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + if (sourceVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + return new VCFPedigreeHeaderLine(headerLineString, sourceVersion); + } else { + return new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, headerLineString); + } } /** * Create and return a VCFMetaHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##META=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which 
the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFMetaHeaderLine object */ - public VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFMetaHeaderLine getMetaHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFMetaHeaderLine(headerLineString, sourceVersion); } /** * Create and return a VCFSampleHeaderLine object from a header line string that conforms to the {@code sourceVersion} - * @param headerLineString VCF header line being parsed without the leading "##SAMPLE=" + * @param headerLineString VCF header line being parsed without the leading "##" * @param sourceVersion the VCF header version derived from which the source was retrieved. The resulting header * line object should be validate for this header version. * @return a VCFSampleHeaderLine object */ - public VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + protected VCFSampleHeaderLine getSampleHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { return new VCFSampleHeaderLine(headerLineString, sourceVersion); } + /** + * Create and return a header line that is not modeled by a specific VCFHeaderLine subclass, ie., its not + * a info/format/contig/alt/pedigree/meta/sample VCFHeaderLine. This may return either a VCFSimpleHeaderLine + * or a VCFHeaderLine. 
+ * + * @param headerLineString VCF header line being parsed without the leading "##" + * @param sourceVersion VCFHeaderVersion being parsed + * @return a VCFHeaderLine + */ + protected VCFHeaderLine getOtherHeaderLine(final String headerLineString, final VCFHeaderVersion sourceVersion) { + final int indexOfEquals = headerLineString.indexOf('='); + if (indexOfEquals < 1) { // must at least have "?=" + if (VCFUtils.isStrictVCFVersionValidation()) { + throw new TribbleException.InvalidHeader("Unrecognized metadata line type: " + headerLineString); + } + logger.warn("Dropping unrecognized VCFHeader metadata line type: " + headerLineString); + return null; + } + final String headerLineValue = headerLineString.substring(indexOfEquals + 1).trim(); + if (headerLineValue.startsWith("<") && headerLineValue.endsWith(">")) { + if (sourceVersion.isAtLeastAsRecentAs((VCFHeaderVersion.VCF4_3)) || headerLineString.contains(""), + // but which do not contain an ID attribute, i.e., GATK Funcotator uses v4.1 ClinVar test + // files with lines like that look like this: + // + // "ID=" + // + // where the key is "ID", and no ID attribute is present + return new VCFHeaderLine(headerLineString.substring(0, indexOfEquals), headerLineString.substring(indexOfEquals + 1)); + } + } else { + return new VCFHeaderLine(headerLineString.substring(0, indexOfEquals), headerLineString.substring(indexOfEquals + 1)); + } + } + + // Parse the primary header line of the form: + // + // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ... + // + // The string passed in is the first non-metadata line we've seen, so it should conform. 
+ // + private Set parsePrimaryHeaderLine(final String headerLine) { + final Set sampleNames = new LinkedHashSet<>(); + + final String[] columns = headerLine.substring(1).split(VCFConstants.FIELD_SEPARATOR); // substring(1) drops the leading '#' + if ( columns.length < VCFHeader.HEADER_FIELDS.values().length ) { + throw new TribbleException.InvalidHeader("not enough columns present in header line: " + headerLine); + } + + int col = 0; + for (VCFHeader.HEADER_FIELDS field : VCFHeader.HEADER_FIELDS.values()) { + try { + if (field != VCFHeader.HEADER_FIELDS.valueOf(columns[col])) { + throw new TribbleException.InvalidHeader("expected column name '" + field + "' but saw '" + columns[col] + "'"); + } + } catch (IllegalArgumentException e) { + throw new TribbleException.InvalidHeader("column name '" + columns[col] + "' is not a legal column header name."); + } + col++; + } + + boolean sawFormatTag = false; + if ( col < columns.length ) { + if ( !columns[col].equals("FORMAT") ) + throw new TribbleException.InvalidHeader("expected column name 'FORMAT' but saw '" + columns[col] + "'"); + sawFormatTag = true; + col++; + } + + while ( col < columns.length ) { + sampleNames.add(columns[col++]); + } + + if ( sawFormatTag && sampleNames.isEmpty()) + throw new TribbleException.InvalidHeader("The FORMAT field was provided but there is no genotype/sample data"); + + // If we're performing sample name remapping and there is exactly one sample specified in the header, replace + // it with the remappedSampleName. Throw an error if there are 0 or multiple samples and remapping was requested + // for this file.
+ if ( remappedSampleName != null ) { + // We currently only support on-the-fly sample name remapping for single-sample VCFs + if ( sampleNames.isEmpty() || sampleNames.size() > 1 ) { + throw new TribbleException( + String.format("Cannot remap sample name to %s because %s samples are specified in the VCF header, " + + "and on-the-fly sample name remapping is only supported for single-sample VCFs", + remappedSampleName, sampleNames.isEmpty() ? "no" : "multiple")); + } + + sampleNames.clear(); + sampleNames.add(remappedSampleName); + } + + return sampleNames; + } + + /** + * @return the header that was either explicitly set on this codec, or read from the file. May be null. + * The returned value should not be modified. + */ + public VCFHeader getHeader() { + return header; + } + + /** + * @return the version number that was either explicitly set on this codec, or read from the file. May be null. + */ + public VCFHeaderVersion getVersion() { + return version; + } + + @Deprecated // starting after version 2.24.1 + //Note: this is currently used by Disq + public VCFHeader setVCFHeader(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { + ValidationUtils.nonNull(newHeader); + ValidationUtils.nonNull(newVersion); + ValidationUtils.validateArg( + newHeader.getVCFHeaderVersion().equals(newVersion), + "new version must equal the newHeader's version"); + return setVCFHeader(newHeader); + } + + /** + * Set the VCFHeader for this codec. The final header may be a complete replacement for the + * provided input header, since header lines may be "repaired" (upgraded to vcf v4.2) if + * doOnTheFlyModifications is set. See + * {@link VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}. + * + * @param newHeader the new header to be used by this codec + * @return the actual header that is established for this codec. See {@link + * VCFStandardHeaderLines#repairStandardHeaderLines(VCFHeader)}.
+ */ + public VCFHeader setVCFHeader(final VCFHeader newHeader) { + ValidationUtils.nonNull(newHeader); + + if (this.doOnTheFlyModifications) { + // calling this with a header that has any pre-v4.3 version will always result in a header + // with version vcfV4.2, no matter what the header version originally was, since the "repair" + // operation is essentially a transform of the header so that it conforms with header line rules + // as of 4.2 + this.header = VCFStandardHeaderLines.repairStandardHeaderLines(newHeader); + } else { + this.header = newHeader; + } + this.version = this.header.getVCFHeaderVersion(); + // Obtain a text transformer (technically, this should be based on the ORIGINAL header version, not + // the updated version after repairStandardHeaderLines is called), but it doesn't matter in practice + // since the transformer only differs starting with 4.3. + this.vcfTextTransformer = getTextTransformerForVCFVersion(this.version); + + return this.header; + } + /** * the fast decode function * @param line the line of text for the record @@ -328,28 +509,6 @@ public VariantContext decode(String line) { return decodeLine(line, true); } - /** - * Throw if new a version/header are not compatible with the existing version/header. Generally, any version - * before v4.2 can be up-converted to v4.2, but not to v4.3. Once a header is established as v4.3, it cannot - * can not be up or down converted, and it must remain at v4.3. 
- * @param newHeader - * @param newVersion - * @throws TribbleException if the header conversion is not valid - */ - private void validateHeaderVersionTransition(final VCFHeader newHeader, final VCFHeaderVersion newVersion) { - ValidationUtils.nonNull(newHeader); - ValidationUtils.nonNull(newVersion); - - VCFHeader.validateVersionTransition(version, newVersion); - - // If this codec currently has no header (this happens when the header is being established for - // the first time during file parsing), establish an initial header and version, and bypass - // validation. - if (header != null && newHeader.getVCFHeaderVersion() != null) { - VCFHeader.validateVersionTransition(header.getVCFHeaderVersion(), newHeader.getVCFHeaderVersion()); - } - } - /** * For v4.3 up, attribute values can contain embedded percent-encoded characters which must be decoded * on read. Return a version-aware text transformer that can decode encoded text. @@ -421,7 +580,7 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) final String alts = parts[4]; builder.log10PError(parseQual(parts[5])); - final List filters = parseFilters(getCachedString(parts[6])); + final Set filters = parseFilters(getCachedString(parts[6])); if ( filters != null ) { builder.filters(new HashSet<>(filters)); } @@ -432,7 +591,7 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) // update stop with the end key if provided try { builder.stop(Integer.parseInt(attrs.get(VCFConstants.END_KEY).toString())); - } catch (Exception e) { + } catch (NumberFormatException e) { generateException("the END value in the INFO field is not valid"); } } else { @@ -499,20 +658,64 @@ protected String getCachedString(String str) { return internedString; } + /** + * parse the filter string, first checking to see if we already have parsed it in a previous attempt + * @param filterString the string to parse + * @return a set of the filters applied + */ + protected Set parseFilters(final String filterString) { + // null for 
unfiltered + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; + + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) + return Collections.emptySet(); + if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) + generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter headerLineID in vcf4", lineNo); + if (filterString.isEmpty()) + generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); + + // do we have the filter string cached? + if ( filterHash.containsKey(filterString) ) + return filterHash.get(filterString); + + // empty set for passes filters + final Set fFields = new HashSet<>(); + // otherwise we have to parse and cache the value + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) + fFields.add(filterString); + else { + // Variant context uses a Set to store these, so duplicates have historically been + // dropped in previous versions. Delegate handling of warning for these to the + // specific codec subclass. + String[] filters = filterString.split(VCFConstants.FILTER_CODE_SEPARATOR); + for (int i = 0; i < filters.length; i++) { + if (!fFields.add(filters[i])) { + reportDuplicateFilterIDs(filters[i], lineNo); + } + } + } + + filterHash.put(filterString, Collections.unmodifiableSet(fFields)); + + return fFields; + } + /** * parse out the info fields * @param infoField the fields * @return a mapping of keys to objects */ - private Map parseInfo(String infoField) { - Map attributes = new HashMap(); + protected Map parseInfo(String infoField) { + Map attributes = new HashMap<>(); if ( infoField.isEmpty() ) generateException("The VCF specification requires a valid (non-zero length) info field"); if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) { - if ( infoField.indexOf('\t') != -1 || infoField.indexOf(' ') != -1 ) - generateException("The VCF specification does not allow for whitespace in the INFO field. 
Offending field value was \"" + infoField + "\""); + if ( infoField.indexOf('\t') != -1 ) { + generateException("The VCF specification does not allow for tab characters in the INFO field. Offending field value was \"" + infoField + "\""); + } List infoFields = ParsingUtils.split(infoField, VCFConstants.INFO_FIELD_SEPARATOR_CHAR); for (int i = 0; i < infoFields.size(); i++) { @@ -540,8 +743,8 @@ private Map parseInfo(String infoField) { key = infoFields.get(i); final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); if ( headerLine != null && headerLine.getType() != VCFHeaderLineType.Flag ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED && ! warnedAboutNoEqualsForNonFlag ) { - System.err.println("Found info key " + key + " without a = value, but the header says the field is of type " + if ( warnedAboutNoEqualsForNonFlag ) { + logger.warn("Found info key " + key + " without a = value, but the header says the field is of type " + headerLine.getType() + " but this construct is only value for FLAG type fields"); warnedAboutNoEqualsForNonFlag = true; } @@ -555,6 +758,10 @@ private Map parseInfo(String infoField) { // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; + if (attributes.containsKey(key)) { + reportDuplicateInfoKeyValue(key, infoField, lineNo); + } + attributes.put(key, value); } } @@ -562,6 +769,23 @@ private Map parseInfo(String infoField) { return attributes; } + /** + * Handle reporting of duplicate filter IDs + * + * @param duplicateFilterString the duplicate filter string + * @param lineNo line number of the offending line + */ + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) {} + + /** + * Handle reporting of duplicate info line field values + * + * @param duplicateKey the key name of the field that is duplicated + * @param infoField the entire info field line + * @param lineNo line number of the 
offending line + */ + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { } + /** * create a an allele from an index and an array of alleles * @param index the index @@ -796,8 +1020,8 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str, } else if ( missing ) { // if its truly missing (there no provided value) skip adding it to the attributes } else if (gtKey.equals(VCFConstants.GENOTYPE_FILTER_KEY)) { - final List filters = parseFilters(getCachedString(genotypeValues.get(i))); - if ( filters != null ) gb.filters(filters); + final Set filters = parseFilters(getCachedString(genotypeValues.get(i))); + if ( filters != null ) gb.filters(new ArrayList<>(filters)); } else if ( genotypeValues.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) { // don't add missing values to the map } else { @@ -880,11 +1104,11 @@ public void setRemappedSampleName( final String remappedSampleName ) { } protected void generateException(String message) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } protected static void generateException(String message, int lineNo) { - throw new TribbleException(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message)); + throw new TribbleException(String.format("Failure parsing VCF file at (approximately) line number %d: %s", lineNo, message)); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java index e9ca3abdf7..3c19a7f051 100644 --- a/src/main/java/htsjdk/variant/vcf/VCF3Codec.java +++ b/src/main/java/htsjdk/variant/vcf/VCF3Codec.java @@ -25,12 +25,9 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import 
htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; +import java.util.HashSet; +import java.util.Set; /** @@ -53,45 +50,19 @@ public class VCF3Codec extends AbstractVCFCodec { public final static String VCF3_MAGIC_HEADER = "##fileformat=VCFv3"; /** - * @param reader the line reader to take header lines from - * @return the number of header lines + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator reader) { - final List headerStrings = new ArrayList(); - - VCFHeaderVersion version = null; - boolean foundHeaderVersion = false; - while (reader.hasNext()) { - lineNo++; - final String line = reader.peek(); - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( version != VCFHeaderVersion.VCF3_3 && version != VCFHeaderVersion.VCF3_2 ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv3 and does not support " + lineFields[1]); - } - headerStrings.add(reader.next()); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(reader.next()); - return super.parseHeaderFromLines(headerStrings, version); - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - throw new 
TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == VCFHeaderVersion.VCF3_3 || targetHeaderVersion == VCFHeaderVersion.VCF3_2; } + @Override + public boolean canDecode(final String potentialInputFile) { + return canDecodeFile(potentialInputFile, VCF3_MAGIC_HEADER); + } /** * parse the filter string, first checking to see if we already have parsed it in a previous attempt @@ -99,24 +70,24 @@ else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { * @return a set of the filters applied */ @Override - protected List parseFilters(String filterString) { + protected Set parseFilters(String filterString) { // null for unfiltered if ( filterString.equals(VCFConstants.UNFILTERED) ) return null; // empty set for passes filters - List fFields = new ArrayList(); + HashSet fFields = new HashSet<>(); if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - return new ArrayList(fFields); + return new HashSet<>(fFields); if (filterString.isEmpty()) generateException("The VCF specification requires a valid filter status"); // do we have the filter string cached? 
if ( filterHash.containsKey(filterString) ) - return new ArrayList(filterHash.get(filterString)); + return new HashSet<>(filterHash.get(filterString)); // otherwise we have to parse and cache the value if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) @@ -130,7 +101,13 @@ protected List parseFilters(String filterString) { } @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF3_MAGIC_HEADER); + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) { + // no-op since this codec historically doesn't report duplicates } + + @Override + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { + // no-op since this codec historically doesn't report duplicates + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java index 71c4850f07..37ac9874e9 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java @@ -1,5 +1,7 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; + import java.util.*; /** @@ -7,16 +9,46 @@ */ public class VCFAltHeaderLine extends VCFSimpleHeaderLine { private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFHeader.class); private static List expectedTags = Collections.unmodifiableList( new ArrayList(2) {{ - add(ID_ATTRIBUTE); - add(DESCRIPTION_ATTRIBUTE); - }} + add(ID_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} ); public VCFAltHeaderLine(final String line, final VCFHeaderVersion version) { - super(VCFConstants.ALT_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTags)); + // Honor the requested version to choose the parser, and let validateForVersion figure out + // whether that version is valid for this line (for example, if this is called with a pre-4.0 version) + 
super(VCFConstants.ALT_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, expectedTags)); + validateForVersion(version); } + public VCFAltHeaderLine(final String id, final String description) { + super(VCFConstants.ALT_HEADER_KEY, + new LinkedHashMap() {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + //TODO: Should we validate/constrain these to match the 4.3 spec constraints ? + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final VCFValidationFailure validationFailure = new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("%s header lines are not allowed in VCF version %s headers", getKey(), vcfTargetVersion)); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(validationFailure); + } else { + logger.warn(validationFailure.getFailureMessage()); + } + } + + return super.getValidationFailure(vcfTargetVersion); + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 42f07150d1..3ebf47c02a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -1,6 +1,6 @@ /* * Copyright (c) 2012 The Broad Institute -* +* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without @@ -25,17 +25,10 @@ package htsjdk.variant.vcf; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.readers.LineIterator; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.LinkedList; -import java.util.List; +import java.util.*; /** - * A feature codec for the VCF 4 specification + * A feature codec for the VCF 4.0, 4.1, 4.2, and 4.3 specification versions * *

* VCF is a text file format (most likely stored in a compressed manner). It contains meta-information lines, a @@ -45,7 +38,7 @@ * of related samples. Recently the format for storing next-generation read alignments has been * standardised by the SAM/BAM file format specification. This has significantly improved the * interoperability of next-generation tools for alignment, visualisation, and variant calling. - * We propose the Variant Call Format (VCF) as a standarised format for storing the most prevalent + * We propose the Variant Call Format (VCF) as a standardised format for storing the most prevalent * types of sequence variation, including SNPs, indels and larger structural variants, together * with rich annotations. VCF is usually stored in a compressed manner and can be indexed for * fast data retrieval of variants from a range of positions on the reference genome. @@ -72,91 +65,55 @@ * @since 2010 */ public class VCFCodec extends AbstractVCFCodec { - // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying on VariantContext to do the validation of any contradictory (or malformed) record parameters. + // Our aim is to read in the records and convert to VariantContext as quickly as possible, relying + // on VariantContext to do the validation of any contradictory (or malformed) record parameters. public final static String VCF4_MAGIC_HEADER = "##fileformat=VCFv4"; /** - * Reads all of the header from the provided iterator, but no reads no further. 
- * @param lineIterator the line reader to take header lines from - * @return The parsed header + * Return true if this codec can handle the target version + * @param targetHeaderVersion + * @return true if this codec can handle this version */ @Override - public Object readActualHeader(final LineIterator lineIterator) { - final List headerStrings = new ArrayList(); - - String line; - boolean foundHeaderVersion = false; - while (lineIterator.hasNext()) { - line = lineIterator.peek(); - lineNo++; - if (line.startsWith(VCFHeader.METADATA_INDICATOR)) { - final String[] lineFields = line.substring(2).split("="); - if (lineFields.length == 2 && VCFHeaderVersion.isFormatString(lineFields[0]) ) { - if ( !VCFHeaderVersion.isVersionString(lineFields[1]) ) - throw new TribbleException.InvalidHeader(lineFields[1] + " is not a supported version"); - foundHeaderVersion = true; - version = VCFHeaderVersion.toHeaderVersion(lineFields[1]); - if ( ! version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) ) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4; please use the VCF3 codec for " + lineFields[1]); - if ( version != VCFHeaderVersion.VCF4_0 && version != VCFHeaderVersion.VCF4_1 && version != VCFHeaderVersion.VCF4_2 && version != VCFHeaderVersion.VCF4_3) - throw new TribbleException.InvalidHeader("This codec is strictly for VCFv4 and does not support " + lineFields[1]); - } - headerStrings.add(lineIterator.next()); - } - else if (line.startsWith(VCFHeader.HEADER_INDICATOR)) { - if (!foundHeaderVersion) { - throw new TribbleException.InvalidHeader("We never saw a header line specifying VCF version"); - } - headerStrings.add(lineIterator.next()); - super.parseHeaderFromLines(headerStrings, version); - return this.header; - } - else { - throw new TribbleException.InvalidHeader("We never saw the required CHROM header line (starting with one #) for the input VCF file"); - } - - } - throw new TribbleException.InvalidHeader("We never saw the required CHROM 
header line (starting with one #) for the input VCF file"); + public boolean canDecodeVersion(final VCFHeaderVersion targetHeaderVersion) { + return targetHeaderVersion == VCFHeaderVersion.VCF4_0 || + targetHeaderVersion == VCFHeaderVersion.VCF4_1 || + targetHeaderVersion == VCFHeaderVersion.VCF4_2 || + targetHeaderVersion == VCFHeaderVersion.VCF4_3; } - /** - * parse the filter string, first checking to see if we already have parsed it in a previous attempt - * - * @param filterString the string to parse - * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF) - */ @Override - protected List parseFilters(final String filterString) { - // null for unfiltered - if ( filterString.equals(VCFConstants.UNFILTERED) ) - return null; - - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) - return Collections.emptyList(); - if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) ) - generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo); - if (filterString.isEmpty()) - generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo); - - // do we have the filter string cached? 
- if ( filterHash.containsKey(filterString) ) - return filterHash.get(filterString); - - // empty set for passes filters - final List fFields = new LinkedList(); - // otherwise we have to parse and cache the value - if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) - fFields.add(filterString); - else - fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); - - filterHash.put(filterString, Collections.unmodifiableList(fFields)); + public boolean canDecode(final String potentialInput) { + return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + } - return fFields; + @Override + protected void reportDuplicateFilterIDs(final String duplicateFilterString, final int lineNo) { + // older versions of htsjdk have been silently dropping these for a while, but we can at least warn + logger.warn(String.format("Duplicate filter %s found on line %d", duplicateFilterString, lineNo)); } @Override - public boolean canDecode(final String potentialInput) { - return canDecodeFile(potentialInput, VCF4_MAGIC_HEADER); + protected void reportDuplicateInfoKeyValue(final String duplicateKey, final String infoField, final int lineNo) { + logger.warn(String.format("Duplicate key %s found in %s on line %d", duplicateKey, infoField, lineNo)); } + + /** + * parse out the info fields + * @param infoField the fields + * @return a mapping of keys to objects + */ + protected Map parseInfo(String infoField) { + if (infoField.indexOf(' ') != -1) { + generateException( + String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s", + version == null ? 
+ "unknown" : + version.getVersionString(), + infoField) + ); + } + return super.parseInfo(infoField); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index f955a434e1..60eb4fc90f 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -25,60 +25,185 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; -import htsjdk.variant.utils.GeneralUtils; +import htsjdk.utils.ValidationUtils; import htsjdk.variant.variantcontext.GenotypeLikelihoods; import htsjdk.variant.variantcontext.VariantContext; +import java.util.Optional; +import java.util.function.BiFunction; +import java.util.regex.Pattern; + import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /** - * a base class for compound header lines, which include info lines and format lines (so far) + * Abstract base class for compound header lines, which include INFO lines and FORMAT lines. + * + * Compound header lines are distinguished only in that are required to have TYPE and NUMBER attributes + * (VCFHeaderLineCount, a VCFHeaderLineType, and a count). 
*/ -public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { +public abstract class VCFCompoundHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFCompoundHeaderLine.class); + + // regex pattern corresponding to legal info/format field keys + protected static final Pattern VALID_HEADER_ID_PATTERN = Pattern.compile("^[A-Za-z_][0-9A-Za-z_.]*$"); + protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; + + protected static final String NUMBER_ATTRIBUTE = "Number"; + protected static final String TYPE_ATTRIBUTE = "Type"; + + // List of expected tags that have a predefined order (used by the parser to verify order only). The + // header line class itself should verify that all required tags are present. + protected static final List expectedTagOrder = Collections.unmodifiableList( + new ArrayList(4) {{ + add(ID_ATTRIBUTE); + add(NUMBER_ATTRIBUTE); + add(TYPE_ATTRIBUTE); + add(DESCRIPTION_ATTRIBUTE); + }} + ); + + // immutable, cached binary representations of compound header line attributes + private final VCFHeaderLineType type; + private final VCFHeaderLineCount countType; + private final int count; - public enum SupportedHeaderLineType { - INFO(true), FORMAT(false); + /** + * create a VCF compound header line with count type = VCFHeaderLineCount.INTEGER + * + * @param key the key (header line type) for this header line + * @param headerLineID the is or this header line + * @param count the count for this header line, sets countType type as VCFHeaderLineCount.INTEGER + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final int count, + final VCFHeaderLineType type, + final String description) + { + this(key, createAttributeMap(headerLineID, 
VCFHeaderLineCount.INTEGER, count, type, description), VCFHeader.DEFAULT_VCF_VERSION); + } - public final boolean allowFlagValues; - SupportedHeaderLineType(boolean flagValues) { - allowFlagValues = flagValues; - } + /** + * create a VCF compound header line + * + * @param key the key (header line type) for this header line + * @param headerLineID the id for this header line + * @param countType the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + */ + protected VCFCompoundHeaderLine( + final String key, + final String headerLineID, + final VCFHeaderLineCount countType, + final VCFHeaderLineType type, + final String description) { + this(key, createAttributeMap(headerLineID, countType, VCFHeaderLineCount.VARIABLE_COUNT, type, description), VCFHeader.DEFAULT_VCF_VERSION); } - // the field types - private String name; - private int count = -1; - private VCFHeaderLineCount countType; - private String description; - private VCFHeaderLineType type; - private String source; - private String version; + /** + * create a VCF compound header line from an attribute map + * + * @param key the key (header line type) for this header line + * @param mapping the header line attribute map + * @param vcfVersion the VCF header version. This may be null, in which case + */ + protected VCFCompoundHeaderLine(final String key, final Map mapping, final VCFHeaderVersion vcfVersion) { + super(key, mapping); + ValidationUtils.nonNull(vcfVersion); + + this.type = decodeLineType(getGenericFieldValue(TYPE_ATTRIBUTE)); + final String countString = getGenericFieldValue(NUMBER_ATTRIBUTE); + this.countType = decodeCountType(countString, vcfVersion); + this.count = decodeCount(countString, this.countType); + validateForVersion(vcfVersion); + } + + /** + * Return the description for this header line. 
+ * @return the header line's description + */ + public String getDescription() { + final String description = getGenericFieldValue(DESCRIPTION_ATTRIBUTE); + return description == null ? + UNBOUND_DESCRIPTION : + description; + } - // access methods - @Override - public String getID() { return name; } - public String getDescription() { return description; } public VCFHeaderLineType getType() { return type; } + public VCFHeaderLineCount getCountType() { return countType; } - public boolean isFixedCount() { return countType == VCFHeaderLineCount.INTEGER; } + + /** + * @return true if this header line has a fixed integer count type ({@link #getCountType()} + * equals {@link VCFHeaderLineCount#INTEGER}) + */ + public boolean isFixedCount() { return countType.isFixedCount(); } + + /** + * @return the integer count for this header line if the header has a fixed integer + * count type ({@link #isFixedCount()} is true). A TribbleException is thrown if the + * header line does not have a fixed integer count type ({@link #getCountType()} equals + * {@link VCFHeaderLineCount#INTEGER}). + * + * @throws TribbleException if the {@link VCFHeaderLineCount} is not a fixed integer + */ public int getCount() { - if (!isFixedCount()) - throw new TribbleException("Asking for header line count when type is not an integer"); + if (!isFixedCount()) { + throw new TribbleException("Header line count request when count type is not an integer"); + } return count; } public String getSource() { - return source; + return getGenericFieldValue(SOURCE_ATTRIBUTE); } public String getVersion() { - return version; + return getGenericFieldValue(VERSION_ATTRIBUTE); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + // The VCF 4.3 spec does not phrase this restriction as one on the form of the ID value of + // INFO/FORMAT lines but instead on the INFO/FORMAT fixed field key values (c.f. section 1.6.1). 
+ // However, the key values correspond to INFO/FORMAT header lines defining the attribute and its type, + // so we do the validation here + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + if (!validHeaderID(getID())) { + final VCFValidationFailure validationFailure = new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("ID tag \"%s\" does not conform to tag restrictions", getID())); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(validationFailure); + } else { + // warn for older versions - this line can't be used as a v4.3 line + logger.warn(validationFailure.getFailureMessage()); + } + } + } + + return super.getValidationFailure(vcfTargetVersion); + } + + /** + * @param id the candidate ID + * @return true if ID conforms to header line id requirements, otherwise false + */ + //TODO: the existing VCFHeaderLine.validateKeyOrID method should be refactored so it can be used instead of this + protected boolean validHeaderID(final String id) { + return VALID_HEADER_ID_PATTERN.matcher(id).matches(); } /** @@ -113,278 +238,209 @@ public int getCount(final VariantContext vc) { } } - public void setNumberToUnbounded() { - countType = VCFHeaderLineCount.UNBOUNDED; - count = -1; - } - - // our type of line, i.e. format, info, etc - private final SupportedHeaderLineType lineType; - /** - * create a VCF format header line + * Specify annotation source + *

+ * This value is optional starting with VCFv4.2. * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type + * @param source annotation source (case-insensitive, e.g. "dbsnp") */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - this(name, count, type, description, lineType, null, null); + @Deprecated // after 2.24.1 + public void setSource(final String source) { + updateGenericField(SOURCE_ATTRIBUTE, source); } /** - * create a VCF format header line + * Specify annotation version + *

+ * This value is optional starting with VCFv4.2. * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type + * @param version exact version (e.g. "138") */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { - this(name, count, type, description, lineType, null, null); + @Deprecated // after version 2.24.1 + public void setVersion(final String version) { + updateGenericField(VERSION_ATTRIBUTE, version); } - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - * @param source annotation source (case-insensitive, e.g. "dbsnp") - * @param version exact version (e.g. 
"138") - */ - protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { - super(lineType.toString(), ""); - this.name = name; - this.countType = VCFHeaderLineCount.INTEGER; - this.count = count; - this.type = type; - this.description = description; - this.lineType = lineType; - this.source = source; - this.version = version; - validate(); - } + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof VCFCompoundHeaderLine)) return false; + if (!super.equals(o)) return false; - /** - * create a VCF format header line - * - * @param name the name for this header line - * @param count the count type for this header line - * @param type the type for this header line - * @param description the description for this header line - * @param lineType the header line type - * @param source annotation source (case-insensitive, e.g. "dbsnp") - * @param version exact version (e.g. 
"138") - */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { - super(lineType.toString(), ""); - this.name = name; - this.countType = count; - this.type = type; - this.description = description; - this.lineType = lineType; - this.source = source; - this.version = version; - validate(); + final VCFCompoundHeaderLine that = (VCFCompoundHeaderLine) o; + + if (count != that.count) return false; + if (type != that.type) return false; + return countType == that.countType; } - /** - * create a VCF format header line - * - * @param line the header line - * @param version the VCF header version - * @param lineType the header line type - * - */ - protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, SupportedHeaderLineType lineType) { - super(lineType.toString(), ""); + @Override + public int hashCode() { + int result = super.hashCode(); + result = 31 * result + type.hashCode(); + result = 31 * result + countType.hashCode(); + result = 31 * result + count; + return result; + } - final ArrayList expectedTags = new ArrayList(Arrays.asList("ID", "Number", "Type", "Description")); - final List recommendedTags; - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { - recommendedTags = Arrays.asList("Source", "Version"); - } else { - recommendedTags = Collections.emptyList(); - } - final Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags, recommendedTags); - name = mapping.get("ID"); - count = -1; - final String numberStr = mapping.get("Number"); - if (numberStr.equals(VCFConstants.PER_ALTERNATE_COUNT)) { - countType = VCFHeaderLineCount.A; - } else if (numberStr.equals(VCFConstants.PER_ALLELE_COUNT)) { - countType = VCFHeaderLineCount.R; - } else if (numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT)) { - countType = VCFHeaderLineCount.G; - } else if 
((version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || - (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { - countType = VCFHeaderLineCount.UNBOUNDED; + private VCFHeaderLineType decodeLineType(final String lineTypeString) { + if (lineTypeString == null) { + throw new TribbleException(String.format("A line type attribute is required for %s header lines", getKey())); } else { - countType = VCFHeaderLineCount.INTEGER; - count = Integer.parseInt(numberStr); - - } - - if (count < 0 && countType == VCFHeaderLineCount.INTEGER) - throw new TribbleException.InvalidHeader("Count < 0 for fixed size VCF header field " + name); - - try { - type = VCFHeaderLineType.valueOf(mapping.get("Type")); - } catch (Exception e) { - throw new TribbleException(mapping.get("Type") + " is not a valid type in the VCF specification (note that types are case-sensitive)"); + try { + return VCFHeaderLineType.valueOf(lineTypeString); + } catch (IllegalArgumentException e) { + throw new TribbleException(String.format( + "\"%s\" is not a valid type for %s header lines (note that types are case-sensitive)", + lineTypeString, + getKey())); + } } - if (type == VCFHeaderLineType.Flag && !allowFlagValues()) - throw new IllegalArgumentException("Flag is an unsupported type for this kind of field at line - " + line); - - description = mapping.get("Description"); - if (description == null && ALLOW_UNBOUND_DESCRIPTIONS) // handle the case where there's no description provided - description = UNBOUND_DESCRIPTION; - - this.lineType = lineType; + } - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { - this.source = mapping.get("Source"); - this.version = mapping.get("Version"); + private VCFHeaderLineCount decodeCountType(final String countString, final VCFHeaderVersion vcfVersion) { + if (countString == null) { + throw new TribbleException.InvalidHeader( + String.format("A 
count type/value must be provided for %s header lines.", getID())); } - - validate(); + return VCFHeaderLineCount.decode(vcfVersion, countString); } - private void validate() { - if (type != VCFHeaderLineType.Flag && countType == VCFHeaderLineCount.INTEGER && count <= 0) - throw new IllegalArgumentException(String.format("Invalid count number, with fixed count the number should be 1 or higher: key=%s name=%s type=%s desc=%s lineType=%s count=%s", - getKey(), name, type, description, lineType, count)); - if (name == null || type == null || description == null || lineType == null) - throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", - getKey(), name, type, description, lineType)); - if (name.contains("<") || name.contains(">")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if (name.contains("=")) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); - - if (type == VCFHeaderLineType.Flag && count != 0) { - count = 0; - if (GeneralUtils.DEBUG_MODE_ENABLED) { - System.err.println("FLAG fields must have a count value of 0, but saw " + count + " for header line " + getID() + ". 
Changing it to 0 inside the code"); + private int decodeCount(final String countString, final VCFHeaderLineCount requestedCountType) { + int lineCount = VCFHeaderLineCount.VARIABLE_COUNT; + if (requestedCountType.isFixedCount()) { + if (countString == null) { + throw new TribbleException.InvalidHeader(String.format("Missing count value in VCF header field %s", getID())); + } + try { + lineCount = Integer.parseInt(countString); + } catch (NumberFormatException e) { + throw new TribbleException.InvalidHeader(String.format("Invalid count value %s in VCF header field %s", countString, getID())); + } + if (getType() == VCFHeaderLineType.Flag) { + if (lineCount != 0) { + // This check is here on behalf of INFO lines (which are the only header line type allowed to have Flag + // type). A Flag type with a count value other than 0 violates the spec (at least v4.2 and v4.3), but + // to retain backward compatibility with previous implementations, we accept (and repair) the line here. + updateGenericField(NUMBER_ATTRIBUTE, "0"); + logger.warn(String.format("FLAG fields must have a count value of 0, but saw count %d for header line %s. A value of 0 will be used", + lineCount, + getID())); + lineCount = 0; + } + } else if (lineCount <= 0) { + throw new TribbleException.InvalidHeader( + String.format("Invalid count number %d for fixed count in header line with ID %s. 
For fixed count, the count number must be 1 or higher.", + lineCount, + getID())); } } + return lineCount; } - /** - * make a string representation of this header line - * @return a string representation - */ - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put("ID", name); - Object number; - switch (countType) { - case A: - number = VCFConstants.PER_ALTERNATE_COUNT; - break; - case R: - number = VCFConstants.PER_ALLELE_COUNT; - break; - case G: - number = VCFConstants.PER_GENOTYPE_COUNT; - break; - case UNBOUNDED: - number = VCFConstants.UNBOUNDED_ENCODING_v4; - break; - case INTEGER: - default: - number = count; - } - map.put("Number", number); - map.put("Type", type); - map.put("Description", description); - if (source != null) { - map.put("Source", source); - } - if (version != null) { - map.put("Version", version); - } - return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); + // Create a backing attribute map out of VCFCompoundHeaderLine elements + private static LinkedHashMap createAttributeMap( + final String headerLineID, + final VCFHeaderLineCount countType, + final int count, + final VCFHeaderLineType type, + final String description) { + return new LinkedHashMap() { + { put(ID_ATTRIBUTE, headerLineID); } + { put(NUMBER_ATTRIBUTE, countType.encode(count)); } + { put(TYPE_ATTRIBUTE, type.encode()); } + { + // Handle the case where there's no description provided, ALLOW_UNBOUND_DESCRIPTIONS is the default + // note: if no description was provided, don't cache it, which means we don't round trip it + if (description != null) { + put(DESCRIPTION_ATTRIBUTE, description); + } + } + }; } /** - * returns true if we're equal to another compound header line - * @param o a compound header line - * @return true if equal + * Compare two VCFCompoundHeaderLine (FORMAT or INFO) lines to determine if they have compatible number types, + * and return a VCFCompoundHeaderLine that can be used to represent the result 
of merging these lines. In the + * case where the merged line requires "promoting" one of the types to the other, a new line of the appropriate + * type is created by calling the {@code compoundHeaderLineResolver} to produce new line of the correct + * subclass (INFO or FORMAT). + * + * @param line1 first line to merge + * @param line2 second line to merge + * @param conflictWarner conflict warning manager + * @param compoundHeaderLineResolver function that accepts two compound header lines of the same type (info or + * format, and returns a new header line representing the combination of the + * two input header lines + * @param type of VCFCompoundHeaderLine to merge (subclass of VCFCompoundHeaderLine) + * @return the merged line if one can be created */ - @Override - public boolean equals(final Object o) { - if ( this == o ) { - return true; + static T getMergedCompoundHeaderLine( + final T line1, + final T line2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner, + BiFunction compoundHeaderLineResolver) + { + ValidationUtils.nonNull(line1); + ValidationUtils.nonNull(line2); + ValidationUtils.validateArg(line1.getKey().equals(line2.getKey()) && line1.getID().equals(line2.getID()), + "header lines must have the same type to merge"); + T mergedLine = line1; + + if (!line1.equalsExcludingExtraAttributes(line2)) { + if (getCompoundLineDifferenceScore(line1, line2) > 1) { + // merge lines if they have zero or one mergeable differences, but if there are multiple + // differences, call the headers incompatible and bail, since we need to choose one line + // or the other as the merge line (we can't do generic field-level resolution) + throw new TribbleException( + String.format("Incompatible header merge, can't merge lines with multiple attribute differences %s/%s.", + line1, line2)); + } + if (line1.getType().equals(line2.getType())) { + // The lines have a common type. 
+ // The Number entry is an Integer that describes the number of values that can be + // included with the INFO field. For example, if the INFO field contains a single + // number, then this value should be 1. However, if the INFO field describes a pair + // of numbers, then this value should be 2 and so on. If the number of possible + // values varies, is unknown, or is unbounded, then this value should be '.'. + conflictWarner.warn("Promoting header field Number to . due to number differences in header lines: " + line1 + " " + line2); + mergedLine = compoundHeaderLineResolver.apply(line1, line2); + } else if (line1.getType() == VCFHeaderLineType.Integer && line2.getType() == VCFHeaderLineType.Float) { + // promote key to Float + conflictWarner.warn("Promoting Integer to Float in header: " + line2); + mergedLine = line2; + } else if (line1.getType() == VCFHeaderLineType.Float && line2.getType() == VCFHeaderLineType.Integer) { + // promote key to Float + conflictWarner.warn("Promoting Integer to Float in header: " + line2); + } else { + throw new IllegalStateException("Attempt to merge incompatible headers, can't merge these lines: " + line1 + " " + line2); + } } - if ( o == null || getClass() != o.getClass() || ! super.equals(o) ) { - return false; + if (!line1.getDescription().equals(line2.getDescription())) { + conflictWarner.warn("Allowing unequal description fields through: keeping " + line2 + " excluding " + line1); } - final VCFCompoundHeaderLine that = (VCFCompoundHeaderLine) o; - return equalsExcludingDescription(that) && - description.equals(that.description); - } - - @Override - public int hashCode() { - int result = super.hashCode(); - result = 31 * result + name.hashCode(); - result = 31 * result + count; - result = 31 * result + (countType != null ? 
countType.hashCode() : 0); // only nullable field according to validate() - result = 31 * result + description.hashCode(); - result = 31 * result + type.hashCode(); - result = 31 * result + lineType.hashCode(); - result = 31 * result + (source != null ? source.hashCode() : 0); - result = 31 * result + (version != null ? version.hashCode() : 0); - return result; + return mergedLine; } - public boolean equalsExcludingDescription(VCFCompoundHeaderLine other) { + boolean equalsExcludingExtraAttributes(final VCFCompoundHeaderLine other) { return count == other.count && countType == other.countType && type == other.type && - lineType == other.lineType && - name.equals(other.name); - } - - public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { - return lineType == other.lineType && - name.equals(other.name); + getKey().equals(other.getKey()) && + getID().equals(other.getID()); } - /** - * do we allow flag (boolean) values? (i.e. booleans where you don't have specify the value, AQ means AQ=true) - * @return true if we do, false otherwise - */ - abstract boolean allowFlagValues(); - - /** - * Specify annotation source - *

- * This value is optional starting with VCFv4.2. - * - * @param source annotation source (case-insensitive, e.g. "dbsnp") - */ - public void setSource(final String source) { - this.source = source; - } - - /** - * Specify annotation version - *

- * This value is optional starting with VCFv4.2. - * - * @param version exact version (e.g. "138") - */ - public void setVersion(final String version) { - this.version = version; + private static int getCompoundLineDifferenceScore(final T line1, final T line2) { + final int dataTypeDiffers = line1.getType().equals(line2.getType()) ? 0 : 1; // data type + final int countTypeDiffers = line1.getCountType().equals(line2.getCountType()) ? 0 : 1; // count type + // getCount is only valid if the getCountType==Integer + final int countDiffers = + (countTypeDiffers == 0 && + line1.getCountType().equals(VCFHeaderLineCount.INTEGER) && + line2.getCountType().equals(VCFHeaderLineCount.INTEGER) && + line1.getCount() != line2.getCount()) ? 1 : 0; + return dataTypeDiffers + countTypeDiffers + countDiffers; } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFConstants.java b/src/main/java/htsjdk/variant/vcf/VCFConstants.java index 64fdf2bc8e..11f12cf07c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFConstants.java +++ b/src/main/java/htsjdk/variant/vcf/VCFConstants.java @@ -45,7 +45,7 @@ public final class VCFConstants { public static final String GENOTYPE_KEY = "GT"; public static final String GENOTYPE_POSTERIORS_KEY = "GP"; public static final String GENOTYPE_QUALITY_KEY = "GQ"; - public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD isn't reserved, but is specifically handled by VariantContext + public static final String GENOTYPE_ALLELE_DEPTHS = "AD"; //AD is now reserved public static final String GENOTYPE_PL_KEY = "PL"; // phred-scaled genotype likelihoods public static final String EXPECTED_ALLELE_COUNT_KEY = "EC"; @Deprecated public static final String GENOTYPE_LIKELIHOODS_KEY = "GL"; // log10 scaled genotype likelihoods @@ -86,14 +86,20 @@ public final class VCFConstants { public static final String PHASING_TOKENS = "/|\\"; // header lines - public static final String FILTER_HEADER_START = "##FILTER"; - public static final String FORMAT_HEADER_START = 
"##FORMAT"; - public static final String INFO_HEADER_START = "##INFO"; - public static final String ALT_HEADER_KEY = "ALT"; - public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY ; - public static final String CONTIG_HEADER_KEY = "contig"; - public static final String CONTIG_HEADER_START = "##" + CONTIG_HEADER_KEY; + public static final String FILTER_HEADER_KEY = "FILTER"; + public static final String FILTER_HEADER_START = VCFHeader.METADATA_INDICATOR + FILTER_HEADER_KEY; + public static final int FILTER_HEADER_OFFSET = FILTER_HEADER_START.length() + 1; + + public static final String FORMAT_HEADER_KEY = "FORMAT"; + public static final String FORMAT_HEADER_START = VCFHeader.METADATA_INDICATOR + FORMAT_HEADER_KEY; + public static final int FORMAT_HEADER_OFFSET = FORMAT_HEADER_START.length() + 1; + + public static final String INFO_HEADER_KEY = "INFO"; + public static final String INFO_HEADER_START = VCFHeader.METADATA_INDICATOR + INFO_HEADER_KEY; + public static final int INFO_HEADER_OFFSET = INFO_HEADER_START.length() + 1; + public static final String ALT_HEADER_KEY = "ALT"; + public static final String ALT_HEADER_START = VCFHeader.METADATA_INDICATOR + ALT_HEADER_KEY; public static final int ALT_HEADER_OFFSET = ALT_HEADER_START.length() + 1; public static final String PEDIGREE_HEADER_KEY = "PEDIGREE"; @@ -108,6 +114,10 @@ public final class VCFConstants { public static final String META_HEADER_START = VCFHeader.METADATA_INDICATOR + META_HEADER_KEY; public static final int META_HEADER_OFFSET = META_HEADER_START.length() + 1; + public static final String CONTIG_HEADER_KEY = "contig"; + public static final String CONTIG_HEADER_START = VCFHeader.METADATA_INDICATOR + CONTIG_HEADER_KEY; + public static final int CONTIG_HEADER_OFFSET = CONTIG_HEADER_START.length() + 1; + // old indel alleles public static final char DELETION_ALLELE_v3 = 'D'; public static final char INSERTION_ALLELE_v3 = 'I'; diff --git 
a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index 9ec50681b4..d8a19e2fa5 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -26,11 +26,14 @@ package htsjdk.variant.vcf; import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; +import java.util.Optional; +import java.util.regex.Pattern; /** * A special class representing a contig VCF header line. Knows the true contig order and sorts on that @@ -40,42 +43,111 @@ * @author mdepristo */ public class VCFContigHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFContigHeaderLine.class); + + final static Pattern VALID_CONTIG_ID_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*"); final Integer contigIndex; + public static final String LENGTH_ATTRIBUTE = "length"; + public static final String ASSEMBLY_ATTRIBUTE = "assembly"; + public static final String MD5_ATTRIBUTE = "md5"; + public static final String URL_ATTRIBUTE = "URL"; + public static final String SPECIES_ATTRIBUTE = "species"; + /** * create a VCF contig header line * + * NOTE: This is retained for backward compatibility, but is deprecated and should not be used. 
+ * * @param line the header line * @param version the vcf header version * @param key the key for this header line + * @param contigIndex the contig index for this contig */ + @Deprecated // starting after version 2.24.1 public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, final int contigIndex) { - super(line, version, key, null, Collections.emptyList()); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); - this.contigIndex = contigIndex; + // deprecated because this constructor has a parameter to specify the key (??), but for + // contig lines the key has to be "contig" + this(line, version, contigIndex); + if (!VCFHeader.CONTIG_KEY.equals(key)) { + logger.warn(String.format( + "Found key \"%s\". The key for contig header lines must be %s.", + key, + VCFHeader.CONTIG_KEY)); + } + } + + /** + * create a VCF contig header line + * + * @param line the header line + * @param version the vcf header version + * @param contigIndex the contig index for this contig + */ + public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final int contigIndex) { + this(VCFHeaderLineTranslator.parseLine( + version, line, Collections.singletonList(VCFSimpleHeaderLine.ID_ATTRIBUTE)), contigIndex); + if (!VCFHeader.CONTIG_KEY.equals(getKey())) { + logger.warn(String.format( + "Found key \"%s\". 
The key for contig header lines must be %s.", + getKey(), + VCFHeader.CONTIG_KEY)); + } + if (contigIndex < 0) { + throw new TribbleException(String.format("The contig index (%d) is less than zero.", contigIndex)); + } + validateForVersion(version); } public VCFContigHeaderLine(final Map mapping, final int contigIndex) { super(VCFHeader.CONTIG_KEY, mapping); - if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); + if (contigIndex < 0) { + throw new TribbleException(String.format("The contig index (%d) is less than zero.", contigIndex)); + } this.contigIndex = contigIndex; } - VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { - // Using LinkedHashMap to preserve order of keys in contig line (ID, length, assembly) - super(VCFHeader.CONTIG_KEY, new LinkedHashMap() {{ - // Now inside an init block in an anon HashMap subclass - this.put("ID", sequenceRecord.getSequenceName()); - this.put("length", Integer.toString(sequenceRecord.getSequenceLength())); - if ( assembly != null ) this.put("assembly", assembly); - }}); - this.contigIndex = sequenceRecord.getSequenceIndex(); + /** + * Return a VCFContigHeaderLine representing a SAMSequenceRecord. + * + * NOTE: round-tripping between VCFContigHeaderLines and SAMSequenceRecords can be lossy since they + * don't necessarily have equivalent attributes, i.e., SAMSequenceRecord can have a species attribute + * that isn't defined by the VCF spec. 
+ * + * @return VCFContigHeaderLine for the SAMSequenceRecord + */ + public VCFContigHeaderLine(final SAMSequenceRecord sequenceRecord, final String assembly) { + // preserve order of keys in contig line (ID, length, assembly) + this(new LinkedHashMap() {{ + this.put(ID_ATTRIBUTE, sequenceRecord.getSequenceName()); + if (sequenceRecord.getSequenceLength() != 0) { + this.put(LENGTH_ATTRIBUTE, Integer.toString(sequenceRecord.getSequenceLength())); + } + if (assembly != null) { + if (!assembly.equals(sequenceRecord.getAssembly())) { + logger.warn(String.format( + "Inconsistent \"assembly\" attribute values found while creating VCFContigLine " + + "(with assembly \"%s\") from SAMSequenceRecord (with assembly \"%s\")", + assembly, + sequenceRecord.getAssembly())); + } + this.put(ASSEMBLY_ATTRIBUTE, assembly); + } + if (sequenceRecord.getMd5() != null) { + this.put(MD5_ATTRIBUTE, sequenceRecord.getMd5()); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG) != null) { + this.put(URL_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG)); + } + if (sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG) != null) { + this.put(SPECIES_ATTRIBUTE, sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG)); + } + }}, + sequenceRecord.getSequenceIndex() + ); } - public Integer getContigIndex() { - return contigIndex; - } - /** * Get the SAMSequenceRecord that corresponds to this VCF header line. * If the VCF header line does not have a length tag, the SAMSequenceRecord returned will be set to have a length of @@ -85,20 +157,56 @@ public Integer getContigIndex() { * contig header line does not have a length. 
*/ public SAMSequenceRecord getSAMSequenceRecord() { - final String lengthString = this.getGenericFieldValue("length"); - final int length; - if (lengthString == null) { - length = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; + final String lengthString = this.getGenericFieldValue(LENGTH_ATTRIBUTE); + final int length; + if (lengthString == null) { + length = SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; } else { - length = Integer.parseInt(lengthString); + length = Integer.parseInt(lengthString); + } + final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), length); + final String assemblyString = this.getGenericFieldValue(ASSEMBLY_ATTRIBUTE); + if (assemblyString != null) { + record.setAssembly(assemblyString); } - final SAMSequenceRecord record = new SAMSequenceRecord(this.getID(), length); - record.setAssembly(this.getGenericFieldValue("assembly")); - record.setSequenceIndex(this.contigIndex); - return record; + record.setSequenceIndex(this.contigIndex); + final String md5 = getGenericFieldValue(MD5_ATTRIBUTE); + if (md5 != null) { + record.setMd5(md5); + } + final String url = getGenericFieldValue(URL_ATTRIBUTE); + if (url != null) { + record.setAttribute(SAMSequenceRecord.URI_TAG, url); + } + final String species = getGenericFieldValue(SPECIES_ATTRIBUTE); + if (species != null) { + record.setSpecies(species); + } + return record; } @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + if (!VALID_CONTIG_ID_PATTERN.matcher(getID()).matches()) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("Contig headerLine ID \"%s\" doesn't conform to contig ID restrictions", getID()))); + } + } + + return super.getValidationFailure(vcfTargetVersion); + } + + public Integer getContigIndex() { + return contigIndex; + } + + /** + * Note: this class has a natural ordering that is inconsistent with equals() + */ + 
@Override public boolean equals(final Object o) { if ( this == o ) { return true; @@ -120,6 +228,11 @@ public int hashCode() { /** * IT IS CRITICAL THAT THIS BE OVERRIDDEN SO WE SORT THE CONTIGS IN THE CORRECT ORDER + * + * NOTE: this class has a natural ordering that is inconsistent with equals(). This results + * in inconsistent behavior when these lines are used in the sets that are created/accepted + * by VCFHeader (ie., getMetaDataInSortedOrder will filter out VCFContigHeaderLines that are + * returned by getMetaDataInInputOrder or getContigheaderLines). */ @Override public int compareTo(final Object other) { diff --git a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java index 6ca8f3f532..1b890db1b1 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java @@ -25,26 +25,40 @@ package htsjdk.variant.vcf; -import java.util.Arrays; -import java.util.Collections; +import htsjdk.tribble.TribbleException; + +import java.util.*; /** * @author ebanks * - * A class representing a key=value entry for FILTER fields in the VCF header + * A class representing FILTER fields in the VCF header */ -public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { - +public class VCFFilterHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; + private static List requiredTagOrder = Collections.unmodifiableList( + new ArrayList(2) {{ + add(ID_ATTRIBUTE); + add(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE); + }} + ); + /** * create a VCF filter header line * - * @param name the name for this header line + * @param id the headerLineID for this header line * @param description the description for this header line */ - public VCFFilterHeaderLine(final String name, final String description) { - super("FILTER", name, description); + public VCFFilterHeaderLine(final String id, final String description) { + 
super(VCFConstants.FILTER_HEADER_KEY, + new LinkedHashMap(2) {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }} + ); + validate(); } /** @@ -52,29 +66,37 @@ public VCFFilterHeaderLine(final String name, final String description) { * @param name */ public VCFFilterHeaderLine(final String name) { - super("FILTER", name, name); + this(name, name); } /** - * create a VCF info header line + * create a VCF filter header line * * @param line the header line * @param version the vcf header version */ public VCFFilterHeaderLine(final String line, final VCFHeaderVersion version) { - super(line, version, "FILTER", Arrays.asList("ID", "Description"), Collections.emptyList()); + super(VCFConstants.FILTER_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, requiredTagOrder)); + validate(); + validateForVersion(version); + } + + private void validate() { + if (getDescription() == null) { + throw new TribbleException.InvalidHeader("Missing Description attribute in filter header line"); + } } @Override public boolean shouldBeAddedToDictionary() { return true; } - + /** * get the "Description" field * @return the "Description" field */ public String getDescription() { - return getGenericFieldValue("Description"); + return getGenericFieldValue(DESCRIPTION_ATTRIBUTE); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java index 74f4d5e5e3..fc75ee5291 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java @@ -26,34 +26,75 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + /** * @author ebanks *

* Class VCFFormatHeaderLine *

*

- * A class representing a key=value entry for genotype FORMAT fields in the VCF header

+ * A class representing genotype FORMAT fields in the VCF header

*/ public class VCFFormatHeaderLine extends VCFCompoundHeaderLine { + private static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFFormatHeaderLine.class); public VCFFormatHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); - if (type == VCFHeaderLineType.Flag) - throw new IllegalArgumentException("Flag is an unsupported type for format fields"); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, name, count, type, description); + validate(); } public VCFFormatHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.FORMAT); + super(VCFConstants.FORMAT_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version); + validate(); + validateForVersion(version); } - // format fields do not allow flag values (that wouldn't make much sense, how would you encode this in the genotype). - @Override - boolean allowFlagValues() { - return false; + /** + * Compare two VCFFormatHeaderLine objects to determine if they have compatible number types, and return a + * VCFFormatHeaderLine that represents the result of merging these two lines. + * + * @param formatLine1 first format line to merge + * @param formatLine2 second format line to merge + * @param conflictWarner conflict warning emitter + * @return a merged VCFFormatHeaderLine + */ + public static VCFFormatHeaderLine getMergedFormatHeaderLine( + final VCFFormatHeaderLine formatLine1, + final VCFFormatHeaderLine formatLine2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner) + { + ValidationUtils. 
nonNull(formatLine1); + ValidationUtils. nonNull(formatLine2); + ValidationUtils. nonNull(conflictWarner); + + // delegate to the generic VCFCompoundHeaderLine merger, passing a resolver lambda + return VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + formatLine1, + formatLine2, + conflictWarner, + (l1, l2) -> new VCFFormatHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); + } + + private void validate() { + if (this.getType() == VCFHeaderLineType.Flag) { + throw new TribbleException(String.format("Flag is an unsupported type for format fields: ", this.toStringEncoding())); + } } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index c39bef5684..637c04c4fc 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -27,60 +27,52 @@ import htsjdk.beta.plugin.HtsHeader; import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.Log; import htsjdk.tribble.TribbleException; import htsjdk.tribble.util.ParsingUtils; import htsjdk.utils.ValidationUtils; -import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.VariantContextComparator; import java.io.Serializable; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - +import java.util.*; +import java.util.stream.Collectors; /** - * A class to represent a VCF header + * A class to represent a VCF header. + * + * A VCFHeader has a "current" VCFHeaderVersion that is established when the header is constructed. 
If + * metadata lines are provided to the constructor, a ##fileformat line must be included, and all lines + * that are provided must be valid for the specified version. If no metadata lines are initially + * provided, the default version {@link VCFHeader#DEFAULT_VCF_VERSION} will be used. + * + * Each line in the list is always guaranteed to be valid for the current version, and any line added must + * conform to the current version (as defined by the VCF specification). If a new line is added that fails to + * validate against the current version, or a new line changes the current version and an existing line + * in the list fails to validate against the new version, an exception will be thrown. * - * @author aaron - * NOTE: This class stores header lines in lots of places. The original author noted that this should - * be cleaned up at some point in the future (jgentry - 5/2013) + * Once a header version is established, it can be changed by adding a new file format/version line (see + * {@link VCFHeader#makeHeaderVersionLine}) (the new version line will replace any existing line), but only + * if the new version is newer than the previous version. Attempts to move the version to an older version + * will result in an exception. 
*/ public class VCFHeader implements HtsHeader, Serializable { public static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFHeader.class); + public static final VCFHeaderVersion DEFAULT_VCF_VERSION = VCFHeaderVersion.VCF4_2; // the mandatory header fields public enum HEADER_FIELDS { CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO } - /** - * The VCF version for this header; once a header version is established, it can only be - * changed subject to version transition rules defined by - * {@link #validateVersionTransition(VCFHeaderVersion, VCFHeaderVersion)} - */ + // the VCF version for this header private VCFHeaderVersion vcfHeaderVersion; - // the associated meta data - private final Set mMetaData = new LinkedHashSet(); - private final Map mInfoMetaData = new LinkedHashMap(); - private final Map mFormatMetaData = new LinkedHashMap(); - private final Map mFilterMetaData = new LinkedHashMap(); - private final Map mOtherMetaData = new LinkedHashMap(); - private final Map contigMetaData = new LinkedHashMap<>(); + // header meta data + private final VCFMetaDataLines mMetaData = new VCFMetaDataLines(); - // the list of auxillary tags - private final List mGenotypeSampleNames = new ArrayList(); + // the list of auxiliary tags + private final List mGenotypeSampleNames = new ArrayList<>(); // the character string that indicates meta data public static final String METADATA_INDICATOR = "##"; @@ -108,59 +100,74 @@ public enum HEADER_FIELDS { private boolean writeCommandLine = true; /** - * Create an empty VCF header with no header lines and no samples + * Create an empty VCF header with no header lines and no samples. Defaults to + * VCF version {@link VCFHeader#DEFAULT_VCF_VERSION}. 
*/ public VCFHeader() { - this(Collections.emptySet(), Collections.emptySet()); + this(makeHeaderVersionLineSet(DEFAULT_VCF_VERSION), Collections.emptySet()); } /** - * create a VCF header, given a list of meta data and auxiliary tags + * Create a VCF header, given a list of meta data and auxiliary tags. The provided metadata + * header line list MUST contain a version (fileformat) line in order to establish the version + * for the header, and each metadata line must be valid for that version. * - * @param metaData the meta data associated with this header + * @param metaData the meta data associated with this header + * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ public VCFHeader(final Set metaData) { - mMetaData.addAll(metaData); - removeVCFVersionLines(mMetaData); - createLookupEntriesForAllHeaderLines(); - checkForDeprecatedGenotypeLikelihoodsKey(); + this(metaData, Collections.emptySet()); } /** - * Creates a deep copy of the given VCFHeader, duplicating all its metadata and + * Creates a copy of the given VCFHeader, duplicating all its metadata and * sample names. */ public VCFHeader(final VCFHeader toCopy) { - this(toCopy.mMetaData, toCopy.mGenotypeSampleNames); + this(toCopy.getMetaDataInInputOrder(), toCopy.mGenotypeSampleNames); } /** - * create a VCF header, given a list of meta data and auxiliary tags + * Create a VCF header, given a set of meta data and auxiliary tags. The provided metadata + * list MUST contain a version (fileformat) line in order to establish the version + * for this header, and each metadata line must be valid for that version. 
* - * @param metaData the meta data associated with this header + * @param metaData set of meta data associated with this header * @param genotypeSampleNames the sample names + * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ public VCFHeader(final Set metaData, final Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); + this(metaData, new ArrayList<>(genotypeSampleNames)); } /** - * create a VCF header, given a target version, a list of meta data and auxiliary tags + * Create a versioned VCF header. * - * @param vcfHeaderVersion the vcf header version for this header, can not be null - * @param metaData the meta data associated with this header - * @param genotypeSampleNames the sample names + * @param metaData The metadata lines for this header.The provided metadata + * header line list MUST contain a version (fileformat) line in order to establish the version + * for this header, and each metadata line must be valid for that version. + * @param genotypeSampleNames Sample names for this header. 
+ * @throws TribbleException if the provided header line metadata does not include a header line that + * establishes the VCF version for the lines, or if any line does not conform to the established + * version */ - public VCFHeader(final VCFHeaderVersion vcfHeaderVersion, final Set metaData, final Set genotypeSampleNames) { - this(metaData, new ArrayList(genotypeSampleNames)); - ValidationUtils.nonNull(vcfHeaderVersion); - setVCFHeaderVersion(vcfHeaderVersion); - } - public VCFHeader(final Set metaData, final List genotypeSampleNames) { - this(metaData); + ValidationUtils.nonNull(metaData); + ValidationUtils.nonNull(genotypeSampleNames); - if ( genotypeSampleNames.size() != new HashSet(genotypeSampleNames).size() ) + // propagate the lines and establish the version for this header; note that if multiple version + // lines are presented in the set, a warning will be issued, only the last one will be retained, + // and the header version will be established using the last version line encountered + mMetaData.addMetaDataLines(metaData); + vcfHeaderVersion = initializeHeaderVersion(); + mMetaData.validateMetaDataLines(vcfHeaderVersion); + + checkForDeprecatedGenotypeLikelihoodsKey(); + if ( genotypeSampleNames.size() != new HashSet<>(genotypeSampleNames).size() ) throw new TribbleException.InvalidHeader("BUG: VCF header has duplicate sample names"); mGenotypeSampleNames.addAll(genotypeSampleNames); @@ -168,50 +175,34 @@ public VCFHeader(final Set metaData, final List genotypeS buildVCFReaderMaps(genotypeSampleNames); } - /** - * Establish the header version for this header. If the header version has already been established - * for this header, the new version will be subject to version transition validation. 
- * @param vcfHeaderVersion - * @throws TribbleException if the requested header version is not compatible with the existing version - */ - public void setVCFHeaderVersion(final VCFHeaderVersion vcfHeaderVersion) { - validateVersionTransition(this.vcfHeaderVersion, vcfHeaderVersion); - this.vcfHeaderVersion = vcfHeaderVersion; + /** + * Get the header version for this header. + * @return the VCFHeaderVersion for this header. Will not be null. + */ + public VCFHeaderVersion getVCFHeaderVersion() { + return vcfHeaderVersion; } /** - * Throw if {@code fromVersion} is not compatible with a {@code toVersion}. Generally, any version before - * version 4.2 can be up-converted to version 4.2, but not to version 4.3. Once a header is established as - * version 4.3, it cannot be up or down converted, and it must remain at version 4.3. - * @param fromVersion current version. May be null, in which case {@code toVersion} can be any version - * @param toVersion new version. Cannot be null. - * @throws TribbleException if {@code fromVersion} is not compatible with {@code toVersion} + * Adds a new line to the VCFHeader. If a duplicate line already exists (same key/ID pair for + * structured lines, or duplicate content for unstructured lines with identical keys), the new + * line will replace the existing line. 
+ * + * @param headerLine header line to attempt to add */ - public static void validateVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - ValidationUtils.nonNull(toVersion); - - final String errorMessageFormatString = "VCF cannot be automatically promoted from %s to %s"; - - // fromVersion can be null, in which case anything goes (any transition from null is legal) - if (fromVersion != null) { - if (toVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - if (!fromVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // we're trying to go from pre-v4.3 to v4.3+ - throw new TribbleException(String.format(errorMessageFormatString, fromVersion, toVersion)); - } + public void addMetaDataLine(final VCFHeaderLine headerLine) { + // propagate the new line to the metadata lines object + mMetaData.addMetaDataLine(headerLine); - } else if (fromVersion.equals(VCFHeaderVersion.VCF4_3)) { - // we're trying to go from v4.3 to pre-v4.3 - throw new TribbleException(String.format(errorMessageFormatString, fromVersion, toVersion)); - } + // update the current version in case this line triggered a version change + final VCFHeaderVersion newHeaderVersion = mMetaData.getVCFVersion(); + if (!newHeaderVersion.equals(vcfHeaderVersion)) { + validateVersionTransition(vcfHeaderVersion, newHeaderVersion); } - } + vcfHeaderVersion = newHeaderVersion; + headerLine.validateForVersion(vcfHeaderVersion); - /** - * @return the VCFHeaderVersion for this header. Can be null. 
- */ - public VCFHeaderVersion getVCFHeaderVersion() { - return vcfHeaderVersion; + checkForDeprecatedGenotypeLikelihoodsKey(); } /** @@ -220,81 +211,58 @@ public VCFHeaderVersion getVCFHeaderVersion() { * using this header (i.e., read by the VCFCodec) will have genotypes * occurring in the same order * - * @param genotypeSampleNamesInAppearenceOrder genotype sample names, must iterator in order of appearance + * @param genotypeSampleNamesInAppearanceOrder genotype sample names, must iterator in order of appearance */ - private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearenceOrder) { - sampleNamesInOrder = new ArrayList(genotypeSampleNamesInAppearenceOrder.size()); - sampleNameToOffset = new HashMap(genotypeSampleNamesInAppearenceOrder.size()); + private void buildVCFReaderMaps(final Collection genotypeSampleNamesInAppearanceOrder) { + sampleNamesInOrder = new ArrayList<>(genotypeSampleNamesInAppearanceOrder.size()); + sampleNameToOffset = new HashMap<>(genotypeSampleNamesInAppearanceOrder.size()); int i = 0; - for (final String name : genotypeSampleNamesInAppearenceOrder) { + for (final String name : genotypeSampleNamesInAppearanceOrder) { sampleNamesInOrder.add(name); sampleNameToOffset.put(name, i++); } Collections.sort(sampleNamesInOrder); } - /** - * Adds a new line to the VCFHeader. If there is an existing header line of the - * same type with the same key, the new line is not added and the existing line - * is preserved. + * Return all contig line in SORTED order, where the sort order is determined by contig index. + * Note that this behavior differs from other VCFHeader methods that return lines in input order. * - * @param headerLine header line to attempt to add - */ - public void addMetaDataLine(final VCFHeaderLine headerLine) { - // Try to create a lookup entry for the new line. If this succeeds (because there was - // no line of this type with the same key), add the line to our master list of header - // lines in mMetaData. 
- if ( addMetadataLineLookupEntry(headerLine) ) { - mMetaData.add(headerLine); - checkForDeprecatedGenotypeLikelihoodsKey(); - } - } - - /** - * @return all of the VCF header lines of the ##contig form in order, or an empty list if none were present + * @return all of the VCF header lines of the ##contig form in SORTED order, or an empty list if none were present */ public List getContigLines() { - // this must preserve input order - return Collections.unmodifiableList(new ArrayList<>(contigMetaData.values())); - } + // this must return lines in SORTED order + return mMetaData.getContigLines(); + } /** - * Returns the contigs in this VCF file as a SAMSequenceDictionary. Returns null if contigs lines are - * not present in the header. If contig lines are missing length tags, they will be created with - * length set to SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH. Records with unknown length will match any record with - * the same name when evaluated by SAMSequenceRecord.isSameSequence. + * Returns the contigs in this VCF Header as a SAMSequenceDictionary. + * + * @return Returns null if contig lines are not present in the header. + * @throws TribbleException if one or more contig lines do not have length + * information. */ public SAMSequenceDictionary getSequenceDictionary() { + // this must ensure that the lines used to create the dictionary are sorted by contig index final List contigHeaderLines = this.getContigLines(); - if (contigHeaderLines.isEmpty()) return null; - - final List sequenceRecords = new ArrayList(contigHeaderLines.size()); - for (final VCFContigHeaderLine contigHeaderLine : contigHeaderLines) { - final SAMSequenceRecord samSequenceRecord = contigHeaderLine.getSAMSequenceRecord(); - sequenceRecords.add(samSequenceRecord); - } - - return new SAMSequenceDictionary(sequenceRecords); + return contigHeaderLines.isEmpty() ? 
null : + new SAMSequenceDictionary( + contigHeaderLines.stream() + .map(contigLine -> contigLine.getSAMSequenceRecord()) + .collect(Collectors.toCollection(ArrayList::new)) + ); } /** - * Completely replaces the contig records in this header with those in the given SAMSequenceDictionary. + * Completely replaces all contig header lines in this header with ones derived from the given SAMSequenceDictionary. + * + * @param dictionary SAMSequenceDictionary to use to create VCFContigHeaderLines for this header */ public void setSequenceDictionary(final SAMSequenceDictionary dictionary) { - this.contigMetaData.clear(); - - // Also need to remove contig record lines from mMetaData - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFContigHeaderLine) { - toRemove.add(line); - } - } - mMetaData.removeAll(toRemove); - for (final SAMSequenceRecord record : dictionary.getSequences()) { - addMetaDataLine(new VCFContigHeaderLine(record, record.getAssembly())); + getContigLines().forEach(hl -> mMetaData.removeMetaDataLine(hl)); + if (dictionary != null) { + dictionary.getSequences().forEach(r -> addMetaDataLine(new VCFContigHeaderLine(r, r.getAssembly()))); } } @@ -305,128 +273,12 @@ public VariantContextComparator getVCFRecordComparator() { /** * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present */ - public List getFilterLines() { - final List filters = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if ( line instanceof VCFFilterHeaderLine ) { - filters.add((VCFFilterHeaderLine)line); - } - } - return filters; - } - - /** - * @return all of the VCF ID-based header lines in their original file order, or an empty list if none were present - */ - public List getIDHeaderLines() { - final List lines = new ArrayList(); - for (final VCFHeaderLine line : mMetaData) { - if (line instanceof VCFIDHeaderLine) { - lines.add((VCFIDHeaderLine)line); - } - } - 
return lines; - } - - /** - * Remove all lines with a VCF version tag from the provided set of header lines - */ - private void removeVCFVersionLines( final Set headerLines ) { - final List toRemove = new ArrayList(); - for (final VCFHeaderLine line : headerLines) { - if (VCFHeaderVersion.isFormatString(line.getKey())) { - toRemove.add(line); - } - } - headerLines.removeAll(toRemove); - } + public List getFilterLines() { return mMetaData.getFilterLines(); } /** - * Creates lookup table entries for all header lines in mMetaData. + * @return all of the VCFSimpleHeaderLine (ID) lines in their original file order, or an empty list if none are present */ - private void createLookupEntriesForAllHeaderLines() { - for (final VCFHeaderLine line : mMetaData) { - addMetadataLineLookupEntry(line); - } - } - - /** - * Add a single header line to the appropriate type-specific lookup table (but NOT to the master - * list of lines in mMetaData -- this must be done separately if desired). - * - * If a header line is present that has the same key as an existing line, it will not be added. A warning - * will be shown if this occurs when GeneralUtils.DEBUG_MODE_ENABLED is true, otherwise this will occur - * silently. 
- * - * @param line header line to attempt to add to its type-specific lookup table - * @return true if the line was added to the appropriate lookup table, false if there was an existing - * line with the same key and the new line was not added - */ - private boolean addMetadataLineLookupEntry(final VCFHeaderLine line) { - if ( line instanceof VCFInfoHeaderLine ) { - final VCFInfoHeaderLine infoLine = (VCFInfoHeaderLine)line; - return addMetaDataLineMapLookupEntry(mInfoMetaData, infoLine.getID(), infoLine); - } else if ( line instanceof VCFFormatHeaderLine ) { - final VCFFormatHeaderLine formatLine = (VCFFormatHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFormatMetaData, formatLine.getID(), formatLine); - } else if ( line instanceof VCFFilterHeaderLine ) { - final VCFFilterHeaderLine filterLine = (VCFFilterHeaderLine)line; - return addMetaDataLineMapLookupEntry(mFilterMetaData, filterLine.getID(), filterLine); - } else if ( line instanceof VCFContigHeaderLine ) { - return addContigMetaDataLineLookupEntry((VCFContigHeaderLine) line); - } else { - return addMetaDataLineMapLookupEntry(mOtherMetaData, line.getKey(), line); - } - } - - /** - * Add a contig header line to the lookup list for contig lines (contigMetaData). If there's - * already a contig line with the same ID, does not add the line. - * - * Note: does not add the contig line to the master list of header lines in mMetaData -- - * this must be done separately if desired. 
- * - * @param line contig header line to add - * @return true if line was added to the list of contig lines, otherwise false - */ - private boolean addContigMetaDataLineLookupEntry(final VCFContigHeaderLine line) { - // if we are trying to add a contig for the same ID - if (contigMetaData.containsKey(line.getID())) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF contig header lines for " + line.getID() + "; keeping the first only" ); - } - // do not add this contig if it exists - return false; - } - contigMetaData.put(line.getID(), line); - return true; - } - - /** - * Add a header line to the provided map at a given key. If the key already exists, it will not be replaced. - * If it does already exist and GeneralUtils.DEBUG_MODE_ENABLED is true, it will issue warnings about duplicates, - * otherwise it will silently leave the existing key/line pair as is. - * - * Note: does not add the header line to the master list of header lines in mMetaData -- - * this must be done separately if desired. - * - * @param map a map from each key to the associated VCFHeaderLine - * @param key the key to insert this line at - * @param line the line to insert at this key - * @param a type of vcf header line that extends VCFHeaderLine - * @return true if the line was added to the map, false if it was not added because there's already a line with that key - */ - private boolean addMetaDataLineMapLookupEntry(final Map map, final String key, final T line) { - if ( map.containsKey(key) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found duplicate VCF header lines for " + key + "; keeping the first only" ); - } - return false; - } - - map.put(key, line); - return true; - } + public List getIDHeaderLines() { return mMetaData.getIDHeaderLines(); } /** * Check for the presence of a format line with the deprecated key {@link VCFConstants#GENOTYPE_LIKELIHOODS_KEY}. 
@@ -435,12 +287,14 @@ private boolean addMetaDataLineMapLookupEntry(final Ma */ private void checkForDeprecatedGenotypeLikelihoodsKey() { if ( hasFormatLine(VCFConstants.GENOTYPE_LIKELIHOODS_KEY) && ! hasFormatLine(VCFConstants.GENOTYPE_PL_KEY) ) { - if ( GeneralUtils.DEBUG_MODE_ENABLED ) { - System.err.println("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " - + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally" - + " automatically adding a corresponding PL field to your VCF header"); - } - addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + logger.warn("Found " + VCFConstants.GENOTYPE_LIKELIHOODS_KEY + " format, but no " + + VCFConstants.GENOTYPE_PL_KEY + " field. We now only manage PL fields internally" + + " automatically adding a corresponding PL field to your VCF header"); + addMetaDataLine(new VCFFormatHeaderLine( + VCFConstants.GENOTYPE_PL_KEY, + VCFHeaderLineCount.G, + VCFHeaderLineType.Integer, + "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); } } @@ -451,48 +305,44 @@ private void checkForDeprecatedGenotypeLikelihoodsKey() { * @return a set of the header fields, in order */ public Set getHeaderFields() { - return new LinkedHashSet(Arrays.asList(HEADER_FIELDS.values())); + return new LinkedHashSet<>(Arrays.asList(HEADER_FIELDS.values())); } /** - * get the meta data, associated with this header, in sorted order + * get the meta data, associated with this header, in input order * * @return a set of the meta data */ - public Set getMetaDataInInputOrder() { - return makeGetMetaDataSet(mMetaData); - } - - public Set getMetaDataInSortedOrder() { - return makeGetMetaDataSet(new TreeSet(mMetaData)); - } + public Set getMetaDataInInputOrder() { return mMetaData.getMetaDataInInputOrder(); } - private Set 
makeGetMetaDataSet(final Set headerLinesInSomeOrder) { - final Set lines = new LinkedHashSet(); - if (vcfHeaderVersion != null && vcfHeaderVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // always propagate version 4.3+ to prevent these header lines from magically being back-versioned to < 4.3 - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_3.getFormatString(), VCFHeaderVersion.VCF4_3.getVersionString())); - } else { - lines.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); - } - lines.addAll(headerLinesInSomeOrder); - return Collections.unmodifiableSet(lines); - } + /** + * Get the metadata associated with this header in sorted order. + * + * @return Metadata lines in sorted order (based on lexicographical sort of string encodings). + */ + public Set getMetaDataInSortedOrder() { return mMetaData.getMetaDataInSortedOrder(); } /** * Get the VCFHeaderLine whose key equals key. Returns null if no such line exists - * @param key - * @return + * + * Deprecated. Use {@link #getMetaDataLines(String)}. see https://github.com/samtools/hts-specs/issues/602 + * + * @param key the key to use to find header lines to return + * @return the header line with key "key", or null if none is present */ + @Deprecated // starting after version 2.24.1 public VCFHeaderLine getMetaDataLine(final String key) { - for (final VCFHeaderLine line: mMetaData) { - if ( line.getKey().equals(key) ) - return line; - } - - return null; + return mMetaData.getMetaDataLines(key).stream().findFirst().orElse(null); } + /** + * Get the VCFHeaderLines whose key equals key. Returns an empty list if no such lines exist. 
+ * + * @param key the key to use to find header lines to return + * @return the header lines with key "key" + */ + public Collection getMetaDataLines(final String key) { return mMetaData.getMetaDataLines(key); } + /** * get the genotyping sample names * @@ -532,40 +382,32 @@ public int getColumnCount() { /** * Returns the INFO HeaderLines in their original ordering */ - public Collection getInfoHeaderLines() { - return mInfoMetaData.values(); - } + public Collection getInfoHeaderLines() { return mMetaData.getInfoHeaderLines(); } /** * Returns the FORMAT HeaderLines in their original ordering */ - public Collection getFormatHeaderLines() { - return mFormatMetaData.values(); - } + public Collection getFormatHeaderLines() { return mMetaData.getFormatHeaderLines(); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ public VCFInfoHeaderLine getInfoHeaderLine(final String id) { - return mInfoMetaData.get(id); + return mMetaData.getInfoHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ - public VCFFormatHeaderLine getFormatHeaderLine(final String id) { - return mFormatMetaData.get(id); - } + public VCFFormatHeaderLine getFormatHeaderLine(final String id) { return mMetaData.getFormatHeaderLine(id); } /** - * @param id the header key name + * @param id the id of the requested header line * @return the meta data line, or null if there is none */ - public VCFFilterHeaderLine getFilterHeaderLine(final String id) { - return mFilterMetaData.get(id); - } + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { return mMetaData.getFilterHeaderLine(id); } public boolean hasInfoLine(final String id) { return getInfoHeaderLine(id) != null; @@ -580,24 +422,82 @@ public boolean hasFilterLine(final String id) { } /** - * @param key the header key name + * Deprecated. 
Use {@link #getOtherHeaderLines(String)}. See https://github.com/samtools/hts-specs/issues/602 + * + * @param key the key of the requested header line * @return the meta data line, or null if there is none */ + @Deprecated // starting after version 2.24.1 (this selects one from what can be many) public VCFHeaderLine getOtherHeaderLine(final String key) { - return mOtherMetaData.get(key); + final Collection otherLines = mMetaData.getOtherHeaderLines(); + for (final VCFHeaderLine next: otherLines) { + if (next.getKey().equals(key)) { + // note that this returns the first match it finds, which is why this method is deprecated + return next; + } + } + return null; } /** - * Returns the other HeaderLines in their original ordering + * Returns all "other" VCFHeaderLines, in their original (input) order, where "other" means any + * VCFHeaderLine that is not a contig, info, format or filter header line. + */ + public Collection getOtherHeaderLines() { return mMetaData.getOtherHeaderLines(); } + + /** + * Returns "other" HeaderLines that have the key "key", in their original ordering, where "other" + * means any VCFHeaderLine that is not a contig, info, format or filter header line. + */ + public List getOtherHeaderLines(final String key) { + return mMetaData.getOtherHeaderLines().stream().filter(hl -> hl.getKey().equals(key)).collect(Collectors.toList()); + } + + /** + * Adds a single "other" VCFHeaderLine that has key "key". Any lines with that key that already exist + * in the header will be removed. This method can only be used to set unique non-structured (non-ID) + * header lines. + * + * @param uniqueLine the unique line to add + * @throws TribbleException if the line to be added is an ID line. 
*/ - public Collection getOtherHeaderLines() { - return mOtherMetaData.values(); + public void addOtherHeaderLineUnique(final VCFHeaderLine uniqueLine) { + if (uniqueLine.isIDHeaderLine()) { + throw new TribbleException(String.format("Only non-ID header lines can be added using this method: %s", uniqueLine)); + } + getOtherHeaderLines(uniqueLine.getKey()).forEach(hl -> mMetaData.removeMetaDataLine(hl)); + addMetaDataLine(uniqueLine); + } + + /** + * Returns a single "other" VCFHeaderLine that has the key "key", where "other" + * means any VCFHeaderLine that is not a contig, info, format or filter header line. If more than + * one such line is available, throws a TribbleException. + * + * @param key the key to match + * @return a single VCHeaderLine, or null if none + * @throws TribbleException if more than one other line matches the key + */ + public VCFHeaderLine getOtherHeaderLineUnique(final String key) { + final List lineList = getOtherHeaderLines(key); + if (lineList.isEmpty()) { + return null; + } else if (lineList.size() > 1) { + throw new TribbleException( + String.format( + "More than one \"other\" header line matches the key \"%s\". Use getOtherHeaderLines() to retrieve multiple lines:", + key, + lineList.stream().map(VCFHeaderLine::toString).collect(Collectors.joining(",")))); + } else { + return lineList.get(0); + } } /** * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. * @return true if additional engine headers will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public boolean isWriteEngineHeaders() { return writeEngineHeaders; } @@ -606,6 +506,7 @@ public boolean isWriteEngineHeaders() { * If true additional engine headers will be written to the VCF, otherwise only the walker headers will be output. 
* @param writeEngineHeaders true if additional engine headers will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public void setWriteEngineHeaders(final boolean writeEngineHeaders) { this.writeEngineHeaders = writeEngineHeaders; } @@ -614,6 +515,7 @@ public void setWriteEngineHeaders(final boolean writeEngineHeaders) { * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. * @return true if the command line will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public boolean isWriteCommandLine() { return writeCommandLine; } @@ -622,6 +524,7 @@ public boolean isWriteCommandLine() { * If true, and isWriteEngineHeaders also returns true, the command line will be written to the VCF. * @param writeCommandLine true if the command line will be written to the VCF */ + @Deprecated // starting after version 2.24.1 public void setWriteCommandLine(final boolean writeCommandLine) { this.writeCommandLine = writeCommandLine; } @@ -640,10 +543,98 @@ public HashMap getSampleNameToOffset() { @Override public String toString() { - final StringBuilder b = new StringBuilder(); - b.append("[VCFHeader:"); - for ( final VCFHeaderLine line : mMetaData ) - b.append("\n\t").append(line); - return b.append("\n]").toString(); + return mMetaData.toString(); } + + /** + * Obtain a valid fileformat/version line for the requestedVersion + * @param requestedVersion the version for which a version line should be obtained + * @return the version line + */ + public static VCFHeaderLine makeHeaderVersionLine(final VCFHeaderVersion requestedVersion) { + return new VCFHeaderLine(requestedVersion.getFormatString(), requestedVersion.getVersionString()); + } + + /** + * Obtain a VCFHeaderLine set containing only a fileformat/version line for the requestedVersion + * @param requestedVersion the version for which a version line should be obtained + * @return a VCFHeaderLine set containing only fileformat/version line 
for the requestedVersion + */ + public static Set makeHeaderVersionLineSet(final VCFHeaderVersion requestedVersion) { + return new LinkedHashSet() {{ add(VCFHeader.makeHeaderVersionLine(requestedVersion)); }}; + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final VCFHeader vcfHeader = (VCFHeader) o; + + if (samplesWereAlreadySorted != vcfHeader.samplesWereAlreadySorted) return false; + if (writeEngineHeaders != vcfHeader.writeEngineHeaders) return false; + if (writeCommandLine != vcfHeader.writeCommandLine) return false; + if (vcfHeaderVersion != vcfHeader.vcfHeaderVersion) return false; + if (!mMetaData.equals(vcfHeader.mMetaData)) return false; + if (mGenotypeSampleNames != null ? !mGenotypeSampleNames.equals(vcfHeader.mGenotypeSampleNames) : + vcfHeader.mGenotypeSampleNames != null) + return false; + if (sampleNamesInOrder != null ? !sampleNamesInOrder.equals(vcfHeader.sampleNamesInOrder) : + vcfHeader.sampleNamesInOrder != null) + return false; + return sampleNameToOffset != null ? sampleNameToOffset.equals(vcfHeader.sampleNameToOffset) : + vcfHeader.sampleNameToOffset == null; + } + + @Override + public int hashCode() { + int result = vcfHeaderVersion.hashCode(); + result = 31 * result + mMetaData.hashCode(); + result = 31 * result + (mGenotypeSampleNames != null ? mGenotypeSampleNames.hashCode() : 0); + result = 31 * result + (samplesWereAlreadySorted ? 1 : 0); + result = 31 * result + (sampleNamesInOrder != null ? sampleNamesInOrder.hashCode() : 0); + result = 31 * result + (sampleNameToOffset != null ? sampleNameToOffset.hashCode() : 0); + result = 31 * result + (writeEngineHeaders ? 1 : 0); + result = 31 * result + (writeCommandLine ? 1 : 0); + return result; + } + + /** + * Establish the version for this header using the (required) ##fileformat metadata line in the metadata list. 
+ * @throws TribbleException if no ##fileformat line is included in the metadata lines + */ + private VCFHeaderVersion initializeHeaderVersion() { + final VCFHeaderVersion metaDataVersion = mMetaData.getVCFVersion(); + if (metaDataVersion == null) { + //we dont relax this even if VCFUtils.getStrictVCFVersionValidation() == false, since that + //would confound subsequent header version management + throw new TribbleException("The VCFHeader metadata must include a ##fileformat (version) header line"); + } + return metaDataVersion; + } + + private void validateVersionTransition( + final VCFHeaderVersion previousVersion, + final VCFHeaderVersion newVersion) { + final int compareTo = newVersion.compareTo(previousVersion); + if (compareTo < 0) { + // We only allow going forward to a newer version, not backwards to an older one, since there + // is really no way to validate old header lines (pre vcfV4.2). The only way to create a header with + // an old version is to create it that way from the start. + // to be created with the old version from the start. 
+ throw new TribbleException(String.format( + "When changing a header version, the new header version %s must be > the previous version %s", + newVersion, + previousVersion)); + } else if (compareTo > 0) { + logger.debug(() -> String.format("Updating VCFHeader version from %s to %s", + previousVersion.getVersionString(), + newVersion.getVersionString())); + + // the version moved forward, so validate ALL of the existing lines in the list to ensure + // that the transition is valid + mMetaData.validateMetaDataLines(newVersion); + } + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index 0d07a83078..94a3a0849e 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -26,28 +26,23 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; import java.io.Serializable; import java.util.Map; - +import java.util.Optional; /** - * @author ebanks - *

- * Class VCFHeaderLine - *

- *

- * A class representing a key=value entry in the VCF header - *

+ *

A class representing a key=value entry in the VCF header, and the base class for structured header lines. + * Header lines are immutable, and derived classes should maintain immutability. + *

*/ public class VCFHeaderLine implements Comparable, Serializable { public static final long serialVersionUID = 1L; - protected static final boolean ALLOW_UNBOUND_DESCRIPTIONS = true; - protected static final String UNBOUND_DESCRIPTION = "Not provided in original VCF header"; - - private String mKey = null; - private String mValue = null; + // immutable - we don't want to let the hash value change + private final String mKey; + private final String mValue; /** * create a VCF header line @@ -56,14 +51,9 @@ public class VCFHeaderLine implements Comparable, Serializable { * @param value the value for this header line */ public VCFHeaderLine(String key, String value) { - if ( key == null ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot be null"); - if ( key.contains("<") || key.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain angle brackets"); - if ( key.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: key cannot contain an equals sign"); mKey = key; mValue = value; + validate(); } /** @@ -76,16 +66,97 @@ public String getKey() { } /** - * Get the value + * Get the value. May be null. * - * @return the value + * @return the value. may be null (for subclass implementations that use structured values) */ public String getValue() { return mValue; } /** - * By default the header lines won't be added to the dictionary, unless this method will be override (for example in FORMAT, INFO or FILTER header lines) + * @return true if this is a structured header line (has a unique ID, and key/value pairs), otherwise false + */ + public boolean isIDHeaderLine() { return false; } + + /** + * Return the unique ID for this line. Returns null iff {@link #isIDHeaderLine()} is false. + * @return the line's ID, or null if isIDHeaderLine() is false + */ + public String getID() { return null; } + + /** + * Validates this header line against {@code vcfTargetVersion}. 
+ * Subclasses can override this to provide line type-specific version validation, and the + * overrides should also call super.getValidationFailure to allow each class in the class hierarchy + * to do class-level validation. + * + * @return Optional containing a {@link VCFValidationFailure} describing validation failure if this + * line fails validation, otherwise Optional.empty(). + */ + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + // If this header line is itself a fileformat/version line, + // make sure it doesn't clash with the requested vcfTargetVersion. + if (VCFHeaderVersion.isFormatString(getKey())) { + if (!vcfTargetVersion.getFormatString().equals(getKey()) || + !vcfTargetVersion.getVersionString().equals(getValue()) + ) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("The target version (%s) is incompatible with the header line's content.", + vcfTargetVersion))); + } + } else if (getKey().equals(VCFConstants.PEDIGREE_HEADER_KEY)) { + // previous to vcf4.3, PEDIGREE header lines are not modeled as VCFPedigreeHeaderLine because they + // were not structured header lines (had no ID), so we need to check HERE to see if an attempt is + // being made to use one of those old-style pedigree lines in a newer-versioned header, and reject + // it if so + if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) && ! (this instanceof VCFPedigreeHeaderLine)) { + return Optional.of(new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("A pedigree line with no ID cannot be merged with version %s", vcfTargetVersion))); + } + } + + return Optional.empty(); + } + + /** + * Validate that the header line conforms to {@code vcfTargetVersion. 
+ * @param vcfTargetVersion + * @throws {@link TribbleException.VersionValidationFailure} if this header line fails to conform + */ + public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + final Optional> error = getValidationFailure(vcfTargetVersion); + if (error.isPresent()) { + throw new TribbleException.VersionValidationFailure(error.get().getSourceMessage()); + } + } + + /** + * Validate a string that is to be used as a unique id or key field. + */ + protected static void validateKeyOrID(final String keyString, final String sourceName) { + ValidationUtils.nonNull(sourceName); + if (keyString == null) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot be null or empty", sourceName)); + } + if ( keyString.contains("<") || keyString.contains(">") ) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot contain angle brackets", sourceName)); + } + if ( keyString.contains("=") ) { + throw new TribbleException( + String.format("VCFHeaderLine: %s cannot contain an equals sign", sourceName)); + } + } + + /** + * By default the header lines won't be added to the BCF dictionary, unless this method is overriden + * (for example in FORMAT, INFO or FILTER header lines). * * @return false */ @@ -141,10 +212,11 @@ public static boolean isHeaderLine(String line) { } /** - * create a string of a mapping pair for the target VCF version + * create a string of a mapping pair * @param keyValues a mapping of the key->value pairs to output * @return a string, correctly formatted */ + @Deprecated // starting after version 2.24.1 public static String toStringEncoding(Map keyValues) { StringBuilder builder = new StringBuilder(); builder.append('<'); @@ -167,6 +239,13 @@ public static String toStringEncoding(Map keyValues) { return builder.toString(); } + /** + * Validate the state of this header line. Require the key be valid as an "id". 
+ */ + private void validate() { + validateKeyOrID(mKey, "key"); + } + private static String escapeQuotes(final String value) { // java escaping in a string literal makes this harder to read than it should be // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java index 080153a990..24195c73d3 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineCount.java @@ -25,9 +25,78 @@ package htsjdk.variant.vcf; +import htsjdk.utils.ValidationUtils; + /** * the count encodings we use for fields in VCF header lines */ public enum VCFHeaderLineCount { INTEGER, A, R, G, UNBOUNDED; + + // A default int value used to represent an integral count value (not a count *type*) when the + // actual count is derived and not a fixed integer (i.e., when isFixedCount()==false) + public static final int VARIABLE_COUNT = -1; + + /** Return true if this line uses a fixed (integer) count. **/ + public boolean isFixedCount() { return this.equals(INTEGER); } + + /** + * Decode a header line count string and return the corresponding VCFHeaderLineCount enum value. + * If the value is not recognized as a valid constant, assume the string represents a fixed, numeric + * value, and return Integer. The caller should convert and validate the actual value. 
+ * + * @param vcfVersion + * @param countTypeString + * @return + */ + protected static VCFHeaderLineCount decode(final VCFHeaderVersion vcfVersion, final String countTypeString) { + ValidationUtils.nonNull(vcfVersion); + ValidationUtils.nonNull(countTypeString); + + if (countTypeString.equals(VCFConstants.PER_ALTERNATE_COUNT)) { + return A; + } else if (countTypeString.equals(VCFConstants.PER_ALLELE_COUNT)) { + return R; + } else if (countTypeString.equals(VCFConstants.PER_GENOTYPE_COUNT)) { + return G; + } else if ( + (vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || + (!vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && countTypeString.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { + return VCFHeaderLineCount.UNBOUNDED; + } else { + return VCFHeaderLineCount.INTEGER; // assume integer + } + } + + /** + * Encode a count type as a string suitable for serialization to a VCF header. Note this is + * not version aware and defaults to VCFv4 format. + * + * @param actualCount Must be the special value {@code VARIABLE_COUNT} unless this object is {@code VCFHeaderLineCount.INTEGER}. + * @return String encoding of this enum, or the {@code actualCount} if the type of this count + * is VCFHeaderLineCount.INTEGER. + * + * @throws IllegalArgumentException if {@code actualCount} is not the special value {@code VARIABLE_COUNT} and this + * is not the {@code VCFHeaderLineCount.INTEGER} enum object. 
+ */ + public String encode(final int actualCount) { + if (this != INTEGER && actualCount != VARIABLE_COUNT) { + // Should only supply an actualCount if the count type == INTEGER + throw new IllegalArgumentException("Inconsistent header line number encoding request"); + } + switch (this) { + case A: + return VCFConstants.PER_ALTERNATE_COUNT; + case R: + return VCFConstants.PER_ALLELE_COUNT; + case G: + return VCFConstants.PER_GENOTYPE_COUNT; + case UNBOUNDED: + return VCFConstants.UNBOUNDED_ENCODING_v4; + case INTEGER: + return Integer.toString(actualCount); + } + throw new IllegalStateException("Unexpected VCFHeaderLineCount enum value"); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java index 6c83574fee..a22ecd2102 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java @@ -34,7 +34,7 @@ import java.util.Map; /** - * A class for translating between vcf header versions + * A class for translating between vcf header versions and corresponding header line parsers. */ public class VCFHeaderLineTranslator { private static final Map mapping; @@ -50,57 +50,57 @@ public class VCFHeaderLineTranslator { mapping = Collections.unmodifiableMap(map); } + /** + * Parse a VCFHeaderLine for the given version. + * + * @param version VCFHeaderVersion of the header line + * @param valueLine the header line string + * @param expectedTagOrder List of expected tags (interpreted differently by the VCF3 and VCF4 parsers). + * @return a mapping of the tags parsed out. Note that the order of attributes is significant (ID must be + * first) and this should return a LinkedHashMap in order to preserve attribute order. 
+ */ public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { - return parseLine(version, valueLine, expectedTagOrder, Collections.emptyList()); - } - - public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder, List recommendedTags) { - return mapping.get(version).parseLine(valueLine, expectedTagOrder, recommendedTags); + return mapping.get(version).parseLine(valueLine, expectedTagOrder); } } - +/** + * Parse a VCFHeaderLine. + */ interface VCFLineParser { /** * parse a VCF line - * - * @see #parseLine(String, List, List) VCFv4.2+ recommended tags support - * - * @param valueLine the line - * @param expectedTagOrder List of expected tags - * @return a mapping of the tags parsed out - */ - default Map parseLine(String valueLine, List expectedTagOrder) { - return parseLine(valueLine, expectedTagOrder, Collections.emptyList()); - } - - /** - * parse a VCF line - * - * The recommended tags were introduced in VCFv4.2. - * Older implementations may throw an exception when the recommendedTags field is not empty. - * - * We use a list to represent tags as we assume there will be a very small amount of them, - * so using a {@code Set} is overhead. - * + * + * @see #parseLine(String, List) VCFv4.2+ recommended tags support + * * @param valueLine the line * @param expectedTagOrder List of expected tags - * @param recommendedTags List of tags that may or may not be present. Use an empty list instead of NULL for none. * @return a mapping of the tags parsed out */ - Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags); + Map parseLine(String valueLine, List expectedTagOrder); } - /** * a class that handles the to and from disk for VCF 4 lines */ class VCF4Parser implements VCFLineParser { - + + /** + * Parse a VCFHeaderLine. The expectedTagOrder list prescribes the order in which tags should appear, but + * all tags are treated as optional. 
Additional tags are allowed after the expected tags, and may appear in + * any order. It is the caller's responsibility to validate that all required tags are present and that + * any additional "optional" tags are valid. + * + * @param valueLine the header line string + * @param expectedTagOrder List of tags that are required to appear in the order they're expected. Additional + * "extra" tags are allowed after the tags in this list, and must be validated by + * the caller. + * @return a mapping of all tags parsed out + */ @Override - public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { + public Map parseLine(String valueLine, List expectedTagOrder) { // our return map - Map ret = new LinkedHashMap(); + Map ret = new LinkedHashMap<>(); // a builder to store up characters as we go StringBuilder builder = new StringBuilder(); @@ -159,28 +159,23 @@ public Map parseLine(String valueLine, List expectedTagO throw new TribbleException.InvalidHeader("Unclosed quote in header line value " + valueLine); } - // validate the tags against the expected list - index = 0; + // Validate the order of all discovered tags against requiredTagOrder. All tags are treated as + // "optional". Succeeding does not mean that all expected tags in the list were seen. Also, all + // structured header lines can have "extra" tags, with no order specified, so additional tags + // are tolerated. 
if ( expectedTagOrder != null ) { - if (ret.keySet().isEmpty() && !expectedTagOrder.isEmpty()) { - throw new TribbleException.InvalidHeader("Header with no tags is not supported when there are expected tags in line " + valueLine); - } - for ( String str : ret.keySet() ) { - if (index < expectedTagOrder.size()) { - if (!expectedTagOrder.get(index).equals(str)) { - if (expectedTagOrder.contains(str)) { - throw new TribbleException.InvalidHeader("Tag " + str + " in wrong order (was #" + (index+1) + ", expected #" + (expectedTagOrder.indexOf(str)+1) + ") in line " + valueLine); - } else if (recommendedTags.contains(str)) { - throw new TribbleException.InvalidHeader("Recommended tag " + str + " must be listed after all expected tags in line " + valueLine); - } - else { - throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine); - } - } + index = 0; + for (String str : ret.keySet()) { + if (index >= expectedTagOrder.size()) { + break; // done - end of requiredTagOrder list + } else if (!expectedTagOrder.get(index).equals(str)) { + throw new TribbleException.InvalidHeader( + String.format("Unexpected tag or tag order for tag \"%s\" in line %s", str, valueLine)); } index++; } } + return ret; } } @@ -188,13 +183,9 @@ public Map parseLine(String valueLine, List expectedTagO class VCF3Parser implements VCFLineParser { @Override - public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { - if (!recommendedTags.isEmpty()) { - throw new TribbleException.InternalCodecException("Recommended tags are not allowed in VCFv3.x"); - } - + public Map parseLine(String valueLine, List expectedTagOrder) { // our return map - Map ret = new LinkedHashMap(); + Map ret = new LinkedHashMap<>(); // a builder to store up characters as we go StringBuilder builder = new StringBuilder(); @@ -211,20 +202,34 @@ public Map parseLine(String valueLine, List expectedTagO for (char c: valueLine.toCharArray()) { switch (c) { case ('\"') : inQuote 
= !inQuote; break; // a quote means we ignore ',' in our strings, keep track of it - case (',') : if (!inQuote) { ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); builder = new StringBuilder(); break; } // drop the current key value to the return map + case (',') : + if (!inQuote) { + ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); + builder = new StringBuilder(); + break; + } // drop the current key value to the return map default: builder.append(c); // otherwise simply append to the current string } index++; } ret.put(expectedTagOrder.get(tagIndex++),builder.toString()); - // validate the tags against the expected list + // Validate that: + // we have no more tags than are expected + // the ones we have are in the expected list + // they appear in the same order as in the expected list + // This does no checking for missing tags; all tags are treated as optional + // index = 0; - if (tagIndex != expectedTagOrder.size()) throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + if (tagIndex != expectedTagOrder.size()) { + throw new IllegalArgumentException("Unexpected tag count " + tagIndex + ", we expected " + expectedTagOrder.size()); + } for (String str : ret.keySet()){ - if (!expectedTagOrder.get(index).equals(str)) throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + if (!expectedTagOrder.get(index).equals(str)) { + throw new IllegalArgumentException("Unexpected tag " + str + " in string " + valueLine); + } index++; } return ret; } -} +} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java index 785449de89..88432f0b18 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineType.java @@ -25,9 +25,37 @@ package htsjdk.variant.vcf; +import htsjdk.utils.ValidationUtils; + /** 
* the type encodings we use for fields in VCF header lines */ public enum VCFHeaderLineType { - Integer, Float, String, Character, Flag; + Integer, + Float, + String, + Character, + Flag; + + /** + * Decode a header line count string and return the corresponding VCFHeaderLineCount enum value. + * If the value is not recognized as a valid constant, we assume the string represents a numeric + * value and return Integer. The caller should convert and validate the value. + * + * @param lineTypeString + * @return VCFHeaderLineType for {@code lineTypeString} + */ + protected static VCFHeaderLineType decode(final String lineTypeString) { + ValidationUtils.nonNull(lineTypeString); + return VCFHeaderLineType.valueOf(lineTypeString); + } + + /** + * Encode this line type as a string suitable for serialization to a VCF header. Note this is + * not version specific and defaults to VCFv42. + * + * The serialized encoding is the simple name of the enum constant + * @return string encoding of this line type + */ + String encode() { return this.toString(); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java new file mode 100644 index 0000000000..becbf64eb1 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderMerger.java @@ -0,0 +1,286 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceDictionaryUtils; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashSet; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Class used to produce a set of header lines resulting from the merger of one or more input VCFHeaders. + *

+ * The resulting lines have a version line matching the highest version of any of the input headers. + *

+ * The headers to be merged must conform to certain requirements. + * Some header sets cannot be merged, and will result in an exception being thrown: + *

    + *
  • Headers must have a version that is at least VCF v4.2. Headers from older versions may not be merged (note + * that older headers that are read from input files are automatically "converted" to VCF v4.2 by VCFCodec. See + * {@link AbstractVCFCodec#setVCFHeader(VCFHeader)}.)
  • + *
  • any header that contains a header line that doesn't conform to the resulting (highest) version of any + * header in the merge list
  • + *
  • any header that has a sequence dictionary that is incompatible with any other merged header's + * sequence dictionary. All headers must either share a common sequence dictionary, or have a sequence dictionary + * that is a subset of the common sequence dictionary that is taken from the remaining headers.
  • + *
+ */ +public class VCFHeaderMerger { + + /** + * Merge all header lines in a set of headers into a single set of header lines. The resulting set includes + * all unique lines that appeared in any header; duplicates of lines are excluded from the result set. Equivalent + * header lines are reduced to a single representative header line. The resulting set contains a ##fileformat + * version line for the newest version seen in any of the headers provided in the input header collection, + * and all lines in the merged set are compatible with that version. + * + * @param headers the headers to merge + * @param emitWarnings true if warnings should be emitted + * @return a set of merged VCFHeaderLines + * @throws TribbleException if any header has a version < VCFv4.2, or if any header line in any + * input header is not compatible the newest version selected from amongst all headers provided, or if any + * header has a sequence dictionary that is incompatible with any other header's sequence dictionary + */ + public static Set getMergedHeaderLines(final Collection headers, final boolean emitWarnings) { + ValidationUtils.nonNull(headers, "headers"); + ValidationUtils.validateArg(!headers.isEmpty(), "headers collection must be non empty"); + + // use a VCFMetaDataLines object to accumulate header lines + final VCFMetaDataLines mergedMetaData = new VCFMetaDataLines(); + final HeaderMergeConflictWarnings conflictWarner = new HeaderMergeConflictWarnings(emitWarnings); + + final VCFHeaderVersion newestVersion = getNewestHeaderVersion(headers); + final SAMSequenceDictionary commonSequenceDictionary = getCommonSequenceDictionaryOrThrow(headers, conflictWarner); + + for (final VCFHeader sourceHeader : headers) { + for (final VCFHeaderLine line : sourceHeader.getMetaDataInSortedOrder()) { + final String key = line.getKey(); + if (VCFHeaderVersion.isFormatString(key) || key.equals(VCFHeader.CONTIG_KEY)) { + // drop all version and contig lines, and at the end we'll set the version 
and + // commonSequenceDictionary + continue; + } + + // Structured header lines are only considered equal if they have identical key, id, and + // attribute/value pairs, but for merging we need to reduce lines that have the same key/id pairs + // but different attributes to a single line. So use the more permissive "findEquivalentHeaderLine" + // to detect equivalent lines, and delegate to the individual header line implementations to do the + // smart reconciliation. + final VCFHeaderLine other = mergedMetaData.findEquivalentHeaderLine(line); + if (other != null && !line.equals(other)) { + if (key.equals(VCFConstants.FORMAT_HEADER_KEY)) { + // Delegate to the FORMAT line resolver + mergedMetaData.addMetaDataLine( + VCFFormatHeaderLine.getMergedFormatHeaderLine( + (VCFFormatHeaderLine) line, + (VCFFormatHeaderLine) other, + conflictWarner) + ); + } else if (key.equals(VCFConstants.INFO_HEADER_KEY)) { + // Delegate to the INFO line resolver + mergedMetaData.addMetaDataLine( + VCFInfoHeaderLine.getMergedInfoHeaderLine( + (VCFInfoHeaderLine) line, + (VCFInfoHeaderLine) other, + conflictWarner) + ); + } else if (line.isIDHeaderLine()) { + // equivalent ID header line, but not a compound(format/info) line, and also not strictly equal + // to the existing line: preserve the existing line (this *may* drop attributes/values if the + // dropped line has additional attributes) + conflictWarner.warn( + String.format("Dropping duplicate header line %s during header merge, retaining equivalent line %s", + line, + other)); + } else { + // a non-structured line with a duplicate key of an existing line, but a different value, + // retain the new line in addition to the old one + mergedMetaData.addMetaDataLine(line); + } + } else { + mergedMetaData.addMetaDataLine(line); + } + } + } + return makeMergedMetaDataSet(mergedMetaData, newestVersion, commonSequenceDictionary, conflictWarner); + } + + // Create the final set of all of our merged header lines. 
Start with the version line for the new + // version, add in the lines from the merged set, use the resulting list to create a header, add the common + // sequence dictionary to that, and then extract and return the resulting set of lines in sorted order + private static Set makeMergedMetaDataSet( + final VCFMetaDataLines mergedMetaData, + final VCFHeaderVersion newestVersion, + final SAMSequenceDictionary commonSequenceDictionary, + final HeaderMergeConflictWarnings conflictWarner) { + + if (conflictWarner.emitWarnings) { + mergedMetaData.getValidationErrors(newestVersion) + .forEach(validationError -> conflictWarner.warn(validationError.getFailureMessage())); + } + + final Set mergedLines = VCFHeader.makeHeaderVersionLineSet(newestVersion); + mergedLines.addAll(mergedMetaData.getMetaDataInInputOrder()); + final VCFHeader mergedHeader = new VCFHeader(mergedLines, Collections.emptySet()); + if (commonSequenceDictionary != null) { + mergedHeader.setSequenceDictionary(commonSequenceDictionary); + } else { + conflictWarner.warn( + "The header lines resulting from a header merge contain no contig lines because none " + + "of the input headers contains a sequence dictionary."); + } + + return new LinkedHashSet<>(mergedHeader.getMetaDataInSortedOrder()); + } + + // Find the newest version af any header in the input set, and return that to use as the target + // version for the merged lines. 
+ private static VCFHeaderVersion getNewestHeaderVersion(final Collection vcfHeaders) { + VCFHeaderVersion newestVersion = null; + for (final VCFHeader header : vcfHeaders) { + final VCFHeaderVersion vcfVersion = header.getVCFHeaderVersion(); + if (!vcfVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { + throw new TribbleException(String.format( + "Cannot merge a VCFHeader with version (%s) that is older than version %s", + header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2)); + } + if (newestVersion == null || (vcfVersion.ordinal() > newestVersion.ordinal())) { + newestVersion = vcfVersion; + } + } + return newestVersion; + } + + // Create a common sequence dictionary from the set of dictionaries in VCFHeaders. The headers must + // either have identical dictionaries, or contain a common superset dictionary where individual dictionaries + // contain a dictionary that is subset of that common superset. Otherwise throw. + private static SAMSequenceDictionary getCommonSequenceDictionaryOrThrow( + final Collection headers, + final HeaderMergeConflictWarnings conflictWarner) { + SAMSequenceDictionary candidateDictionary = null; + + // Because we're doing pairwise comparisons and always selecting the best dictionary as + // our running candidate, we need to visit the headers in order of dictionary size + // (largest first). This prevents a premature failure where an individual pairwise + // comparison erroneously fails because the source is pairwise incompatible with the + // running candidate, and the common superset exists but we just haven't seen it yet. 
+ final List headersByDictionarySize = new ArrayList<>(headers); + headersByDictionarySize.sort(((Comparator) + (hdr1, hdr2) -> Integer.compare(getDictionarySize(hdr1), getDictionarySize(hdr2))).reversed()); + + for ( final VCFHeader sourceHeader : headersByDictionarySize ) { + final SAMSequenceDictionary sourceDictionary = sourceHeader.getSequenceDictionary(); + if (sourceDictionary != null) { + if (candidateDictionary == null) { + candidateDictionary = sourceDictionary; + } else { + // first, compare with checkContigOrdering on + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility compatibility = + SAMSequenceDictionaryUtils.compareDictionaries( + candidateDictionary, + sourceDictionary, + true); + switch (compatibility) { + case IDENTICAL: // existing candidateDictionary is identical to sourceDictionary, so keep it + case SUPERSET: // existing candidateDictionary is a superset of sourceDictionary, so keep it + break; + + case COMMON_SUBSET: // fall through + case DIFFERENT_INDICES: + // There exists a common subset of contigs, but for merging purposes we have a slightly + // stricter requirement, that one dictionary is a superset of the other. So try the + // comparison again with checkContigOrdering off, in both directions. If one is a + // superset of the other, retain the superset. 
+ if (SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.SUPERSET == + SAMSequenceDictionaryUtils.compareDictionaries( + candidateDictionary, + sourceDictionary, + false)) { + break; // keep our candidate + } else if (SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.SUPERSET == + SAMSequenceDictionaryUtils.compareDictionaries( + sourceDictionary, + candidateDictionary, + false)) { + candidateDictionary = sourceDictionary; // take the sourceDictionary as the new candidate + } else { + // dictionaries are disjoint, and we have no basis to choose a merge order for the + // non-common contigs, so give up + throw new TribbleException( + createHeaderDictionaryFailureMessage( + candidateDictionary, sourceHeader, sourceDictionary, compatibility)); + } + break; + + case NO_COMMON_CONTIGS: // no overlap between dictionaries + case UNEQUAL_COMMON_CONTIGS: // common subset has contigs that have the same name but different lengths + case NON_CANONICAL_HUMAN_ORDER: // human reference detected but the order of the contigs is non-standard (lexicographic, for example) + case OUT_OF_ORDER: // the two dictionaries overlap but the overlapping contigs occur in different + default: + throw new TribbleException( + createHeaderDictionaryFailureMessage( + candidateDictionary, sourceHeader, sourceDictionary, compatibility)); + } + } + } else { + conflictWarner.warn( + String.format( + "Merging header with no sequence dictionary: %s", + getHeaderFragmentForDisplay(sourceHeader))); + } + } + return candidateDictionary; + } + + private static Integer getDictionarySize(final VCFHeader hdr) { + final SAMSequenceDictionary dictionary = hdr.getSequenceDictionary(); + return dictionary == null ? 
0 : dictionary.size(); + } + + private static String createHeaderDictionaryFailureMessage( + final SAMSequenceDictionary commonSequenceDictionary, + final VCFHeader sourceHeader, + final SAMSequenceDictionary sourceSequenceDictionary, + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility failureReason) { + // return a nice long message that includes as much of the offending context as is reasonable, + // without printing the entire context, since the headers and sequence dictionaries can have + // thousands of entries + return String.format( + "Can't merge VCF headers with incompatible sequence dictionaries, merge failed due to %s:" + + "\n\nHeader dictionary:\n\n%1.2000s\n\nis incompatible with the common dictionary:\n\n%1.2000s\n\n merging VCF header:\n\n%1.2000s\n", + failureReason, + sourceSequenceDictionary.getSequences().stream().map(SAMSequenceRecord::toString).collect(Collectors.joining("\n")), + commonSequenceDictionary.getSequences().stream().map(SAMSequenceRecord::toString).collect(Collectors.joining("\n")), + getHeaderFragmentForDisplay(sourceHeader)); + } + + private static String getHeaderFragmentForDisplay(final VCFHeader sourceHeader) { + return sourceHeader.getContigLines().stream().map(VCFContigHeaderLine::toString).collect(Collectors.joining("\n")); + } + + /** Only displays a warning if warnings are enabled and an identical warning hasn't been already issued */ + static final class HeaderMergeConflictWarnings { + boolean emitWarnings; + final Set alreadyIssued = new HashSet<>(); + + protected HeaderMergeConflictWarnings(final boolean emitWarnings ) { + this.emitWarnings = emitWarnings; + } + + public void warn(final String msg) { + if ( emitWarnings && ! 
alreadyIssued.contains(msg) ) { + alreadyIssued.add(msg); + VCFHeader.logger.warn(msg); + } + } + } +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java index 43f43c65c3..ce5ed1920a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderVersion.java @@ -26,6 +26,7 @@ package htsjdk.variant.vcf; import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; /** * information that identifies each header version @@ -47,7 +48,7 @@ public enum VCFHeaderVersion { * @param vString the version string * @param fString the format string */ - VCFHeaderVersion(String vString, String fString) { + VCFHeaderVersion(String vString, String fString) { this.versionString = vString; this.formatString = fString; } @@ -67,7 +68,8 @@ public static VCFHeaderVersion toHeaderVersion(String version) { /** * are we a valid version string of some type - * @param version the version string + * @param version the version string (the part of the header line that specifies the version, + * i.e., "VCFv4.3" if the line is "##fileformat=VCFv4.3") * @return true if we're valid of some type, false otherwise */ public static boolean isVersionString(String version){ @@ -75,7 +77,8 @@ public static boolean isVersionString(String version){ } /** - * are we a valid format string for some type + * are we a valid format string for some type (the key part of the header line that specifies a version, + * i.e., "fileformat" if the line is "##fileformat=VCFv4.3") * @param format the format string * @return true if we're valid of some type, false otherwise */ @@ -87,8 +90,16 @@ public static boolean isFormatString(String format){ return false; } - public static VCFHeaderVersion getHeaderVersion(String versionLine) { - String[] lineFields = versionLine.split("="); + /** + * + * @param versionLine a VCF header version line, including the leading meta data indicator, + 
* for example "##fileformat=VCFv4.2" + * @return the VCFHeaderVersion for this string + * @throws TribbleException.InvalidHeader if the string is not a version string for a recognized supported version + */ + public static VCFHeaderVersion fromHeaderVersionLine(final String versionLine) { + ValidationUtils.nonNull(versionLine, "version line"); + final String[] lineFields = versionLine.split("="); if ( lineFields.length != 2 || !isFormatString(lineFields[0].substring(2)) ) throw new TribbleException.InvalidHeader(versionLine + " is not a valid VCF version line"); @@ -98,6 +109,13 @@ public static VCFHeaderVersion getHeaderVersion(String versionLine) { return toHeaderVersion(lineFields[1]); } + /** + * @return A VCF "##fileformat=version" metadata string for the supplied version. + */ + public String toHeaderVersionLine() { + return String.format("%s%s=%s", VCFHeader.METADATA_INDICATOR, getFormatString(), getVersionString()); + } + /** * Utility function to clean up a VCF header string * @@ -118,6 +136,20 @@ public boolean isAtLeastAsRecentAs(final VCFHeaderVersion target) { return this.ordinal() >= target.ordinal(); } + /** + * Determine if two header versions are compatible (header lines from these versions are interchangeable). + * For now, the only incompatibility is between V4.3 and any other version. All other version combinations + * are compatible. 
+ * @param v1 first version to compare + * @param v2 second version to compare + * @return true if the versions are compatible + */ + //TODO: this method can be removed once this is rebased on the vcf4.3 writing branch + public static boolean versionsAreCompatible(final VCFHeaderVersion v1, final VCFHeaderVersion v2) { + return v1.equals(v2) || + (!v1.isAtLeastAsRecentAs(VCF4_3) && !v2.isAtLeastAsRecentAs(VCF4_3)); + } + public String getVersionString() { return versionString; } @@ -125,4 +157,5 @@ public String getVersionString() { public String getFormatString() { return formatString; } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index 13df34bc87..12a29a1f6c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -26,44 +26,90 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.utils.ValidationUtils; + /** - * @author ebanks *

* Class VCFInfoHeaderLine *

*

- * A class representing a key=value entry for INFO fields in the VCF header + * A class representing an INFO field in the VCF header *

*/ public class VCFInfoHeaderLine extends VCFCompoundHeaderLine { - public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); - } + private static final long serialVersionUID = 1L; + + protected final static Log logger = Log.getInstance(VCFInfoHeaderLine.class); public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description) { - super(name, count, type, description, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + } + + public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description) { + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); } public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, String source, String version) { - super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + this.updateGenericField(SOURCE_ATTRIBUTE, source); + this.updateGenericField(VERSION_ATTRIBUTE, version); } public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, String source, String version) { - super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + super(VCFConstants.INFO_HEADER_KEY, name, count, type, description); + this.updateGenericField(SOURCE_ATTRIBUTE, source); + this.updateGenericField(VERSION_ATTRIBUTE, version); } public VCFInfoHeaderLine(String line, VCFHeaderVersion version) { - super(line, version, SupportedHeaderLineType.INFO); + super(VCFConstants.INFO_HEADER_KEY, + VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), + version + ); + validateForVersion(version); } - // info fields allow flag values - @Override - boolean allowFlagValues() { - return true; + /** + * Compare two
VCFInfoHeaderLine objects to determine if they have compatible number types, and return a + * VCFInfoHeaderLine that represents the result of merging these two lines. + * + * @param infoLine1 first info line to merge + * @param infoLine2 second info line to merge + * @param conflictWarner conflict warning emitter + * @return a merged VCFInfoHeaderLine + */ + public static VCFInfoHeaderLine getMergedInfoHeaderLine( + final VCFInfoHeaderLine infoLine1, + final VCFInfoHeaderLine infoLine2, + final VCFHeaderMerger.HeaderMergeConflictWarnings conflictWarner) + { + ValidationUtils. nonNull(infoLine1); + ValidationUtils. nonNull(infoLine2); + ValidationUtils. nonNull(conflictWarner); + + // delegate to the generic VCFCompoundHeaderLine merger, passing a resolver lambda + return VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + infoLine1, + infoLine2, + conflictWarner, + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); } @Override public boolean shouldBeAddedToDictionary() { return true; } + + @Override + //TODO: integrate this with the existing validateKeyOrID method + protected boolean validHeaderID(final String id) { + return super.validHeaderID(id) || id.equals(VCFConstants.THOUSAND_GENOMES_KEY); + } + } diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java new file mode 100644 index 0000000000..843fdf98cc --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -0,0 +1,525 @@ +package htsjdk.variant.vcf; + +import htsjdk.annotations.InternalAPI; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +/** + * Class for managing the set of VCFHeaderLines maintained by a VCFHeader. 
+ * + * Since this class is used to incrementally build up a set of header lines for use with a VCFHeader, + * it does not require that the list always contain a fileformat line (it's VCFHeader's job to enforce + * that condition). + * + * This class maintains several invariants: + * + * - The list keeps track of the "current version" by tracking whether a version line (a line that + * establishes the VCFHeaderVersion, such as format/fileformat line) is contained in the list. If + * no version line has been added, the list will have a null current version, and contain 0 version + * lines. If a version line has been added, it will have a non-null version, and contain 1 version line. + * If the version line is manually removed, the "current version" is reset to null. + * + * - Each contig line that is retained is guaranteed to have a unique contig index. This does + * NOT guarantee that the contig indices are contiguous, or ordered, only that they are unique. + * + * - Each structured (ID) line for a given key will have a unique ID. Any new line that has the same + * key/ID pair as an existing line will replace the previous line. (Previous htsjdk implementations + * preserve such lines in a master line list, but would silently drop them from the typed + * lookup lists, so such duplicates would never be returned in queries for typed lines such as + * getInfoHeaderLines(), but would still be serialized on write.) + * + * This class does NOT validate that the lines contained are valid for the current version (that is + * the caller's responsibility).
+ */ +//Visible to allow disq Kryo registration for serialization +@InternalAPI +final class VCFMetaDataLines implements Serializable { + public static final long serialVersionUID = 1L; + protected final static Log logger = Log.getInstance(VCFMetaDataLines.class); + + // Master map of all header lines (including file format version lines and contig header lines) + private final Map mMetaData = new LinkedHashMap<>(); + + // Map of contig index to contig header line. Must be kept in sync with the mMetaData map + private final Map contigIndexMap = new LinkedHashMap<>(); + + // Current version for lines included in the list. May be null. Must be kept in sync with the + // contents of the mMetaData map. + private VCFHeaderVersion vcfVersion; + + /** + * Add all metadata lines from Set. If a duplicate line is encountered (duplicate content for + * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only + * the new line will be retained. + * + * @param newMetaData Set of lines to be added to the list. + * @throws IllegalArgumentException if a version is established or if any line fails validation for that version + */ + public void addMetaDataLines(final Set newMetaData) { + newMetaData.forEach(this::addMetaDataLine); + } + + /** + * Add a metadata line to the list. If a duplicate line is encountered (duplicate content for + * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only + * the newest line will be retained. 
+ * + * @param newMetaDataLine header line to attempt to add + * @return an existing (equivalent) header line that was replaced by newMetaDataLine, if any, + * otherwise null + */ + public VCFHeaderLine addMetaDataLine(final VCFHeaderLine newMetaDataLine) { + ValidationUtils.nonNull(newMetaDataLine, "metadata line"); + + if (VCFHeaderVersion.isFormatString(newMetaDataLine.getKey())) { + // for format lines, we need to remove any existing format line (which may have a different key + // than the new line, since old VCF versions use a different format key than modern versions) + return updateVersion(newMetaDataLine); + } else { + // otherwise, see if there is an equivalent line that the new line will replace + final HeaderLineMapKey newMapKey = makeKeyForLine(newMetaDataLine); + final VCFHeaderLine equivalentMetaDataLine = mMetaData.get(newMapKey); + if (equivalentMetaDataLine == null) { + createNewMapEntry(newMapKey, newMetaDataLine); + } else { + replaceExistingMapEntry(newMapKey, equivalentMetaDataLine, newMetaDataLine); + } + return equivalentMetaDataLine; + } + } + + /** + * Remove a metadata line from the list. This is the inverse of addMetaDataLine - it removes a + * line that has an identical key and value as lineToRemove if lineToRemove is an unstructured (non-ID) line, + * but if lineToRemove is a structured line, it will remove the line that has the same key/ID pair as + * lineToRemove, regardless of other content. + * + * The removed value is returned, and can be used by the caller to determine if the removed line has a + * different value than the line presented.
+ * + * @param lineToRemove the header line to remove + * @return The actual headerline removed, or null if no equivalent headerline was found to remove + */ + public VCFHeaderLine removeMetaDataLine(final VCFHeaderLine lineToRemove) { + final VCFHeaderLine removedLine = mMetaData.remove(makeKeyForLine(lineToRemove)); + if (removedLine != null) { + // only synchronize the dependent version and contig map variables if a line was ACTUALLY removed (keyed off the removed line itself, since lineToRemove may only be equivalent to it, not equal) + if (VCFHeaderVersion.isFormatString(removedLine.getKey())) { + vcfVersion = null; + } else if (removedLine.isIDHeaderLine() && removedLine.getKey().equals(VCFHeader.CONTIG_KEY)) { + removeFromContigIndexMap((VCFContigHeaderLine) removedLine); + } + } + return removedLine; + } + + /** + * @return the version for any contained version line. may be null if no file format version + * line is in the list + */ + public VCFHeaderVersion getVCFVersion() { + return vcfVersion; + } + + /** + * Return the existing line from the list that is "equivalent" to the query line, where + * equivalent is defined as having the same key and value for unstructured header lines, or the + * same key and ID, but not necessarily the same value (for structured header lines). The + * "equivalent" line returned by this method is not guaranteed to be equal to the queryLine, + * in the case where the queryLine is an ID line. + * + * The method is a way to ask "if the queryLine were added to this object via addMetaDataLine, what + * line, if any, would it replace". + * + * @param queryLine the source line to use to check for equivalents + * @return The existing header line of the type/key provided, otherwise NULL. + */ + public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) { + return mMetaData.get(makeKeyForLine(queryLine)); + } + + /** + * Validate all metadata lines except the file format line against a target version.
+ * Throws {@link TribbleException.VersionValidationFailure} if any line is incompatible with the given version. + * @param targetVersion the target version to validate against + * @throws TribbleException if any existing line fails to validate against {@code targetVersion} + */ + //TODO: we need to tell users how to resolve the case where this fails due to version validation + //i.e, use a custom upgrade tool + public void validateMetaDataLines(final VCFHeaderVersion targetVersion) { + mMetaData.values().forEach(headerLine -> { + if (!VCFHeaderVersion.isFormatString(headerLine.getKey())) { + headerLine.validateForVersion(targetVersion); + } + }); + } + + /** + * Get a list of validation failures for all metadata lines (except the file format line) against + * a target version. + * + * @param targetVersion the target version to validate against + * @return an Collection describing the lines that failed to validate + * incompatible with targetVersion. The collections is empty if validation succeeded for all lines. 
+ */ + public Collection getValidationErrors(final VCFHeaderVersion targetVersion) { + return mMetaData.values().stream() + .filter(line -> !VCFHeaderVersion.isFormatString(line.getKey())) + .map(l -> l.getValidationFailure(targetVersion)) + .filter(o -> o.isPresent()) + .map(o -> o.get()) + .collect(Collectors.toList()); + } + + /** + * get the meta data, associated with this header, in input order + * + * @return a set of the meta data + */ + public Set getMetaDataInInputOrder() { + return Collections.unmodifiableSet(new LinkedHashSet<>(mMetaData.values())); + } + + /** + * get the meta data, associated with this header, in SORTED order + * + * @return a set of the meta data + */ + public Set getMetaDataInSortedOrder() { + // Use an intermediate TreeSet to get the correct sort order (via the header line + // comparators), but return an (unmodifiable) LinkedHashSet because TreeSet has a + // `contains` implementation based on comparator equality that can lead to inconsistent + // results for header line types like VCFContigHeaderLine that have a compareTo + // implementation that is inconsistent with equals. 
+ return Collections.unmodifiableSet(new LinkedHashSet<>(new TreeSet<>(mMetaData.values()))); + } + + /** + * @return all of the structured (ID) lines in their original file order, or an empty list if none were present + */ + public List getIDHeaderLines() { + return mMetaData.values().stream() + .filter(VCFHeaderLine::isIDHeaderLine) + .map(hl -> (VCFSimpleHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @return all of the VCF FILTER lines in their original file order, or an empty list if none were present + */ + public List getFilterLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY)) + .map(hl -> (VCFFilterHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @return all of the VCF header lines of the ##contig form in SORTED order, or an empty list if none were present + */ + public List getContigLines() { + return Collections.unmodifiableList(new ArrayList<>(new TreeSet<>(contigIndexMap.values()))); + } + + /** + * Get the VCFHeaderLine(s) whose key equals key. 
Returns an empty collection if no such line exists + * @param key the VCFHeaderLine key to use to locate the headerline + * @return collection of VCFHeaderLine + */ + public Collection getMetaDataLines(final String key) { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(key)).collect(Collectors.toList()); + } + + /** + * Returns the INFO VCFHeaderLine in their original ordering + */ + public Collection getInfoHeaderLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.INFO_HEADER_KEY)) + .map(hl -> (VCFInfoHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * Returns the FORMAT VCFHeaderLine in their original ordering + */ + public Collection getFormatHeaderLines() { + return mMetaData.values().stream() + .filter(hl -> hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY)) + .map(hl -> (VCFFormatHeaderLine) hl) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * @param id the id of the requested header line + * @return the VCFHeaderLine info line, or null if there is none + */ + public VCFInfoHeaderLine getInfoHeaderLine(final String id) { + return (VCFInfoHeaderLine) mMetaData.get(makeKey(VCFConstants.INFO_HEADER_KEY, id)); + } + + /** + * @param id the id of the requested header format line + * @return the meta data line, or null if there is none + */ + public VCFFormatHeaderLine getFormatHeaderLine(final String id) { + return (VCFFormatHeaderLine) mMetaData.get(makeKey(VCFConstants.FORMAT_HEADER_KEY, id)); + } + + /** + * @param id the id of the requested header line + * @return the meta data line, or null if there is none + */ + public VCFFilterHeaderLine getFilterHeaderLine(final String id) { + return (VCFFilterHeaderLine) mMetaData.get(makeKey(VCFConstants.FILTER_HEADER_KEY, id)); + } + + /** + * Returns the other VCFHeaderLines in their original ordering, where "other" means any + * VCFHeaderLine that is not a contig, info, format or filter header line.
+ */ + public Collection getOtherHeaderLines() { + return mMetaData.values().stream().filter( + hl -> + !hl.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.INFO_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.FILTER_HEADER_KEY) && + !hl.getKey().equals(VCFConstants.FORMAT_HEADER_KEY) + ) + .collect(Collectors.toCollection(ArrayList::new)); + } + + /** + * The version/fileformat header line if one exists, otherwise null. + * @return The version/fileformat header line if one exists, otherwise null. + */ + public VCFHeaderLine getFileFormatLine() { + // find any existing version line(s). since there are multiple possible keys that + // represent version lines (old V3 specs used "format" instead of "fileformat") + final List existingVersionLines = mMetaData.values() + .stream() + .filter(line -> VCFHeaderVersion.isFormatString(line.getKey())) + .collect(Collectors.toList()); + + // This class doesn't mandate that the list it maintains always contains a fileformat line + // (its VCFHeader's job to maintain that condition for the header). 
+ if (!existingVersionLines.isEmpty()) { + if (existingVersionLines.size() > 1) { + throw new IllegalStateException( + String.format("The metadata lines class contains more than one version line (%s)", + existingVersionLines.stream() + .map(VCFHeaderLine::toString) + .collect(Collectors.joining(",")))); + } + return existingVersionLines.get(0); + } else { + return null; + } + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder(); + b.append("[VCFMetaDataLines:"); + for ( final VCFHeaderLine line : mMetaData.values() ) + b.append("\n\t").append(line); + return b.append("\n]").toString(); + } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (!(o instanceof VCFMetaDataLines)) return false; + + final VCFMetaDataLines that = (VCFMetaDataLines) o; + + return mMetaData.equals(that.mMetaData); + } + + @Override + public int hashCode() { + return mMetaData.hashCode(); + } + + /** + * Generate a unique key for a VCFHeaderLine. If the header line is a VCFStructuredHeaderLine, the key + * is the concatenation of the VCFHeaderLine's key (i.e., the type of the VCFHeaderLine) and the ID for + * that VCFHeaderLine (with a ":" separator). Otherwise, we use the concatenation of the OTHER_KEY, the + * VCFHeaderLine's key, and a nonce value to ensure that unstructured lines never collide with structured + * lines, and also can have duplicate identical instances. + * + * @param headerLine the {@link VCFHeaderLine} for which a key should be returned + * @return the generated HeaderLineMapKey + */ + private HeaderLineMapKey makeKeyForLine(final VCFHeaderLine headerLine) { + if (headerLine.isIDHeaderLine()) { + // these are required to have a unique ID, so use the line key as the key, and the id as the constraint + return makeKey(headerLine.getKey(), headerLine.getID()); + } else { + // Allow duplicate unstructured "other" keys, as long as they have different values. 
Use + // the line key as the key, and the line hashcode as the constraint. + // + // The previous implementation dropped duplicate keys for unstructured lines, but the spec doesn't + // require these to be unique (only to have unique values). This implementation is more permissive in + // that it allows lines with duplicate keys to accumulate as long as they have different values, but + // retains only one with a unique value. + return makeKey(headerLine.getKey(), Integer.toString(headerLine.hashCode())); + } + } + + // Create a VCFHeaderLine hashmap key given a key and an id + private HeaderLineMapKey makeKey(final String nameSpace, final String id) { return new HeaderLineMapKey(nameSpace, id); } + + private void createNewMapEntry(final HeaderLineMapKey newMapKey, final VCFHeaderLine newMetaDataLine) { + // for creation of a new entry, call updateMapEntry, but validate that it ALWAYS returns null (no previous entry for the key) + final VCFHeaderLine existingLine = updateMapEntry(newMapKey, newMetaDataLine); + if (existingLine != null ) { + throw new TribbleException(String.format( + "Internal header synchronization error - found unexpected previous value %s while adding %s", + existingLine, + newMetaDataLine)); + } + } + + private VCFHeaderLine updateMapEntry(final HeaderLineMapKey newMapKey, final VCFHeaderLine newMetaDataLine) { + final VCFHeaderLine existingLine = mMetaData.put(newMapKey, newMetaDataLine); + if (newMetaDataLine.isIDHeaderLine() && newMetaDataLine.getKey().equals(VCFHeader.CONTIG_KEY)) { + addToContigIndexMap((VCFContigHeaderLine) newMetaDataLine); + } + return existingLine; + } + + // We can't just blindly replace a line in the map based on the key using map.put, because the contig + // map will get out of sync if the line being replaced is a contig line that has a different contig + // index than the line being replaced.
So replace the line in two atomic operations; first remove + // the old line and its corresponding contig index entry, then add the new contig line and its + // corresponding contig index entry. + private VCFHeaderLine replaceExistingMapEntry( + final HeaderLineMapKey newMapKey, + final VCFHeaderLine existingMetaDataLine, + final VCFHeaderLine newMetaDataLine) { + removeFromMapOrThrow(existingMetaDataLine); + logger.debug(() -> + "Replacing existing header metadata line: " + + existingMetaDataLine.toStringEncoding() + + " with header metadata line: " + + newMetaDataLine.toStringEncoding() + + "."); + createNewMapEntry(newMapKey, newMetaDataLine); + return existingMetaDataLine; + } + + // remove a line that is expected to be currently in the list, and throw if the line + // isn't found, or if the removed line is different (not equal to) the line to remove + private void removeFromMapOrThrow(final VCFHeaderLine lineToRemove) { + final VCFHeaderLine removedLine = removeMetaDataLine(lineToRemove); + if (removedLine == null || !removedLine.equals(lineToRemove)) { + // sanity check since in this case there should ALWAYS be a non-null line that was removed + // that is an exact duplicate of the "existingLine" + throw new TribbleException(String.format("Internal header synchronization error %s/%s", + lineToRemove, + removedLine == null ?
"null line" : removedLine)); + } + } + + //add the new line to our contig index map + private void addToContigIndexMap(final VCFContigHeaderLine newContigLine) { + final VCFContigHeaderLine collidingContigLine = contigIndexMap.get(newContigLine.getContigIndex()); + if (collidingContigLine != null && !collidingContigLine.equals(newContigLine)) { + if (collidingContigLine.getID().equals(newContigLine.getID())) { + // the new line has the same contig ID and index as an existing line, but differ in + // some other attribute, so accept it but log a warning + logger.warn(String.format( + "Replacing an existing contig header line (%s) with a new, similar line that has different attributes (%s)", + collidingContigLine, + newContigLine)); + } else { + // the new contig line collides with an existing contig index, but specifies a different + // contig name, so reject it + throw new TribbleException(String.format( + "Attempt to replace a contig header line (%s) that has the same contig index as an existing line (%s)", + newContigLine, + collidingContigLine)); + } + } + contigIndexMap.put(newContigLine.getContigIndex(), newContigLine); + } + + // remove the contig header line from the contig index map + private void removeFromContigIndexMap(final VCFContigHeaderLine existingContigLine) { + // this remove overload only removes the specified object if its actually in the map + contigIndexMap.remove(existingContigLine.getContigIndex(), existingContigLine); + } + + // First, check for existing header lines that establish a header version. Whenever a new one is + // added, we need to remove the previous version line, validate all remaining lines against the new + // version, then add the new version line, and update our version state. We have to explicitly + // call isFormatString, and manually update the lines, since there is more than one header line key + // that can change the version. 
In some cases this will result in removing a line fileformat/version + // line with one key and replacing it with a line that has a different key. + private final VCFHeaderLine updateVersion(final VCFHeaderLine newMetaDataLine) { + ValidationUtils.validateArg( + VCFHeaderVersion.isFormatString(newMetaDataLine.getKey()), + "a file format line is required"); + + final VCFHeaderLine currentVersionLine = getFileFormatLine(); + final VCFHeaderVersion newVCFVersion = VCFHeaderVersion.toHeaderVersion(newMetaDataLine.getValue()); + + if (vcfVersion == null) { + logger.debug("Establishing header metadata version ", newVCFVersion); + } else if (!newVCFVersion.equals(vcfVersion)) { + logger.debug(() -> + "Updating header metadata version from " + + vcfVersion + + " to " + + newVCFVersion); + removeFromMapOrThrow(currentVersionLine); + } + + mMetaData.put(makeKeyForLine(newMetaDataLine), newMetaDataLine); + vcfVersion = newVCFVersion; + return currentVersionLine; + } + + // composite keys used by the metadata lines map + private static class HeaderLineMapKey implements Serializable { + public static final long serialVersionUID = 1L; + + final String key; + final String constraint; + + public HeaderLineMapKey(final String key, final String constraint) { + this.key = key; + this.constraint = constraint; + } + + public final String getKey() { return key; } + public final String getConstraint() { return constraint; } + + @Override + public boolean equals(final Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + final HeaderLineMapKey that = (HeaderLineMapKey) o; + + if (!key.equals(that.key)) return false; + return constraint.equals(that.constraint); + } + + @Override + public int hashCode() { + int result = key.hashCode(); + result = 31 * result + constraint.hashCode(); + return result; + } + } + +} + diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java 
index 991faa806f..d8cd83b8bb 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java @@ -1,13 +1,41 @@ package htsjdk.variant.vcf; +import java.util.Map; +import java.util.Optional; + /** - * A class representing META fields in the VCF header + * A class representing META fields in the VCF header. */ public class VCFMetaHeaderLine extends VCFSimpleHeaderLine { private static final long serialVersionUID = 1L; public VCFMetaHeaderLine(final String line, final VCFHeaderVersion version) { - super(VCFConstants.META_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. + super(VCFConstants.META_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFMetaHeaderLine(final Map mapping) { + super(VCFConstants.META_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + return Optional.of( + new VCFValidationFailure<>( + vcfTargetVersion, + this, + String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ))); + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java index 33f163e8dc..f5bd71c474 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java @@ -1,13 +1,51 @@ package htsjdk.variant.vcf; +import 
java.util.Map; +import java.util.Optional; + /** - * A class representing PEDIGREE fields in the VCF header + * A class representing PEDIGREE fields in the VCF header. Applicable starting with version VCFv4.3. + * + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= + * ##PEDIGREE= */ public class VCFPedigreeHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; public VCFPedigreeHeaderLine(String line, VCFHeaderVersion version) { - super(VCFConstants.PEDIGREE_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. + super(VCFConstants.PEDIGREE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFPedigreeHeaderLine(final Map mapping) { + super(VCFConstants.PEDIGREE_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + // previous to VCFv4.3, the PEDIGREE line did not have an ID. Such lines are not modeled by this + // class (since it is derived from VCFSimpleHeaderLine). Therefore instances of this class always + // represent VCFv4.3 or higher. So throw if the requested version is less than 4.3. 
+ final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(new VCFValidationFailure<>(vcfTargetVersion, this, message)); + } else { + logger.warn(message); + } + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java index 8fe9b67d6d..cbefb13237 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFRecordCodec.java @@ -27,8 +27,7 @@ public VCFRecordCodec(final VCFHeader header) { public VCFRecordCodec(final VCFHeader header, final boolean allowMissingFieldsInHeader) { this.vcfEncoder = new VCFEncoder(header, allowMissingFieldsInHeader, false); - // Explicitly set the version because it's not available in the header itself. - this.vcfDecoder.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + this.vcfDecoder.setVCFHeader(header); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java index 973a976baa..7c45e9a1b2 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java @@ -1,13 +1,42 @@ package htsjdk.variant.vcf; +import java.util.Map; +import java.util.Optional; + /** - * A class representing SAMPLE fields in the VCF header */ public class VCFSampleHeaderLine extends VCFSimpleHeaderLine { + private static final long serialVersionUID = 1L; public VCFSampleHeaderLine(String line, VCFHeaderVersion version) { - super(VCFConstants.SAMPLE_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, null)); + // We need to use the V4 parser directly, since the V3 parser requires ALL permissible/expected + // tags to be supplied, which is inconsistent with modern structured header lines 
that allow + // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever + // called with a V3 version. + super(VCFConstants.SAMPLE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); + validateForVersion(version); + } + + public VCFSampleHeaderLine(final Map mapping) { + super(VCFConstants.SAMPLE_HEADER_KEY, mapping); + } + + @Override + public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { + final String message = String.format("%s header lines are not allowed in VCF version %s headers", + getKey(), + vcfTargetVersion + ); + if (VCFUtils.isStrictVCFVersionValidation()) { + return Optional.of(new VCFValidationFailure<>(vcfTargetVersion, this, message)); + } else { + logger.warn(message); + } + } + + return super.getValidationFailure(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java index 12b45e5bc9..c0a3abce5c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java @@ -1,5 +1,5 @@ /* -* Copyright (c) 2012 The Broad Institute +* Copyright (c) 2017 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation @@ -25,98 +25,120 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.util.Log; +import htsjdk.tribble.TribbleException; +import htsjdk.utils.ValidationUtils; + +import java.util.ArrayList; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; - +import java.util.stream.Collectors; /** - * @author ebanks - * - * A class representing a key=value entry for simple VCF header types + * An abstract class representing a VCF metadata line with a key and attribute=value pairs, one of + * which 
represents an ID. The key determines the "type" of the structured header line (e.g., contig, FILTER, + * INFO, ALT, PEDIGREE, META). + * + * The attribute/value pairs are ordered. The first entry in the map must be an ID attribute (used by the + * VCFHeader to ensure that no two structured header lines that share the same key in a given header have the + * same ID). */ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLine { - - private String name; - private Map genericFields = new LinkedHashMap(); + private static final long serialVersionUID = 1L; + protected static final Log logger = Log.getInstance(VCFSimpleHeaderLine.class); public static final String ID_ATTRIBUTE = "ID"; public static final String DESCRIPTION_ATTRIBUTE = "Description"; + public static final String SOURCE_ATTRIBUTE = "Source"; + public static final String VERSION_ATTRIBUTE = "Version"; + + // List of expected tags (for this base class, it's ID only; subclasses with more required tags + // should use a custom tag order if more required tags are expected) + protected static final List expectedTagOrder = Collections.unmodifiableList( + new ArrayList(1) {{ add(ID_ATTRIBUTE); }}); + + // Map used to retain the attribute/value pairs, in original order. The first entry in the map must be + // an ID field. The entire map must be immutable to prevent hash values from changing, since these are + // often stored in Sets. It's not ACTUALLY immutable in order to allow for special cases where subclasses + // have to be able to "repair" header lines (via a call to updateGenericField) during constructor validation. + // + // Otherwise the values here should never change during the lifetime of the header line.
+ private final Map genericFields = new LinkedHashMap(); /** - * create a VCF filter header line - * - * @param key the key for this header line - * @param name the name for this header line - * @param description description for this header line + * Constructor that accepts a key and a string that represents the rest of the line (after the "##KEY="). + * @param key the key to use for this line + * @param line the value part of the line + * @param version the target version to validate the line against */ - public VCFSimpleHeaderLine(String key, String name, String description) { - super(key, ""); - Map map = new LinkedHashMap(1); - map.put(DESCRIPTION_ATTRIBUTE, description); - initialize(name, map); + public VCFSimpleHeaderLine(final String key, final String line, final VCFHeaderVersion version) { + this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder)); + validate(); + validateForVersion(version); } /** - * create a VCF info header line - * - * @see #VCFSimpleHeaderLine(String, VCFHeaderVersion, String, List, List) VCFv4.2+ recommended tags support + * Key cannot be null or empty. * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line + * @param key key to use for this header line. can not be null.
+ * @param id id name to use for this line + * @param description string that will be added as a "Description" tag to this line */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering) { - this(line, version, key, expectedTagOrdering, Collections.emptyList()); + public VCFSimpleHeaderLine(final String key, final String id, final String description) { + super(key, ""); + genericFields.put(ID_ATTRIBUTE, id); + genericFields.put(DESCRIPTION_ATTRIBUTE, description); + validate(); } /** - * create a VCF info header line + * Key cannot be null or empty. + * + * Note that for attributes where the order is significant, use a LinkedHashMap + * to ensure that attribute order is honored. * - * @param line the header line - * @param version the vcf header version - * @param key the key for this header line - * @param expectedTagOrdering the tag ordering expected for this header line - * @param recommendedTags tags that are optional for this header line + * @param key key to use for this header line. can not be null. + * @param attributeMapping field mappings to use. may not be null. 
must contain an "ID" field to use as + * a unique id for this line */ - public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering, final List recommendedTags) { - this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering, recommendedTags)); + public VCFSimpleHeaderLine(final String key, final Map attributeMapping) { + super(key, ""); + ValidationUtils.nonNull(attributeMapping, "An attribute map is required for structured header lines"); + genericFields.putAll(attributeMapping); + validate(); } - public VCFSimpleHeaderLine(final String key, final Map mapping) { - super(key, ""); - name = mapping.get(ID_ATTRIBUTE); - initialize(name, mapping); + /** + * @return true if this is a structured header line (has a unique ID and multiple key/value pairs), + * otherwise false + */ + @Override + public boolean isIDHeaderLine() { return true; } + + /** + * Return the unique ID for this line. Returns null iff isIDHeaderLine is false. + * @return + */ + @Override + public String getID() { + return getGenericFieldValue(ID_ATTRIBUTE); } - /** - * Returns the String value associated with the given key. Returns null if there is no value. Key - * must not be null. - */ - String getGenericFieldValue(final String key) { - return this.genericFields.get(key); - } - - protected void initialize(String name, Map genericFields) { - if ( name == null || genericFields == null || genericFields.isEmpty() ) - throw new IllegalArgumentException(String.format("Invalid VCFSimpleHeaderLine: key=%s name=%s", super.getKey(), name)); - if ( name.contains("<") || name.contains(">") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); - if ( name.contains("=") ) - throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain an equals sign"); - - this.name = name; - this.genericFields.putAll(genericFields); + /** + * Returns the String value associated with the given key. 
Returns null if there is no value. Key + * must not be null. + */ + public String getGenericFieldValue(final String key) { + return this.genericFields.get(key); } - @Override - protected String toStringEncoding() { - Map map = new LinkedHashMap(); - map.put(ID_ATTRIBUTE, name); - map.putAll(genericFields); - return getKey() + "=" + VCFHeaderLine.toStringEncoding(map); + /** + * Returns an unmodifiable map of all attribute/value pairs for this header line. + */ + public Map getGenericFields() { + return Collections.unmodifiableMap(this.genericFields); } @Override @@ -129,28 +151,78 @@ public boolean equals( final Object o ) { } final VCFSimpleHeaderLine that = (VCFSimpleHeaderLine) o; - return name.equals(that.name) && - genericFields.equals(that.genericFields); + return genericFields.equals(that.genericFields); } @Override public int hashCode() { int result = super.hashCode(); - result = 31 * result + name.hashCode(); result = 31 * result + genericFields.hashCode(); return result; } + /** + * Create the string encoding of this line: the key followed by the attribute=value pairs in angle brackets + * @return a string, correctly formatted + */ @Override - public String getID() { - return name; + protected String toStringEncoding() { + //NOTE: this preserves/round-trips "extra" attributes such as SOURCE, VERSION, etc. + final StringBuilder builder = new StringBuilder(); + builder.append(getKey()); + builder.append("=<"); + builder.append(genericFields.entrySet().stream() + .map(e -> e.getKey() + "=" + quoteAttributeValueForSerialization(e.getKey(), e.getValue())) + .collect(Collectors.joining(","))); + builder.append('>'); + return builder.toString(); } + // Called by VCFInfoHeaderLine to allow repairing of VCFInfoLines that have a Flag type and a non-zero count + // (the combination of which is forbidden by the spec, but which we tolerate for backward compatibility with + // previous versions of htsjdk, which silently repaired these).
+ // + // Updates the named attribute in the existing generic fields map, in place. + protected void updateGenericField(final String attributeName, final String value) { + genericFields.put(attributeName, value); + } /** - * @return a map of all pairs of fields and values in this header line + * Return true if the attribute name requires quotes. + * @param attributeName name of the attribute being serialized + * @return boolean indicating whether the value should be enclosed in quotes during serialization */ - public Map getGenericFields() { - return Collections.unmodifiableMap(genericFields); + protected boolean getIsQuotableAttribute(final String attributeName) { + // the (VCFv4.3) spec says that the DESCRIPTION, SOURCE, and VERSION attributes should be quoted + // for INFO/FORMAT lines, but htsjdk seems to have historically quoted these for all structured + // header lines + return attributeName.equals(DESCRIPTION_ATTRIBUTE) || + attributeName.equals(SOURCE_ATTRIBUTE) || + attributeName.equals(VERSION_ATTRIBUTE); } - } + + private void validate() { + if ( genericFields.isEmpty() || !genericFields.keySet().stream().findFirst().get().equals(ID_ATTRIBUTE)) { + throw new TribbleException( + String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); + } + validateKeyOrID(getGenericFieldValue(ID_ATTRIBUTE), "ID"); + } + + // Add quotes around any attribute value that contains a space or comma, or is supposed to be quoted by + // definition per the spec (i.e., Description, Source, Version for INFO lines). + private String quoteAttributeValueForSerialization(final String attribute, final String originalValue) { + return originalValue.contains(",") || originalValue.contains(" ") || getIsQuotableAttribute(attribute) ?
+ "\""+ escapeQuotes(originalValue) + "\"" : + originalValue; + } + + private static String escapeQuotes(final String value) { + // java escaping in a string literal makes this harder to read than it should be + // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) + // ie replace: something that's not a backslash ([^\]) followed by a double quote + // with: the thing that wasn't a backslash ($1), followed by a backslash, followed by a double quote + return value.replaceAll("([^\\\\])\"", "$1\\\\\""); + } + +} \ No newline at end of file diff --git a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java index 6e9e713a20..0d61cf35e4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFStandardHeaderLines.java @@ -51,15 +51,21 @@ public class VCFStandardHeaderLines { /** * Enabling this causes us to repair header lines even if only their descriptions differ. */ - private final static boolean REPAIR_BAD_DESCRIPTIONS = false; - private static Standards formatStandards = new Standards(); - private static Standards infoStandards = new Standards(); + private static Standards formatStandards = new Standards<>(); + private static Standards infoStandards = new Standards<>(); /** * Walks over the VCF header and repairs the standard VCF header lines in it, returning a freshly * allocated {@link VCFHeader} with standard VCF header lines repaired as necessary. 
*/ public static VCFHeader repairStandardHeaderLines(final VCFHeader oldHeader) { + if (oldHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { + // the "repair" operation effectively upgrades old header lines to v4.2 format, + // but we don't "back-version" headers that are already newer than v4.2, so skip + // repair for newer headers + return oldHeader; + } + final Set newLines = new LinkedHashSet(oldHeader.getMetaDataInInputOrder().size()); for ( VCFHeaderLine line : oldHeader.getMetaDataInInputOrder() ) { if ( line instanceof VCFFormatHeaderLine ) { @@ -67,17 +73,17 @@ public static VCFHeader repairStandardHeaderLines(final VCFHeader oldHeader) { } else if ( line instanceof VCFInfoHeaderLine) { line = infoStandards.repair((VCFInfoHeaderLine) line); } - newLines.add(line); } + //NOTE that it's possible for this to fail in the (probably rare) case that the repaired + //lines (which are "version-less") fail validation against the header version final VCFHeader repairedHeader = new VCFHeader(newLines, oldHeader.getGenotypeSamples()); - final VCFHeaderVersion oldHeaderVersion = oldHeader.getVCFHeaderVersion(); - if (oldHeaderVersion != null && oldHeaderVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - // this needs to maintain version 4.3 (and not back-version to v4.2), so propagate - // the old version only for v4.3 - repairedHeader.setVCFHeaderVersion(oldHeaderVersion); - } + + // the "repair" operation effectively upgrades old header lines to v4.2 format, so the new header should + // reflect that since it may no longer conform to its original version + // new header reflects that + repairedHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); return repairedHeader; } @@ -159,9 +165,9 @@ private static void registerStandard(final VCFFormatHeaderLine line) { // static { // FORMAT lines - registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); -
registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); - registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + registerStandard(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); registerStandard(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); @@ -169,16 +175,16 @@ private static void registerStandard(final VCFFormatHeaderLine line) { registerStandard(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); // INFO lines - registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 
1, VCFHeaderLineType.Float, "Strand Bias")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); - registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + registerStandard(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 
0, VCFHeaderLineType.Flag, "Somatic event")); } private static class Standards { @@ -191,7 +197,7 @@ public T repair(final T line) { final boolean badCount = line.isFixedCount() && ! badCountType && line.getCount() != standard.getCount(); final boolean badType = line.getType() != standard.getType(); final boolean badDesc = ! line.getDescription().equals(standard.getDescription()); - final boolean needsRepair = badCountType || badCount || badType || (REPAIR_BAD_DESCRIPTIONS && badDesc); + final boolean needsRepair = badCountType || badCount || badType; if ( needsRepair ) { if ( GeneralUtils.DEBUG_MODE_ENABLED ) { diff --git a/src/main/java/htsjdk/variant/vcf/VCFUtils.java b/src/main/java/htsjdk/variant/vcf/VCFUtils.java index 6d0e2d7b68..3599da7edc 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFUtils.java +++ b/src/main/java/htsjdk/variant/vcf/VCFUtils.java @@ -25,110 +25,59 @@ package htsjdk.variant.vcf; +import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.FileExtensions; -import htsjdk.variant.utils.GeneralUtils; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import java.io.File; +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; import java.io.IOException; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; public class VCFUtils { private static final Pattern INF_OR_NAN_PATTERN = Pattern.compile("^(?[-+]?)((?(INF|INFINITY))|(?NAN))$", Pattern.CASE_INSENSITIVE); + private static final boolean DEFAULT_VCF_STRICT_VERSION_VALIDATION = true; - public static Set smartMergeHeaders(final Collection headers, final boolean 
emitWarnings) throws IllegalStateException { - // We need to maintain the order of the VCFHeaderLines, otherwise they will be scrambled in the returned Set. - // This will cause problems for VCFHeader.getSequenceDictionary and anything else that implicitly relies on the line ordering. - final LinkedHashMap map = new LinkedHashMap<>(); // from KEY.NAME -> line - final HeaderConflictWarner conflictWarner = new HeaderConflictWarner(emitWarnings); - final Set headerVersions = new HashSet<>(2); + // a global mutable static - is there an alternative ? + // there isn't any other reasonable place to keep this state + private static boolean vcfStrictVersionValidation = true; - // todo -- needs to remove all version headers from sources and add its own VCF version line - for (final VCFHeader source : headers) { - for (final VCFHeaderLine line : source.getMetaDataInSortedOrder()) { - - enforceHeaderVersionMergePolicy(headerVersions, source.getVCFHeaderVersion()); - String key = line.getKey(); - if (line instanceof VCFIDHeaderLine) - key = key + "-" + ((VCFIDHeaderLine) line).getID(); - - if (map.containsKey(key)) { - final VCFHeaderLine other = map.get(key); - if (line.equals(other)) { - // continue; - } else if (!line.getClass().equals(other.getClass())) { - throw new IllegalStateException("Incompatible header types: " + line + " " + other); - } else if (line instanceof VCFFilterHeaderLine) { - final String lineName = ((VCFFilterHeaderLine) line).getID(); - final String otherName = ((VCFFilterHeaderLine) other).getID(); - if (!lineName.equals(otherName)) - throw new IllegalStateException("Incompatible header types: " + line + " " + other); - } else if (line instanceof VCFCompoundHeaderLine) { - final VCFCompoundHeaderLine compLine = (VCFCompoundHeaderLine) line; - final VCFCompoundHeaderLine compOther = (VCFCompoundHeaderLine) other; - - // if the names are the same, but the values are different, we need to quit - if (!(compLine).equalsExcludingDescription(compOther)) { - if 
(compLine.getType().equals(compOther.getType())) { - // The Number entry is an Integer that describes the number of values that can be - // included with the INFO field. For example, if the INFO field contains a single - // number, then this value should be 1. However, if the INFO field describes a pair - // of numbers, then this value should be 2 and so on. If the number of possible - // values varies, is unknown, or is unbounded, then this value should be '.'. - conflictWarner.warn(line, "Promoting header field Number to . due to number differences in header lines: " + line + " " + other); - compOther.setNumberToUnbounded(); - } else if (compLine.getType() == VCFHeaderLineType.Integer && compOther.getType() == VCFHeaderLineType.Float) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - map.put(key, compOther); - } else if (compLine.getType() == VCFHeaderLineType.Float && compOther.getType() == VCFHeaderLineType.Integer) { - // promote key to Float - conflictWarner.warn(line, "Promoting Integer to Float in header: " + compOther); - } else { - throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other); - } - } - if (!compLine.getDescription().equals(compOther.getDescription())) - conflictWarner.warn(line, "Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine); - } else { - // we are not equal, but we're not anything special either - conflictWarner.warn(line, "Ignoring header line already in map: this header line = " + line + " already present header = " + other); - } - } else { - map.put(key, line); - } - } - } - - // returning a LinkedHashSet so that ordering will be preserved. Ensures the contig lines do not get scrambled. - return new LinkedHashSet<>(map.values()); - } + /** + * Determine if strict VCF version validation is enabled. Defaults to true. 
Strict version validation + * ensures that all VCF contents (header and variant contexts) conforms to the established header version. + * This should only be disabled when absolutely necessary. + * + * @return true if strict version validation is enabled + */ + public static boolean isStrictVCFVersionValidation() { return Defaults.STRICT_VCF_VERSION_VALIDATION; } - // Reject attempts to merge a VCFv4.3 header with any other version - private static void enforceHeaderVersionMergePolicy( - final Set headerVersions, - final VCFHeaderVersion candidateVersion) { - if (candidateVersion != null) { - headerVersions.add(candidateVersion); - if (headerVersions.size() > 1 && headerVersions.contains(VCFHeaderVersion.VCF4_3)) { - throw new IllegalArgumentException( - String.format("Attempt to merge version %s header with incompatible header version %s", - VCFHeaderVersion.VCF4_3.getVersionString(), - headerVersions.stream() - .filter(hv -> !hv.equals(VCFHeaderVersion.VCF4_3)) - .map(VCFHeaderVersion::getVersionString) - .collect(Collectors.joining(" ")))); - } - } + /** + * The headers passed in must be version >= 4.2 (older headers that are read in via AbstractVCFCodecs + * are "repaired" and stamped as VCF4.2 when they're read in). 
+ * + * @param headers the set of headers to merge + * @param emitWarnings true if warning should be emitted by the merge + * @return + * @throws {@link htsjdk.tribble.TribbleException} if any header has a version < vcfV4.2 + * @throws {@link htsjdk.tribble.TribbleException} if any header cannot be upgraded to the newest version amongst + * all headers provided + */ + public static Set smartMergeHeaders( + final Collection headers, + final boolean emitWarnings) { + return VCFHeaderMerger.getMergedHeaderLines(headers, emitWarnings); } /** @@ -149,8 +98,8 @@ public static Set withUpdatedContigsAsLines(final Set withUpdatedContigsAsLines(final Set oldLines, final File referenceFile, final SAMSequenceDictionary refDict, final boolean referenceNameOnly) { final Set lines = new LinkedHashSet<>(oldLines.size()); - for (final VCFHeaderLine line : oldLines) { - if (line instanceof VCFContigHeaderLine) + for ( final VCFHeaderLine line : oldLines ) { + if ( line.isIDHeaderLine() && line.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) ) continue; // skip old contig lines if (line.getKey().equals(VCFHeader.REFERENCE_KEY)) continue; // skip the old reference key @@ -184,17 +133,14 @@ public static List makeContigHeaderLines(final SAMSequenceD final File referenceFile) { final List lines = new ArrayList<>(); final String assembly = referenceFile != null ? 
getReferenceAssembly(referenceFile.getName()) : null; - for (final SAMSequenceRecord contig : refDict.getSequences()) - lines.add(makeContigHeaderLine(contig, assembly)); + for ( final SAMSequenceRecord contig : refDict.getSequences() ) + lines.add(new VCFContigHeaderLine(contig, assembly)); return lines; } + @Deprecated private static VCFContigHeaderLine makeContigHeaderLine(final SAMSequenceRecord contig, final String assembly) { - final Map map = new LinkedHashMap<>(3); - map.put("ID", contig.getSequenceName()); - map.put("length", String.valueOf(contig.getSequenceLength())); - if (assembly != null) map.put("assembly", assembly); - return new VCFContigHeaderLine(map, contig.getSequenceIndex()); + return new VCFContigHeaderLine(contig, assembly); } /** @@ -295,22 +241,4 @@ else if (refPath.contains("hg38")) return assembly; } - /** - * Only displays a warning if warnings are enabled and an identical warning hasn't been already issued - */ - private static final class HeaderConflictWarner { - boolean emitWarnings; - Set alreadyIssued = new HashSet<>(); - - private HeaderConflictWarner(final boolean emitWarnings) { - this.emitWarnings = emitWarnings; - } - - public void warn(final VCFHeaderLine line, final String msg) { - if (GeneralUtils.DEBUG_MODE_ENABLED && emitWarnings && !alreadyIssued.contains(line.getKey())) { - alreadyIssued.add(line.getKey()); - System.err.println(msg); - } - } - } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java b/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java new file mode 100644 index 0000000000..c6f0ad8708 --- /dev/null +++ b/src/main/java/htsjdk/variant/vcf/VCFValidationFailure.java @@ -0,0 +1,63 @@ +package htsjdk.variant.vcf; + +import htsjdk.utils.ValidationUtils; + +/** + * A class representing a VCF validation failure. 
+ * @param a type representing the object that is being validated + */ +class VCFValidationFailure { + private final VCFHeaderVersion targetVersion; + private final T source; + private final String sourceMessage; + + /** + * A VCF validation failure. + * + * @param targetVersion the version for which validation failed. + * @param source the source object being validated + * @param sourceMessage the validation failure reason + */ + public VCFValidationFailure(final VCFHeaderVersion targetVersion, final T source, final String sourceMessage) { + ValidationUtils.nonNull(targetVersion); + ValidationUtils.nonNull(source); + ValidationUtils.nonNull(sourceMessage); + + this.targetVersion = targetVersion; + this.source = source; + this.sourceMessage = sourceMessage; + } + + /** + * @return the source object being validated + */ + public T getSource() { + return source; + } + + /** + * @return The validation failure reason. + */ + public String getSourceMessage() { + return sourceMessage; + } + + /** + * @return A formatted message describing the validation failure reason and target version. + */ + public String getFailureMessage() { + return String.format( + "Failure validating %s for reason %s, target version %s", + source.toString(), + sourceMessage, + targetVersion); + } + + /** + * @return The version for which validation failed. May be null. 
+ */ + public VCFHeaderVersion getTargetVersion() { + return targetVersion; + } + +} diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java index 37842f8a9a..7167fa8f12 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java @@ -1,11 +1,7 @@ -package org.broadinstitute.hellbender.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.GATKBaseTest; +package htsjdk.samtools; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.Interval; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -14,12 +10,10 @@ import java.util.Arrays; import java.util.List; -import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.*; -import static org.broadinstitute.hellbender.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*; - -public final class SequenceDictionaryUtilsUnitTest extends GATKBaseTest { +import static htsjdk.samtools.SAMSequenceDictionaryUtils.*; +import static htsjdk.samtools.SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility.*; - private static Logger logger = LogManager.getLogger(SequenceDictionaryUtilsUnitTest.class); +public final class SAMSequenceDictionaryUtilsTest extends HtsjdkTest { @DataProvider( name = "testSequenceRecordsAreEquivalentDataProvider" ) public Object[][] testSequenceRecordsAreEquivalentDataProvider() { @@ -43,7 +37,7 @@ public Object[][] testSequenceRecordsAreEquivalentDataProvider() { @Test(dataProvider = "testSequenceRecordsAreEquivalentDataProvider") public void testSequenceRecordsAreEquivalent(final SAMSequenceRecord one, final 
SAMSequenceRecord two, final boolean expected){ - final boolean actual = SequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); + final boolean actual = SAMSequenceDictionaryUtils.sequenceRecordsAreEquivalent(one, two); Assert.assertEquals(actual, expected); } @@ -59,204 +53,157 @@ public Object[][] generateSequenceDictionaryTestData() { CHR1_HG19_WITH_ATTRIBUTES.setAttribute("M5", "0dec9660ec1efaaf33281c0d5ea2560f"); CHR1_HG19_WITH_ATTRIBUTES.setAttribute("UR", "file:/foo/bar"); - final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class; - final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - - final List hg19AllContigsIntervalSet = Arrays.asList( - new SimpleInterval("chrM", 1, 1), - new SimpleInterval("chr1", 1, 1), - new SimpleInterval("chr2", 1, 1), - new SimpleInterval("chr10", 1, 1)); - final List hg19PartialContigsIntervalSet = Arrays.asList( - new SimpleInterval("chrM", 1, 1), - new SimpleInterval("chr1", 1, 1)); + final List hg19AllContigsIntervalSet = Arrays.asList( + new Interval("chrM", 1, 1), + new Interval("chr1", 1, 1), + new Interval("chr2", 1, 1), + new Interval("chr10", 1, 1)); + final List hg19PartialContigsIntervalSet = Arrays.asList( + new Interval("chrM", 1, 1), + new Interval("chr1", 1, 1)); return new Object[][] { // Identical dictionaries: - {Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, false}, - { Arrays.asList(CHR1_HG19), 
Arrays.asList(CHR1_HG19), IDENTICAL, null, false, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, null, true, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), Arrays.asList(CHR1_HG19), IDENTICAL, null, false, false}, + {Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), IDENTICAL, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), IDENTICAL, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19_WITH_UNKNOWN_LENGTH), Arrays.asList(CHR1_HG19), IDENTICAL, false, false}, // Dictionaries with a common subset: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, 
false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, true}, // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), COMMON_SUBSET, false, false}, + { 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), COMMON_SUBSET, false, false}, // If requireSuperset == true, we should get an exception upon COMMON_SUBSET: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, UserException.IncompatibleSequenceDictionaries.class, true, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, // If checkContigOrdering == false, ordering of the common contigs should not matter: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, 
CHR10_HG19, CHR1_HG19), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19), COMMON_SUBSET, false, false}, // Dictionaries with no common contigs: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), NO_COMMON_CONTIGS, true, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), 
Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), NO_COMMON_CONTIGS, false, false}, // Dictionaries with unequal common contigs: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, false}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, true, true}, - { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, false}, + { 
Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19_WITH_DIFFERENT_LENGTH), Arrays.asList(CHR1_HG19), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, true, false}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, false, true}, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), UNEQUAL_COMMON_CONTIGS, true, true}, + { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), UNEQUAL_COMMON_CONTIGS, false, false}, // One or both dictionaries in non-canonical human order: - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), 
NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, true, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, true, true}, + { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), NON_CANONICAL_HUMAN_ORDER, false, 
true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), NON_CANONICAL_HUMAN_ORDER, false, true}, + { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), NON_CANONICAL_HUMAN_ORDER, false, true}, // If checkContigOrdering == false, we should not get NON_CANONICAL_HUMAN_ORDER: - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), IDENTICAL, false, false}, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, // Dictionaries with a common subset, but different relative ordering within that subset - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, true, true}, - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { 
Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, - { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, true, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), OUT_OF_ORDER, false, true}, + { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), OUT_OF_ORDER, false, true}, // If checkContigOrdering == false, we should not get OUT_OF_ORDER: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR_NONSTANDARD1), COMMON_SUBSET,false, false}, // Dictionaries with a common subset in the same relative order, but with different indices. 
// This will only throw an exception during validation if checkContigOrdering is true // These have checkContigOrdering == true, so we expect DIFFERENT_INDICES and an exception: - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, false, true}, // Setting requireSuperset == true should make no difference here (we should still get DIFFERENT_INDICES and an exception): - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, false, true}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), DIFFERENT_INDICES, true, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, false, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), DIFFERENT_INDICES, false, true}, // Same test cases as above, but these have checkContigOrdering == false, so we expect SUPERSET or COMMON_SUBSET instead of DIFFERENT_INDICES, and no exception: - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, 
CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, null, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), COMMON_SUBSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), SUPERSET, false, false}, + 
{ Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), SUPERSET, false, false}, // tests for SUPERSET - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, false, true}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, null, true, true}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, null, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, false, true}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19), SUPERSET, true, true}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), SUPERSET, false, false}, // Extended attributes should be ignored when determining whether a superset exists: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, null, false, false}, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, null, false, false} + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES), SUPERSET, false, false}, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19_WITH_ATTRIBUTES, CHR10_HG19), SUPERSET, false, false} }; } - @Test( dataProvider = "SequenceDictionaryDataProvider" ) - public void testSequenceDictionaryValidation( final List firstDictionaryContigs, - final List secondDictionaryContigs, - final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, //not needed by this test - final Class expectedExceptionUponValidation, - final boolean requireSuperset, - final boolean checkContigOrdering) { - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - final String testDescription = String.format("First dictionary: %s Second dictionary: %s", - SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), - SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); - Exception exceptionThrown = null; - try { - SequenceDictionaryUtils.validateDictionaries( - "firstDictionary", - firstDictionary, - "secondDictionary", - secondDictionary, - requireSuperset, - checkContigOrdering); - } - catch ( Exception e ) { - exceptionThrown = e; - } - if ( expectedExceptionUponValidation != null ) { - 
Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown), - String.format("Expected exception %s but saw %s instead. %s", - expectedExceptionUponValidation.getSimpleName(), - exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(), - testDescription)); - } - else { - Assert.assertTrue(exceptionThrown == null, - String.format("Expected no exception but saw exception %s instead. %s", - exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none", - testDescription)); - } - } - @Test( dataProvider = "SequenceDictionaryDataProvider" ) public void testSequenceDictionaryComparison( final List firstDictionaryContigs, final List secondDictionaryContigs, - final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, - final Class expectedExceptionUponValidation, + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, final boolean requireSuperset, final boolean checkContigOrdering) { final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); final String testDescription = String.format("First dictionary: %s Second dictionary: %s", - SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), - SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + SAMSequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SAMSequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); - final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = - SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); + final SAMSequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = + SAMSequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary, checkContigOrdering); 
Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, String.format("Dictionary comparison should have returned %s but instead returned %s. %s", @@ -274,64 +221,8 @@ public Object[][] getStandardValidationIgnoresContigOrderData() { }; } - @Test(dataProvider = "StandardValidationIgnoresContigOrderData") - public void testStandardValidationIgnoresContigOrder( final List firstDictionaryContigs, final List secondDictionaryContigs ) { - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - - // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) - // should ignore differences in ordering of common contigs, so we shouldn't get an exception here - SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); - } - - @DataProvider(name = "NonSupersetData") - public Object[][] getNonSupersetData() { - return new Object[][] { - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) } - }; - } - - @Test(dataProvider = "NonSupersetData") - public void testStandardValidationDoesNotRequireSuperset( final List firstDictionaryContigs, final List secondDictionaryContigs ) { - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - - // Standard validation (the overload of validateDictionaries() that doesn't take any boolean args) - // should not require a superset relationship, so we shouldn't get an exception here - SequenceDictionaryUtils.validateDictionaries("first", firstDictionary, "second", secondDictionary); - } - - @Test(dataProvider = "NonSupersetData", expectedExceptions = 
UserException.IncompatibleSequenceDictionaries.class) - public void testCRAMValidationDoesRequireSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { - final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); - final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); - - // CRAM validation against the reference SHOULD require a superset relationship, so we should - // get an exception here - SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); - } - - @DataProvider(name = "SupersetData") - public Object[][] getSupersetData() { - return new Object[][] { - { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19)}, //exactly same - { Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19) }, - { Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19) } - }; - } - - @Test(dataProvider = "SupersetData") - public void testCRAMValidationDoesAcceptSuperset( final List refDictionaryContigs, final List cramDictionaryContigs ) { - final SAMSequenceDictionary refDictionary = createSequenceDictionary(refDictionaryContigs); - final SAMSequenceDictionary cramDictionary = createSequenceDictionary(cramDictionaryContigs); - - //In these inputs , cram contigs are subsets of ref contigs and so it should be accepted - SequenceDictionaryUtils.validateCRAMDictionaryAgainstReference(refDictionary, cramDictionary); - } - private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { - final List clonedContigs = new ArrayList(contigs.size()); + final List clonedContigs = new ArrayList<>(contigs.size()); // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects // across multiple dictionaries in tests diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java 
b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java index 91804c48dc..95fb359446 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2UtilsUnitTest.java @@ -34,9 +34,8 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFIDHeaderLine; +import htsjdk.variant.vcf.VCFHeaderVersion; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFSimpleHeaderLine; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -75,22 +74,22 @@ public void testCollapseExpandTest(final List in, final String expectedC public void testCreateDictionary() { final List inputLines = new ArrayList(); int counter = 0; + inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFInfoHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFInfoHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); inputLines.add(new VCFHeaderLine("x", "misc")); inputLines.add(new VCFHeaderLine("y", "misc")); - inputLines.add(new VCFSimpleHeaderLine("GATKCommandLine","z","misc")); - 
inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - inputLines.add(new VCFFormatHeaderLine(String.valueOf(counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); - final int inputLineCounter = counter; - final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet(inputLines)); + inputLines.add(new VCFFilterHeaderLine("aFilter", "misc")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + inputLines.add(new VCFFormatHeaderLine(String.valueOf("A"+counter++), VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "x")); + final VCFHeader inputHeader = new VCFHeader(new LinkedHashSet<>(inputLines)); final ArrayList dict = BCF2Utils.makeDictionary(inputHeader); final int dict_size = dict.size(); - Assert.assertEquals(7,dict_size); + Assert.assertEquals(8,dict_size); } /** @@ -115,6 +114,7 @@ public Object[][] makeHeaderOrderTestProvider() { final List extraLines = new ArrayList(); int counter = 0; + inputLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFFilterHeaderLine(String.valueOf(counter++))); inputLines.add(new VCFContigHeaderLine(Collections.singletonMap("ID", String.valueOf(counter++)), counter)); @@ -142,7 +142,7 @@ public Object[][] makeHeaderOrderTestProvider() { for ( final List permutation : permutations ) { for ( int i = -1; i < inputLines.size(); i++ ) { final List allLines = new ArrayList(inputLines); - if ( i >= 0 ) + if ( i >= 0 && !VCFHeaderVersion.isFormatString(allLines.get(i).getKey()) ) allLines.remove(i); allLines.addAll(permutation); final VCFHeader testHeader = new VCFHeader(new LinkedHashSet(allLines)); @@ -179,8 +179,8 @@ public Object[][] makeHeaderOrderTestProvider() { private static boolean expectedConsistent(final VCFHeader 
combinationHeader, final int minCounterForInputLines) { final List ids = new ArrayList(); for ( final VCFHeaderLine line : combinationHeader.getMetaDataInInputOrder() ) { - if ( line instanceof VCFIDHeaderLine) { - ids.add(Integer.valueOf(((VCFIDHeaderLine) line).getID())); + if ( line.isIDHeaderLine()) { + ids.add(Integer.valueOf(line.getID())); } } diff --git a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java index 7a99916c5b..17e2ae3257 100644 --- a/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java +++ b/src/test/java/htsjdk/variant/bcf2/BCF2WriterUnitTest.java @@ -75,6 +75,7 @@ private static VCFHeader createFakeHeader() { final SAMSequenceDictionary sequenceDict = createArtificialSequenceDictionary(); final Set metaData = new HashSet<>(); final Set additionalColumns = new HashSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); diff --git a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java index 8cff545f78..e04910eb0e 100644 --- a/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java +++ b/src/test/java/htsjdk/variant/variantcontext/VariantContextTestProvider.java @@ -221,6 +221,7 @@ private final static void addHeaderLine(final Set metaData, final private static void createSyntheticHeader() { Set metaData = new TreeSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); addHeaderLine(metaData, "STRING1", 1, VCFHeaderLineType.String); addHeaderLine(metaData, "END", 1, VCFHeaderLineType.Integer); addHeaderLine(metaData, "STRING3", 3, VCFHeaderLineType.String); diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java 
b/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java index 9e7f7e45cb..379130407c 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/AsyncVariantContextWriterUnitTest.java @@ -89,7 +89,7 @@ public void testWriteAndReadAsyncVCFHeaderless() throws IOException { writer.add(createVC(header)); } final VCFCodec codec = new VCFCodec(); - codec.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + codec.setVCFHeader(header); try (final FileInputStream fis = new FileInputStream(fakeVCFFile)) { final AsciiLineReaderIterator iterator = new AsciiLineReaderIterator(new AsciiLineReader(fis)); @@ -110,6 +110,7 @@ public void testWriteAndReadAsyncVCFHeaderless() throws IOException { */ public static VCFHeader createFakeHeader(final Set metaData, final Set additionalColumns, final SAMSequenceDictionary sequenceDict) { + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); diff --git a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java index ca2afcbec0..ceac4f95a8 100644 --- a/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/writer/VCFWriterUnitTest.java @@ -42,11 +42,7 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFFileReader; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.vcf.VCFHeaderVersion; +import htsjdk.variant.vcf.*; import java.io.File; import java.io.FileInputStream; @@ -154,7 +150,7 @@ 
public void testWriteAndReadVCFHeaderless(final String extension) throws IOExcep writer.add(createVC(header)); } final VCFCodec codec = new VCFCodec(); - codec.setVCFHeader(header, VCFHeaderVersion.VCF4_2); + codec.setVCFHeader(header); try (BlockCompressedInputStream bcis = new BlockCompressedInputStream(fakeVCFFile); FileInputStream fis = new FileInputStream(fakeVCFFile)) { @@ -228,7 +224,7 @@ public void testChangeHeaderAfterWritingBody() { */ private static VCFHeader createFakeHeader(final Set metaData, final Set additionalColumns, final SAMSequenceDictionary sequenceDict) { - metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); + metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_2.getFormatString(), VCFHeaderVersion.VCF4_2.getVersionString())); metaData.add(new VCFHeaderLine("two", "2")); additionalColumns.add("extra1"); additionalColumns.add("extra2"); @@ -330,6 +326,7 @@ public void TestWritingLargeVCF(final String extension) throws FileNotFoundExcep @DataProvider(name = "vcfExtensionsDataProvider") public Object[][]vcfExtensionsDataProvider() { return new Object[][] { + //TODO: fix this BCF problem! // TODO: BCF doesn't work because header is not properly constructed. 
// {".bcf"}, {FileExtensions.VCF}, diff --git a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java index 273b0f24af..97e7493a6f 100644 --- a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java +++ b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java @@ -13,6 +13,7 @@ import java.util.Iterator; import java.util.List; + public class AbstractVCFCodecTest extends VariantBaseTest { @Test @@ -31,11 +32,28 @@ public void shouldPreserveSymbolicAlleleCase() { public void TestSpanDelParseAlleles() { final List list = VCF3Codec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); } + @DataProvider(name="AllVCFCodecs") + public Object[][] allVCFCodecs() { + return new Object[][] { + {new VCF3Codec() }, + {new VCFCodec() }, + }; + } + + @Test(dataProvider = "AllVCFCodecs") + public void TestSpanDelParseAlleles(final AbstractVCFCodec vcfCodec){ + // TODO: why is there no Assert here ?? + vcfCodec.parseAlleles("A", Allele.SPAN_DEL_STRING, 0); + } @Test(expectedExceptions = TribbleException.class) public void TestSpanDelParseAllelesException() { final List list1 = VCF3Codec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); } + @Test(dataProvider = "AllVCFCodecs", expectedExceptions = TribbleException.class) + public void TestSpanDelParseAllelesException(final AbstractVCFCodec vcfCodec){ + vcfCodec.parseAlleles(Allele.SPAN_DEL_STRING, "A", 0); + } @DataProvider(name = "thingsToTryToDecode") public Object[][] getThingsToTryToDecode() { @@ -47,16 +65,49 @@ public Object[][] getThingsToTryToDecode() { }; } - @Test(dataProvider = "thingsToTryToDecode") - public void testCanDecodeFile(String potentialInput, boolean canDecode) { - Assert.assertEquals(AbstractVCFCodec.canDecodeFile(potentialInput, VCFCodec.VCF4_MAGIC_HEADER), canDecode); - } + @Test(dataProvider = "thingsToTryToDecode") + public void testCanDecodeFile(String potentialInput, boolean canDecode) { + //TODO: add VCF43Codec when available + //TODO: its 
not sufficient to test for ANY v4 prefix since it will succeed on 4.3 as well + Assert.assertEquals(AbstractVCFCodec.canDecodeFile(potentialInput, VCFCodec.VCF4_MAGIC_HEADER), canDecode); + } - @Test - public void testGetTabixFormat() { - Assert.assertEquals(new VCFCodec().getTabixFormat(), TabixFormat.VCF); - Assert.assertEquals(new VCF3Codec().getTabixFormat(), TabixFormat.VCF); - } + @Test(dataProvider = "AllVCFCodecs") + public void testGetTabixFormat(final AbstractVCFCodec vcfCodec) { + Assert.assertEquals(vcfCodec.getTabixFormat(), TabixFormat.VCF); + } + + @DataProvider(name="otherHeaderLines") + public Object[][] otherHeaderLines() { + return new Object[][] { + { "key=<", new VCFHeaderLine("key", "<") }, + // taken from Funcotator test file as ##ID= + // technically, this is invalid due to the lack of an "ID" attribute, but it should still parse + // into a VCFHeaderLine (but noa VCFSimpleHeaderLine + { "ID=", + new VCFHeaderLine("ID", "") }, + }; + } + + @Test(dataProvider="otherHeaderLines") + public void testGetOtherHeaderLine(final String headerLineString, final VCFHeaderLine headerLine) { + Assert.assertEquals(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2), headerLine); + } + + @DataProvider(name="badOtherHeaderLines") + public Object[][] badOtherHeaderLines() { + return new Object[][] { + { "=" }, + { "=<" }, + { "=<>" }, + { "key" }, + }; + } + + @Test(dataProvider="badOtherHeaderLines", expectedExceptions=TribbleException.InvalidHeader.class) + public void testBadOtherHeaderLine(final String headerLineString) { + Assert.assertNull(new VCFCodec().getOtherHeaderLine(headerLineString, VCFHeaderVersion.VCF4_2)); + } @Test public void testGLnotOverridePL() { diff --git a/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java new file mode 100644 index 0000000000..ed6a1d2b96 --- /dev/null +++ 
b/src/test/java/htsjdk/variant/vcf/VCFAltHeaderLineUnitTest.java @@ -0,0 +1,43 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFAltHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String ALT_STRING = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFAltHeaderLine vcfLine = new VCFAltHeaderLine(ALT_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFAltHeaderLine(ALT_STRING, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java index cbc027ab5d..8dbf6dd30d 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java @@ -43,23 +43,31 @@ public class VCFCodec43FeaturesTest extends VariantBaseTest { private Object[][] allVCF43Files() { return new Object[][] { // a .vcf, .vcf.gz, .vcf with UTF8 chars, and .vcf.gz with UTF8 chars - { TEST_43_FILE }, - { TEST_43_UTF8_FILE }, - { 
TEST_43_GZ_FILE }, - { TEST_43_UTF8_GZ_FILE } + + // these first two files have a duplicate INFO header line in them that differ + // from each other only by virtue of having different descriptions: + //WARNING 2021-02-23 15:37:13 VCFMetaDataLines Attempt to add header line (INFO=) collides with existing line header line (INFO=). + // The existing line will be retained + { TEST_43_FILE, 69 }, + { TEST_43_UTF8_FILE, 69 }, + + { TEST_43_GZ_FILE, 70 }, + { TEST_43_UTF8_GZ_FILE, 70 } }; } @Test(dataProvider="all43Files") - public void testReadAllVCF43Features(final Path testFile) { + public void testReadAllVCF43Features(final Path testFile, int expectedHeaderLineCount) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); - Assert.assertEquals(entireVCF.a.getMetaDataInInputOrder().size(), 70); + Assert.assertEquals(entireVCF.a.getMetaDataInInputOrder().size(), expectedHeaderLineCount); Assert.assertEquals(entireVCF.b.size(), 25); } @Test(dataProvider="all43Files") - public void testVCF43SampleLine(final Path testFile) { + public void testVCF43SampleLine(final Path testFile, int ignored) { // ##SAMPLE= final VCFSampleHeaderLine sampleLine = getHeaderLineFromTestFile( @@ -77,7 +85,7 @@ public void testVCF43SampleLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43AltLine(final Path testFile) { + public void testVCF43AltLine(final Path testFile, int ignored) { // ##ALT= final VCFAltHeaderLine altLine = getHeaderLineFromTestFile( testFile, @@ -90,7 +98,7 @@ public void testVCF43AltLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43PedigreeLine(final Path testFile) { + public void testVCF43PedigreeLine(final Path testFile, int ignored) { // ##PEDIGREE= final VCFPedigreeHeaderLine pedigreeLine = getHeaderLineFromTestFile( testFile, @@ -116,7 +124,7 @@ public void testV43PedigreeParsing() { } @Test(dataProvider="all43Files") - public void testVCF43MetaLine(final Path testFile) { + public void 
testVCF43MetaLine(final Path testFile, int ignored) { // ##META= final VCFMetaHeaderLine metaLine = getHeaderLineFromTestFile( testFile, @@ -129,7 +137,7 @@ public void testVCF43MetaLine(final Path testFile) { } @Test(dataProvider="all43Files") - public void testVCF43PercentEncoding(final Path testFile) { + public void testVCF43PercentEncoding(final Path testFile, int ignored) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); // 1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE @@ -142,7 +150,7 @@ public void testVCF43PercentEncoding(final Path testFile) { } @Test(dataProvider="all43Files") - public void testSymbolicAlternateAllele(final Path testFile) { + public void testSymbolicAlternateAllele(final Path testFile, int ignored) { final Tuple> entireVCF = readEntireVCFIntoMemory(testFile); // 1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE @@ -241,7 +249,7 @@ public void testVCF43PercentEncodingWithUTF8() { // given a vcf file, extract a header line with the given key and ID, cast to the target // header line type (T) via the transformer function - private static T getHeaderLineFromTestFile( + private static T getHeaderLineFromTestFile( final Path testVCFFile, final String key, final String ID, diff --git a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java index f94435a833..96924b4e3a 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCompoundHeaderLineUnitTest.java @@ -25,22 +25,245 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNotNull; /** - * User: ebanks - * Date: Apr 2, 2014 + * Tests for VCFCompoundHeaderLine. 
+ * + * NOTE: This class uses VCFInfoHeaderLine instances to test shared VCFCompoundHeaderLine functionality since + * VCFCompoundHeaderLine abstract. */ public class VCFCompoundHeaderLineUnitTest extends VariantBaseTest { + @DataProvider (name = "badOrMissingAttributes") + public Object[][] getMissingAttributes() { + return new Object[][] { + {""}, // no Type + {""}, // no Type + {""}, // no Number + {""}, // bogus Type + {""}, // bogus Number + }; + } + + @Test(dataProvider= "badOrMissingAttributes", expectedExceptions=TribbleException.class) + public void testBadOrMissingAttributes(final String lineString) { + new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + } + + @DataProvider (name = "acceptedAttributes") + public Object[][] getAcceptedAttributes() { + return new Object[][] { + {"", "Description", "foo"}, + //next two cases from https://github.com/samtools/htsjdk/issues/517 + {"", "Version", "3"}, + {"", "Source", "mySource"}, + }; + } + + @Test(dataProvider= "acceptedAttributes") + public void testAcceptedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + @DataProvider (name = "invalidIDs") + public Object[][] getInvalidLines() { + return new Object[][] { + // ID cannot start with number + {""}, + // ID cannot start with '.'' + {""}, + // Test that IDs with the special thousand genomes key as a prefix are rejected + // The thousand genomes key is only accepted for VCFInfoHeaderLine and is tested in VCFInfoHeaderLineUnitTest + {""}, + // Contains invalid character '&' + {""}, + }; + } + + @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testGetValidationError(final String lineString) { + // TODO change to VCFHeader.DEFAULT_VCF_VERSION + 
new VCFInfoHeaderLine(lineString, VCFHeaderVersion.VCF4_3); + } + + @DataProvider (name = "headerLineTypes") + public Object[][] getHeaderLineTypes() { + return new Object[][] { + {"", VCFHeaderLineType.Float}, + {"", VCFHeaderLineType.Integer}, + {"", VCFHeaderLineType.String}, + {"", VCFHeaderLineType.Character}, + // Number must be 0 for flag type + {"", VCFHeaderLineType.Flag}, + }; + } + + @Test(dataProvider = "headerLineTypes") + public void testGetType(final String lineString, final VCFHeaderLineType expectedType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getType(), expectedType); + } + + @DataProvider (name = "headerLineCountTypes") + public Object[][] getLineCountTypes() { + return new Object[][] { + {"", VCFHeaderLineCount.A}, + {"", VCFHeaderLineCount.R}, + {"", VCFHeaderLineCount.G}, + {"", VCFHeaderLineCount.INTEGER}, + {"", VCFHeaderLineCount.UNBOUNDED}, + }; + } + + @Test(dataProvider= "headerLineCountTypes") + public void testGetLineCountType(final String lineString, final VCFHeaderLineCount expectedCountType) { + final VCFCompoundHeaderLine headerline = new VCFInfoHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(headerline.getCountType(), expectedCountType); + Assert.assertEquals(headerline.isFixedCount(), expectedCountType == VCFHeaderLineCount.INTEGER); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIntegerTypeWithNegativeCount() { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + @Test - public void supportsVersionFields() { - final String line = ""; - new VCFInfoHeaderLine(line, VCFHeaderVersion.VCF4_2); - // if we don't support version fields then we should fail before we ever get here - Assert.assertTrue(true); + public void testRepairFlagTypeWithNegativeCount() { + final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("", + VCFHeader.DEFAULT_VCF_VERSION); + 
Assert.assertEquals(infoLine.getCount(), 0); } + + @DataProvider (name = "equalsData") + public Object[][] getEqualsData() { + return new Object[][] { + //pos + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + {"", + "", true}, + + //neg + {"", + "", false}, // different ID + {"", + "", false}, // different Type + {"", + "", false}, // different Number + {"", + "", false}, // different integer Number + {"", + "", false}, // different description + {"", + "", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), // merged result, promote to float + }, + { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) // merged result, promote to float + }, + { + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION) // merged result, resolve as new unbounded + }, + }; + } + + @Test(dataProvider = "mergeCompatibleInfoLines") + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine) { + VCFCompoundHeaderLine mergedLine = VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + line1, + line2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(false), + (l1, l2) -> new VCFInfoHeaderLine( + l1.getID(), + VCFHeaderLineCount.UNBOUNDED, + l1.getType(), + l1.getDescription()) + ); + Assert.assertEquals(mergedLine, expectedLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + { + new VCFInfoHeaderLine("",VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + }, + { + new VCFInfoHeaderLine("", 
VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION), + }, + }; + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions=TribbleException.class) + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2) { + VCFCompoundHeaderLine.getMergedCompoundHeaderLine( + line1, + line2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(false), + (l1, l2) -> { throw new IllegalArgumentException("lambda should never execute - this exception should never be thrown"); } + ); + } + + @Test + public void testEncodeWithUnescapedQuotes() { + + VCFFilterHeaderLine unescapedFilterLine = new VCFFilterHeaderLine( + "aFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + + final String encodedAttributes = unescapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); + } + + @Test + public void testEncodeWithEscapedQuotes() { + + VCFFilterHeaderLine escapedFilterLine = new VCFFilterHeaderLine("aFilter", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + final String encodedAttributes = escapedFilterLine.toStringEncoding(); + assertNotNull(encodedAttributes); + + final String expectedEncoding = "FILTER="; + assertEquals(encodedAttributes, expectedEncoding); + } + } diff --git a/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java new file mode 100644 index 0000000000..ad33575bef --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFContigHeaderLineUnitTest.java @@ -0,0 +1,184 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import 
org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.TreeSet; + +public class VCFContigHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedIDs") + public Object[][] getAllowedIDs() { + return new Object[][]{ + {"", "1"}, + {"", "10"}, + {"", "X"}, + {"", "Y"}, + {"", "MT"}, + {"", "NC_007605"}, + {"", "GL000191.1"}, + {"", "HLA-A*01:01:01:01"}, //https://github.com/samtools/hts-specs/issues/124 + }; + } + + @Test(dataProvider= "allowedIDs") + public void testAllowedIDs(final String lineString, final String expectedIDString) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getID(), expectedIDString); + } + + @DataProvider(name = "invalidIDs") + public Object[][] getInvalidIDs() { + return new Object[][]{ + // IDs cannot start with '*' + {""}, + // IDs cannot start with '=' + // The parser cannot handle attributes starting with '=' so we cannot express this test case + // {""}, + // IDs cannot contain '{' + {""}, + }; + } + + @Test(dataProvider = "invalidIDs", expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testInvalidIDs(final String lineString) { + // TODO change to VCFHeader.DEFAULT_VCF_VERSION + new VCFContigHeaderLine(lineString, VCFHeaderVersion.VCF4_3, 1); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectNegativeIndex() { + new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, -1); + } + + @DataProvider(name = "allowedAttributes") + public Object[][] getAllowedAttributes() { + return new Object[][] { + {"", "ID", "contig1"}, // https://github.com/samtools/htsjdk/issues/389 (no length) + {"", "length", "100"}, + {"", "taxonomy", "Homo sapiens"}, + {"", "assembly", "b37"}, + {"", "md5", "1a258fe76dfc8abd926f81f0e9b82ed7"}, + {"", + "URL", "http://www.refserve.org:8080/path/"}, + {"", + "species", 
"Homo sapiens"}, + }; + } + + @Test(dataProvider= "allowedAttributes") + public void testAllowedAttributes(final String lineString, final String attribute, final String expectedValue) { + final VCFContigHeaderLine headerline = new VCFContigHeaderLine(lineString, VCFHeader.DEFAULT_VCF_VERSION, 0); + Assert.assertEquals(headerline.getGenericFieldValue(attribute), expectedValue); + } + + @Test + public void testRoundTripThroughSequenceRecord() { + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( + "", + VCFHeader.DEFAULT_VCF_VERSION, + 0); + + final String lengthString = "100"; + final String assemblyString = "b37"; + final String md5String = "1a258fe76dfc8abd926f81f0e9b82ed7"; + final String URLString = "http://www.refserve.org:8080/path/"; + final String speciesString = "Homo sapiens"; + + final SAMSequenceRecord sequenceRecord = contigLine.getSAMSequenceRecord(); + + Assert.assertEquals(Integer.toString(sequenceRecord.getSequenceLength()), lengthString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.LENGTH_ATTRIBUTE), lengthString); + + Assert.assertEquals(sequenceRecord.getAssembly(), assemblyString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.ASSEMBLY_ATTRIBUTE), assemblyString); + + Assert.assertEquals(sequenceRecord.getMd5(), md5String); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.MD5_ATTRIBUTE), md5String); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.URI_TAG), URLString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.URL_ATTRIBUTE), URLString); + + Assert.assertEquals(sequenceRecord.getAttribute(SAMSequenceRecord.SPECIES_TAG), speciesString); + Assert.assertEquals(contigLine.getGenericFieldValue(VCFContigHeaderLine.SPECIES_ATTRIBUTE), speciesString); + + // now turn the SAMSequenceRecord back into a contig line, and compare the result to the + // original contig line + Assert.assertEquals( + new 
VCFContigHeaderLine(sequenceRecord, assemblyString), + contigLine); + } + + @DataProvider (name = "hashEqualsCompareData") + public Object[][] getHashEqualsCompareData() { + return new Object[][] { + + // For contig lines, equals and hash depend on the id, all other attributes, and the contig index, + // but compareTo only cares about the index. + + // line, index, line, line, index -> expected hash equals, expected equals, expected compare, + {"", 0, "", 0, true, true, 0 }, // identical + {"", 0, "", 1, false, false, -1 }, // identical except contig index + {"", 1, "", 0, false, false, 1 }, // identical except contig index + + {"", 0, "", 0, false, false, 0 }, // identical except attributes + {"", 0, "", 1, false, false, -1 }, // different attributes, different index + + {"", 0, "", 0, false, false, 0 }, // identical except ID + // different ID, same attributes and index, -> not equal, different hash, compare==0 + {"", 0, "", 0, false, false, 0 }, // different ID, attributes, same index + }; + } + + @Test(dataProvider = "hashEqualsCompareData") + public void testHashEqualsCompare( + final String line1, + final int index1, + final String line2, + final int index2, + final boolean expectedHashEquals, + final boolean expectedEquals, + final int expectedCompare) + { + final VCFContigHeaderLine headerLine1 = new VCFContigHeaderLine(line1, VCFHeader.DEFAULT_VCF_VERSION, index1); + final VCFContigHeaderLine headerLine2 = new VCFContigHeaderLine(line2, VCFHeader.DEFAULT_VCF_VERSION, index2); + + Assert.assertEquals(headerLine1.hashCode() == headerLine2.hashCode(), expectedHashEquals); + Assert.assertEquals(headerLine1.equals(headerLine2), expectedEquals); + Assert.assertEquals(headerLine1.compareTo(headerLine2), expectedCompare); + } + + @Test + public void testSortOrder() { + + final List expectedLineOrder = new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + 
add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + }}; + + final TreeSet sortedLines = new TreeSet<>( + new ArrayList() {{ + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 20)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 10)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 1)); + add(new VCFContigHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION, 2)); + }} + ); + + final Iterator sortedIt = sortedLines.iterator(); + for (final VCFContigHeaderLine cl : expectedLineOrder) { + Assert.assertTrue(sortedIt.hasNext()); + Assert.assertEquals(cl, sortedIt.next()); + } + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java index 89537eaf51..00e13b5813 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFEncoderTest.java @@ -184,6 +184,7 @@ public void testMissingFormatFields(final VCFEncoder encoder, final VariantConte private static Set createSyntheticMetadata() { final Set metaData = new TreeSet<>(); + metaData.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); metaData.add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "x")); diff --git a/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java new file mode 100644 index 0000000000..1e07ff9c2d --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFFormatHeaderLineUnitTest.java @@ -0,0 +1,19 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to FORMAT lines (not covered by VCFCompoundHeaderLineUnitTest). 
+ */ +public class VCFFormatHeaderLineUnitTest extends HtsjdkTest { + + // FORMAT lines aren't allowed to have type==Flag + @Test(expectedExceptions=TribbleException.class) + public void testRejectInfoLineWithFlagField() { + new VCFFormatHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java index 73116f53f0..94859c8717 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java @@ -102,12 +102,14 @@ private Object[][] getInvalidHeaderLines() { List sourceVersion = Arrays.asList("Source", "Version"); return new Object[][]{ // to parse, expected, recommended, error message - {"", idDesc, none, "Tag Description in wrong order (was #1, expected #2)"}, - {"", idDesc, none, "Unexpected tag Desc"}, - {"<>", idDesc, none, "Unexpected tag "}, - - {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"}, - {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"} + {"", idDesc, none, "Unexpected tag or tag order for tag \"Description\""}, + {"", idDesc, none, "Unexpected tag or tag order for tag \"Desc\""}, + {"<>", idDesc, none, "Unexpected tag or tag order for tag \"\""}, + + {"", idDesc, sourceVersion, + "Unexpected tag or tag order for tag \"Source\""}, + {"", idDesc, sourceVersion, + "Unexpected tag or tag order for tag \"Source\""} }; } @@ -119,7 +121,7 @@ private static void callTranslator(final String line, VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder); } else { - VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder, recommendedTags); + VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder); } } @@ -153,13 +155,4 @@ private Object[][] getVcfV3Versions() 
{ }; } - @Test(dataProvider = "vcfv3", expectedExceptions = TribbleException.class) - public void testVcfV3FailsRecommendedTags(final VCFHeaderVersion vcfVersion) { - VCFHeaderLineTranslator.parseLine( - vcfVersion, - "", - Arrays.asList("ID"), - Arrays.asList("Description") - ); - } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java index e04d3c69c8..d5d7e47ec9 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java @@ -1,6 +1,9 @@ package htsjdk.variant.vcf; +import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.LinkedHashMap; @@ -9,46 +12,146 @@ import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertNotNull; + public class VCFHeaderLineUnitTest extends VariantBaseTest { @Test public void testEncodeVCFHeaderLineWithUnescapedQuotes() { - final Map attributes = new LinkedHashMap<>(); attributes.put("ID", "VariantFiltration"); attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("someKey", attributes); + final String encodedAttributes = simpleHeaderLine.toStringEncoding(); assertNotNull(encodedAttributes); - final String expectedEncoding = ""; + final String expectedEncoding = "someKey="; assertEquals(encodedAttributes, expectedEncoding); } @Test public void testEncodeVCFHeaderLineWithEscapedQuotes() { - final Map attributes = new LinkedHashMap<>(); attributes.put("ID", "VariantFiltration"); attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || 
ANNOTATION <= 2.0]"); - final String encodedAttributes = VCFHeaderLine.toStringEncoding(attributes); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("someKey", attributes); + final String encodedAttributes = simpleHeaderLine.toStringEncoding(); assertNotNull(encodedAttributes); - final String expectedEncoding = ""; + final String expectedEncoding = "someKey="; assertEquals(encodedAttributes, expectedEncoding); } - @Test(expectedExceptions = { IllegalArgumentException.class }, expectedExceptionsMessageRegExp = "Invalid count number, with fixed count the number should be 1 or higher: .*") - public void testFormatNumberExeptions() { + @Test + public void testIsNotStructuredHeaderLine() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertFalse(hl.isIDHeaderLine()); + Assert.assertNull(hl.getID()); + } + + @Test + public void testStringEncoding() { + VCFHeaderLine hl = new VCFHeaderLine("key", "value"); + Assert.assertEquals(hl.toStringEncoding(), "key=value"); + } + + @DataProvider(name = "headerLineEquals") + public Object[][] headerLineEquals() { + return new Object[][]{ + { + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), + true + }, + { + new VCFHeaderLine("key", "value1"), + new VCFHeaderLine("key", "value2"), + false + }, + { + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), + false + }, + { + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), + false + } + }; + } + + @Test(dataProvider = "headerLineEquals") + public void testEquals(final VCFHeaderLine hl1, final VCFHeaderLine hl2, final boolean expectedEquals) { + Assert.assertEquals(hl1.equals(hl2), expectedEquals); + } + + @DataProvider(name = "invalidHeaderLineKeys") + public Object[][] invalidHeaderLineKeys() { + return new Object[][]{ + {null}, + {"embedded<"}, + {"embedded="}}; + } + + @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + public void 
testInvalidKeys(final String testKey) { + new VCFHeaderLine(testKey, ""); + } + + @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + public void testValidateAsIdInvalid(final String testKey) { + VCFHeaderLine.validateKeyOrID(testKey, "test"); + } + + @DataProvider(name = "vcfVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @Test(dataProvider = "vcfVersions") + public void testValidateForVersion(final VCFHeaderVersion vcfVersion) { + VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); + headerLine.validateForVersion(vcfVersion); + } + + @DataProvider(name = "incompatibleVersions") + public Object[][] incompatibleVersionPairs() { + return new Object[][]{ + // each pair just has to be different + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2} + }; + } + + @Test(dataProvider="incompatibleVersions", expectedExceptions= TribbleException.VersionValidationFailure.class) + public void testValidateForVersionFails(final VCFHeaderVersion vcfVersion, final VCFHeaderVersion incompatibleVersion) { + VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); + headerLine.validateForVersion(incompatibleVersion); + } + + @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") + public void testFormatNumberExceptions() { new VCFFormatHeaderLine("test", 0, 
VCFHeaderLineType.Integer, ""); } - @Test(expectedExceptions = { IllegalArgumentException.class }, expectedExceptionsMessageRegExp = "Invalid count number, with fixed count the number should be 1 or higher: .*") - public void testInfoNumberExeptions() { + @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") + public void testInfoNumberExceptions() { new VCFInfoHeaderLine("test", 0, VCFHeaderLineType.Integer, diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java new file mode 100644 index 0000000000..1be8bdf085 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java @@ -0,0 +1,554 @@ +package htsjdk.variant.vcf; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.VariantBaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.IntStream; + +import static htsjdk.variant.vcf.VCFConstants.PEDIGREE_HEADER_KEY; + +public class VCFHeaderMergerUnitTest extends VariantBaseTest { + + @DataProvider(name="mergeValidVersions") + public Object[][] getMergeValidVersions() { + + // only v4.2+ headers can be merged, merge result version is always the highest version presented + return new Object[][] { + // headers to merge, expected result version + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + 
{Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_2 }, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3), VCFHeaderVersion.VCF4_3}, + }; + } + + @DataProvider(name="mergeInvalidVersions") + public Object[][] getMergeInvalidVersions() { + // only v4.2+ headers can be merged + return new Object[][] { + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3)}, + + 
{Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_1)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_1)}, + + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_2)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_3)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0)}, + {Arrays.asList(VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1)}, + }; + } + + @Test(dataProvider="mergeValidVersions") + public void testMergeValidVersions(final List headerVersions, final VCFHeaderVersion expectedVersion) { + // merge the headers, and then verify that the merged lines have the expected version by + // instantiating a VCFMetaDataLines instance to determine the resulting version + final Set mergedHeaderLines = doHeaderMergeForVersions(headerVersions); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(mergedHeaderLines); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals(VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), expectedVersion); + + // now create a new header using the merged 
VersionLines, and make sure *it* has the expected version + final VCFHeader mergedHeader = new VCFHeader(mergedHeaderLines); + Assert.assertEquals(mergedHeader.getVCFHeaderVersion(), expectedVersion); + + // also verify that all the header lines in the merged set are also in the resulting header + Assert.assertEquals(mergedHeader.getMetaDataInInputOrder(), mergedHeaderLines); + } + + @Test(dataProvider="mergeInvalidVersions", expectedExceptions = TribbleException.class) + public void testMergeInvalidVersions(final List headerVersions) { + doHeaderMergeForVersions(headerVersions); + } + + @Test(expectedExceptions = TribbleException.class) + public void testMergeWithValidationFailure() { + // test mixing header versions where the old version header has a line that fails validation + // using the resulting (newer) version + + // create a 4.2 header with a 4.2 style pedigree line (one that has no ID) + final Set oldHeaderLines = VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2); + oldHeaderLines.add(new VCFHeaderLine(PEDIGREE_HEADER_KEY, "")); + final VCFHeader oldHeader = new VCFHeader(oldHeaderLines); + Assert.assertEquals(oldHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2); + + // now create a simple 4.3 header; the merge should fail because the old PEDIGREE line isn't valid + // for 4.3 (for which pedigree lines must have an ID) + final VCFHeader newHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + Assert.assertEquals(newHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_3); + + VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(oldHeader, newHeader),true); + } + + private Set doHeaderMergeForVersions(final List headerVersions) { + // This is a somewhat sketchy way to write a test...for each header we create here, we're + // using the same fixed set of VCF42-conforming VCFHeader lines, and then we add a fileformat + // line with whatever VCFVersion the test calls for.
It's conceivable that as time goes on + // and we add new versions, the VCFHeader constructor could throw if any of the lines don't + // conform to the requested version. + final List headerList = new ArrayList<>(headerVersions.size()); + for (final VCFHeaderVersion version : headerVersions) { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); + metaDataSet.add(VCFHeader.makeHeaderVersionLine(version)); + final VCFHeader header = new VCFHeader(metaDataSet); + Assert.assertEquals(header.getVCFHeaderVersion(), version); + headerList.add(header); + } + + return VCFUtils.smartMergeHeaders(headerList, false); + } + + @DataProvider(name = "subsetHeaders") + public Iterator getSubsetHeaders() { + final List headerLineList = new ArrayList<>(new VCFHeaderUnitTestData().getTestMetaDataLinesSet()); + final Collection mergeTestCase = new ArrayList<>(); + // For each header line in the list of test lines, create a test case consisting of a pair of headers, + // one of which is a header created with all of the lines, and one of which is a subset of the full header + // with one line removed. Skip the case where the line to be removed is a fileformat line, since that's + // required to create a header. + for (int i = 0; i < headerLineList.size(); i++) { + // take the header line set and remove the ith line, unless it's a fileformat line, since if we remove + // that, then we won't be able to create a header using the resulting lines at all.
+ final VCFHeaderLine candidateLine = headerLineList.get(i); + if (!VCFHeaderVersion.isFormatString(candidateLine.getKey())) { + List subsetList = new ArrayList<>(headerLineList); + subsetList.remove(i); + mergeTestCase.add( + new Object[] { + new VCFHeader(VCFHeaderUnitTestData.getTestMetaDataLinesSet()), + new VCFHeader(new LinkedHashSet<>(subsetList)) + }); + } + } + + return mergeTestCase.iterator(); + } + + @Test(dataProvider = "subsetHeaders") + public void testMergeSubsetHeaders( + final VCFHeader fullHeader, + final VCFHeader subsetHeader) + { + final List headerList = new ArrayList() {{ + add(fullHeader); + add(subsetHeader); + add(subsetHeader); + }}; + Assert.assertEquals( + VCFHeaderMerger.getMergedHeaderLines(headerList, false), + fullHeader.getMetaDataInSortedOrder()); + + // now again, in the reverse order + final List reverseHeaderList = new ArrayList() {{ + add(subsetHeader); + add(subsetHeader); + add(fullHeader); + }}; + Assert.assertEquals( + VCFHeaderMerger.getMergedHeaderLines(reverseHeaderList, false), + fullHeader.getMetaDataInSortedOrder()); + } + + @Test + public void testDictionaryMergeDuplicateFile() { + final VCFHeader headerOne = new VCFFileReader(new File(variantTestDataRoot + "diagnosis_targets_testfile.vcf"), false).getFileHeader(); + final VCFHeader headerTwo = new VCFHeader(headerOne); // deep copy + final List sampleList = new ArrayList<>(); + sampleList.addAll(headerOne.getSampleNamesInOrder()); + + // Check that the two dictionaries start out the same + headerOne.getSequenceDictionary().assertSameDictionary(headerTwo.getSequenceDictionary()); + + // Run the merge command + final VCFHeader mergedHeader = new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(headerOne, headerTwo), false), sampleList); + + // Check that the mergedHeader's sequence dictionary matches the first two + mergedHeader.getSequenceDictionary().assertSameDictionary(headerOne.getSequenceDictionary()); + } + + 
@DataProvider(name="dictionaryMergePositive") + private Object[][] getDictionaryMergePositive() { + return new Object[][] { + // input dictionary list, expected merged dictionary + { + // one dictionary + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // two identical dictionaries + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three different subsets; superset first + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)) + ), + createTestSAMDictionary(1, 10) + }, + { + // three different subsets; superset second + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)) + ), + createTestSAMDictionary(1, 10) + }, + { + // three different subsets; superset third (requires the merge implementation to sort on dictionary size) + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(7, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(3, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)) + ), + createTestSAMDictionary(1, 10) + }, + { + // one non-null dictionary, one null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // one non-null dictionary, one null, in reverse direction + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + 
createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: non-null, null, null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: null, non-null, null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: null, null, non-null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: non-null, null, non-null + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)) + ), + createTestSAMDictionary(1, 2) + }, + { + // three dictionaries: subset, null, superset + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 10)) + ), + createTestSAMDictionary(1, 10) + }, + { + // all null dictionaries + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(null), + createTestVCFHeaderWithSAMDictionary(null) + ), + null + } + }; + } + + @Test(dataProvider = "dictionaryMergePositive") + private void testDictionaryMergePositive( + final List sourceHeaders, final SAMSequenceDictionary expectedDictionary) { + final Set mergedHeaderLines = VCFHeaderMerger.getMergedHeaderLines(sourceHeaders, false); + final VCFHeader mergedHeader = new 
VCFHeader(mergedHeaderLines); + Assert.assertEquals(mergedHeader.getSequenceDictionary(), expectedDictionary); + } + + @DataProvider(name="dictionaryMergeNegative") + private Object[][] getDictionaryMergeNegative() { + final SAMSequenceDictionary forwardDictionary = createTestSAMDictionary(1, 2); + final SAMSequenceDictionary reverseDictionary = createReverseDictionary(forwardDictionary); + + return new Object[][] { + { + // SequenceDictionaryCompatibility.NO_COMMON_CONTIGS + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(1, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(5, 2))) + }, + { + // SequenceDictionaryCompatibility.OUT_OF_ORDER + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(forwardDictionary), + createTestVCFHeaderWithSAMDictionary(reverseDictionary)) + }, + { + // SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS common subset has contigs that have the same name but different lengths + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createDictionaryWithLengths(100)), + createTestVCFHeaderWithSAMDictionary(createDictionaryWithLengths(200))) + }, + { + // SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER human reference detected but the order of the contigs is non-standard (lexicographic, for example) + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createDictionaryInCanonicalHumanOrder()), + createTestVCFHeaderWithSAMDictionary(createDictionaryInNonCanonicalHumanOrder())) + }, + { + // three mutually disjoint dictionaries, no superset + Arrays.asList( + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(5, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(4, 2)), + createTestVCFHeaderWithSAMDictionary(createTestSAMDictionary(6, 2)) + ) + }, + }; + } + + @Test(dataProvider = "dictionaryMergeNegative", expectedExceptions = TribbleException.class) + private void testDictionaryMergeNegative(final List sourceHeaders) { + 
VCFHeaderMerger.getMergedHeaderLines(sourceHeaders, false); + } + + @Test + final void testDuplicateNonStructuredKeys() { + // merge 2 headers, one has "##sample=foo", one has "##sample=bar", both should survive the merge + final VCFHeaderLine fooLine = new VCFHeaderLine("sample", "foo"); + final Set fooLines = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + fooLines.add(fooLine); + final VCFHeader fooHeader = new VCFHeader(fooLines); + + final VCFHeaderLine barLine = new VCFHeaderLine("sample", "bar"); + final Set barLines = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + barLines.add(barLine); + final VCFHeader barHeader = new VCFHeader(barLines); + + final Set mergedLines = VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(fooHeader, barHeader), false); + Assert.assertEquals(mergedLines.size(), 3); + Assert.assertTrue(mergedLines.contains(fooLine)); + Assert.assertTrue(mergedLines.contains(barLine)); + } + + @DataProvider(name = "compatibleInfoLines") + public Object[][] getMergerData() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number, promote to "." 
+ new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number type, promote to float + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number type in reverse direction, promote to float + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + }; + } + + @Test(dataProvider = "compatibleInfoLines") + public void testMergeCompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine, final String id) { + final VCFHeader hdr1 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr1.addMetaDataLine(line1); + + final VCFHeader hdr2 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr2.addMetaDataLine(line2); + + final VCFHeader mergedHeader = new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(hdr1, hdr2), true)); + Assert.assertEquals(mergedHeader.getInfoHeaderLine(id), expectedLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number AND number type (multiple different attributes) + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + { + // mixed number AND number type (multiple 
different attributes), reverse direction + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + "AB" + }, + }; + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions=TribbleException.class) + public void testMergeIncompatibleInfoLines(final VCFInfoHeaderLine line1, final VCFInfoHeaderLine line2, final VCFInfoHeaderLine expectedLine, final String id) { + final VCFHeader hdr1 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr1.addMetaDataLine(line1); + final VCFHeader hdr2 = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION), Collections.EMPTY_SET); + hdr2.addMetaDataLine(line2); + new VCFHeader(VCFHeaderMerger.getMergedHeaderLines(Arrays.asList(hdr1, hdr2), true)); + } + + private final SAMSequenceDictionary createTestSAMDictionary(final int startSequence, final int numSequences) { + final SAMSequenceDictionary samDictionary = new SAMSequenceDictionary(); + IntStream.range(startSequence, startSequence + numSequences).forEachOrdered( + i -> samDictionary.addSequence(new SAMSequenceRecord(Integer.toString(i), i))); + return samDictionary; + } + + private final VCFHeader createTestVCFHeaderWithSAMDictionary(final SAMSequenceDictionary samDictionary) { + final VCFHeader vcfHeader = createTestVCFHeader(); + vcfHeader.setSequenceDictionary(samDictionary); + return vcfHeader; + } + + private SAMSequenceDictionary createDictionaryInNonCanonicalHumanOrder() { + final List sequences = new ArrayList<>(); + sequences.add(new SAMSequenceRecord("1", 100)); + sequences.add(new SAMSequenceRecord("10", 100)); + sequences.add(new SAMSequenceRecord("2", 100)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createDictionaryInCanonicalHumanOrder() { + final List sequences = new ArrayList<>(); 
+ sequences.add(new SAMSequenceRecord("1", 100)); + sequences.add(new SAMSequenceRecord("2", 100)); + sequences.add(new SAMSequenceRecord("10", 100)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createDictionaryWithLengths(final int length) { + final List sequences = new ArrayList<>(); + sequences.add(new SAMSequenceRecord("1", length)); + sequences.add(new SAMSequenceRecord("2", length)); + sequences.add(new SAMSequenceRecord("3", length)); + return new SAMSequenceDictionary(sequences); + } + + private SAMSequenceDictionary createReverseDictionary(final SAMSequenceDictionary forwardDictionary){ + // its not sufficient to reuse the existing sequences by just reordering them, since + // SAMSequenceDictionary *mutates* the sequence indices to match the input order. So we need + // to create the new sequence dictionary using entirely new sequence records, and let + // SAMSequenceDictionary assign them indices that match the input order. + final List reverseSequences = new ArrayList<>(forwardDictionary.getSequences()); + Collections.reverse(reverseSequences); + final SAMSequenceDictionary reverseDictionary = new SAMSequenceDictionary(); + + int count = 0; + for (final SAMSequenceRecord samSequenceRecord : reverseSequences) { + final SAMSequenceRecord newSequenceRecord = new SAMSequenceRecord( + samSequenceRecord.getSequenceName(), + samSequenceRecord.getSequenceLength()); + reverseDictionary.addSequence(newSequenceRecord); + Assert.assertEquals(newSequenceRecord.getSequenceIndex(), count); + count++; + } + return reverseDictionary; + } + + private final VCFHeader createTestVCFHeader() { + return new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index e4d5099eda..8ee9ccab26 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ 
b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -29,7 +29,6 @@ import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.FileExtensions; -import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.TestUtil; import htsjdk.tribble.TribbleException; import htsjdk.tribble.readers.AsciiLineReader; @@ -42,66 +41,64 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import org.testng.Assert; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.*; -import java.math.BigInteger; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.*; import java.util.stream.Collectors; -/** - * Created by IntelliJ IDEA. - * User: aaron - * Date: Jun 30, 2010 - * Time: 3:32:08 PM - * To change this template use File | Settings | File Templates. 
- */ public class VCFHeaderUnitTest extends VariantBaseTest { - private File tempDir; - - private VCFHeader createHeader(String headerStr) { - VCFCodec codec = new VCFCodec(); - VCFHeader header = (VCFHeader) codec.readActualHeader(new LineIteratorImpl(new SynchronousLineReader( - new StringReader(headerStr)))); - Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF4headerStringCount); - return header; - } - - @BeforeClass - private void createTemporaryDirectory() { - tempDir = TestUtil.getTempDirectory("VCFHeader", "VCFHeaderTest"); + @DataProvider(name="headerRoundTrip") + private Object[][] getHeaderRoundTrip() { + return new Object[][] { + { VCFHeaderUnitTestData.getVCFV42TestHeaderString() }, + { VCFHeaderUnitTestData.VCF42headerStrings_with_negativeOne } + }; } - @AfterClass - private void deleteTemporaryDirectory() { - for (File f : tempDir.listFiles()) { - f.delete(); - } - tempDir.delete(); + @Test(dataProvider = "headerRoundTrip") + public void test42HeaderRoundTrip(final String headerString) throws IOException { + final VCFHeader header = VCFHeaderUnitTestData.createHeaderFromString(headerString); + Assert.assertEquals(header.getMetaDataInSortedOrder(), getRoundTripEncoded(header)); } @Test - public void testVCF4ToVCF4() { - VCFHeader header = createHeader(VCF4headerStrings); - checkMD5ofHeaderFile(header, "91c33dadb92e01ea349bd4bcdd02d6be"); - } + public void test42FileRoundtrip() throws Exception { + // this test validates that source/version fields are round-tripped properly - @Test - public void testVCF4ToVCF4_alternate() { - VCFHeader header = createHeader(VCF4headerStrings_with_negativeOne); - checkMD5ofHeaderFile(header, "39318d9713897d55be5ee32a2119853f"); + // read an existing VCF + final File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); + + // write the file out into a new copy + final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF); + 
actualFile.deleteOnExit(); + + try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile, false); + final VariantContextWriter copyWriter = new VariantContextWriterBuilder() + .setOutputFile(actualFile) + .setReferenceDictionary(createArtificialSequenceDictionary()) + .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) + .build() + ) { + final VCFHeader originalHeader = originalFileReader.getFileHeader(); + + copyWriter.writeHeader(originalHeader); + for (final VariantContext variantContext : originalFileReader) { + copyWriter.add(variantContext); + } + } + + final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8); + final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8); + Assert.assertEquals(actualContents, expectedContents); } @Test - public void testVCFHeaderSampleRenamingSingleSampleVCF() throws Exception { + public void testSampleRenamingSingleSample() throws Exception { final VCFCodec codec = new VCFCodec(); codec.setRemappedSampleName("FOOSAMPLE"); final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "HiSeq.10000.vcf"))); @@ -120,57 +117,25 @@ public void testVCFHeaderSampleRenamingSingleSampleVCF() throws Exception { } } - @DataProvider - public Object[][] testVCFHeaderDictionaryMergingData() { + @DataProvider(name="testSampleRenamingFailsTests") + public Object[][] testSampleRenamingFailsTests() { return new Object[][]{ - {"diagnosis_targets_testfile.vcf"}, // numerically ordered contigs - {"dbsnp_135.b37.1000.vcf"} // lexicographically ordered contigs + {variantTestDataRoot + "ex2.vcf"}, // multi sample vcf + {variantTestDataRoot + "dbsnp_135.b37.1000.vcf"} // sites only vcf }; } - @Test(dataProvider = "testVCFHeaderDictionaryMergingData") - public void testVCFHeaderDictionaryMerging(final String vcfFileName) { - 
final VCFHeader headerOne = new VCFFileReader(new File(variantTestDataRoot + vcfFileName), false).getFileHeader(); - final VCFHeader headerTwo = new VCFHeader(headerOne); // deep copy - final List sampleList = new ArrayList(); - sampleList.addAll(headerOne.getSampleNamesInOrder()); - - // Check that the two dictionaries start out the same - headerOne.getSequenceDictionary().assertSameDictionary(headerTwo.getSequenceDictionary()); - - // Run the merge command - final VCFHeader mergedHeader = new VCFHeader(VCFUtils.smartMergeHeaders(Arrays.asList(headerOne, headerTwo), false), sampleList); - - // Check that the mergedHeader's sequence dictionary matches the first two - mergedHeader.getSequenceDictionary().assertSameDictionary(headerOne.getSequenceDictionary()); - } - - @Test(expectedExceptions = TribbleException.class) - public void testVCFHeaderSampleRenamingMultiSampleVCF() throws Exception { - final VCFCodec codec = new VCFCodec(); - codec.setRemappedSampleName("FOOSAMPLE"); - final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "ex2.vcf"))); - final VCFHeader header = (VCFHeader) codec.readHeader(vcfIterator).getHeaderValue(); - } - - @Test(expectedExceptions = TribbleException.class) - public void testVCFHeaderSampleRenamingSitesOnlyVCF() throws Exception { + @Test(dataProvider = "testSampleRenamingFailsTests", expectedExceptions = TribbleException.class) + public void testSampleRenamingFails(final String fileName) throws IOException { final VCFCodec codec = new VCFCodec(); codec.setRemappedSampleName("FOOSAMPLE"); - final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator(AsciiLineReader.from(new FileInputStream(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"))); - final VCFHeader header = (VCFHeader) codec.readHeader(vcfIterator).getHeaderValue(); - } - - private VCFHeader getHiSeqVCFHeader() { - final File vcf = new 
File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"); - final VCFFileReader reader = new VCFFileReader(vcf, false); - final VCFHeader header = reader.getFileHeader(); - reader.close(); - return header; + final AsciiLineReaderIterator vcfIterator = new AsciiLineReaderIterator( + AsciiLineReader.from(new FileInputStream(fileName))); + codec.readHeader(vcfIterator).getHeaderValue(); } @Test - public void testVCFHeaderAddInfoLine() { + public void testAddInfoLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("TestInfoLine", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info line"); header.addMetaDataLine(infoLine); @@ -185,13 +150,8 @@ public void testVCFHeaderAddInfoLine() { Assert.assertFalse(header.getOtherHeaderLines().contains(infoLine), "TestInfoLine present in other header lines"); } - private static Collection asCollectionOfVCFHeaderLine(Collection headers) { - // create a collection of VCFHeaderLine so that contains tests work correctly - return headers.stream().map(h -> (VCFHeaderLine) h).collect(Collectors.toList()); - } - @Test - public void testVCFHeaderAddFormatLine() { + public void testAddFormatLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFFormatHeaderLine formatLine = new VCFFormatHeaderLine("TestFormatLine", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test format line"); header.addMetaDataLine(formatLine); @@ -207,11 +167,11 @@ public void testVCFHeaderAddFormatLine() { } @Test - public void testVCFHeaderAddFilterLine() { + public void testAddFilterLine() { final VCFHeader header = getHiSeqVCFHeader(); final String filterDesc = "TestFilterLine Description"; - final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine",filterDesc); - Assert.assertEquals(filterDesc,filterLine.getDescription()); + final VCFFilterHeaderLine filterLine = new VCFFilterHeaderLine("TestFilterLine", filterDesc); + Assert.assertEquals(filterDesc, 
filterLine.getDescription()); header.addMetaDataLine(filterLine); Assert.assertTrue(header.getFilterLines().contains(filterLine), "TestFilterLine not found in filter header lines"); @@ -225,10 +185,15 @@ public void testVCFHeaderAddFilterLine() { } @Test - public void testVCFHeaderAddContigLine() { + public void testAddContigLine() { final VCFHeader header = getHiSeqVCFHeader(); + // no contig lines in this header + Assert.assertTrue(header.getContigLines().isEmpty()); + final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( - "", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0); + "", VCFHeaderVersion.VCF4_0, 0); + Assert.assertEquals(contigLine.getKey(), VCFHeader.CONTIG_KEY); + Assert.assertEquals(contigLine.getID(), "chr1"); header.addMetaDataLine(contigLine); Assert.assertTrue(header.getContigLines().contains(contigLine), "Test contig line not found in contig header lines"); @@ -241,10 +206,70 @@ public void testVCFHeaderAddContigLine() { } @Test - public void testVCFHeaderContigLineMissingLength() { + public void testAddContigLineExactDuplicateSilentlyDropped() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); + + final int numContigLinesBefore = header.getContigLines().size(); + // try to re-add the first contig line + header.addMetaDataLine(header.getContigLines().get(0)); + final int numContigLinesAfter = header.getContigLines().size(); + + // assert that we have the same number of contig lines before and after + Assert.assertEquals(numContigLinesBefore, numContigLinesAfter); + } + + @Test + public void testAddContigLineWithDifferentAttributesSilentlyDropped() { + final VCFContigHeaderLine contigOneNoAssembly = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + }}, + 0); + final VCFContigHeaderLine contigOneWithAssembly = new VCFContigHeaderLine( + new 
LinkedHashMap() {{ + put("ID", "1"); + put("length", "123"); + put("assembly", "b37"); + }}, + 1); + Assert.assertNotEquals(contigOneNoAssembly.hashCode(), contigOneWithAssembly.hashCode()); + + final Set headerLineSet = VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION); + headerLineSet.add(contigOneNoAssembly); + headerLineSet.add(contigOneWithAssembly); + Assert.assertEquals(headerLineSet.size(), 3); // one fileformat line, plus 2 contig lines + + // silently drops contigOneNoAssembly since it has the same ID AND contig index as contigOneWithAssembly + final VCFHeader vcfHeader = new VCFHeader(headerLineSet); + final Set allMetaDataInput = vcfHeader.getMetaDataInInputOrder(); + Assert.assertEquals(allMetaDataInput.size(), 2); + final Set allMetaDataSorted = vcfHeader.getMetaDataInSortedOrder(); + Assert.assertEquals(allMetaDataSorted.size(), 2); + final List allContigLines = vcfHeader.getContigLines(); + Assert.assertEquals(allContigLines.size(), 1); // one contig + Assert.assertEquals(allContigLines.get(0).getGenericFieldValue("assembly"), "b37"); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddContigLineRejectDuplicateContigIndex() { + final VCFHeader header = new VCFHeader(); + // add two contig lines that share an index, but have different IDs and represent different contigs + final VCFContigHeaderLine contigLine1 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + final VCFContigHeaderLine contigLine2 = new VCFContigHeaderLine("", VCFHeaderVersion.VCF4_2, 0); + + header.addMetaDataLine(contigLine1); + header.addMetaDataLine(contigLine2); + } + + @Test + public void testAddContigLineMissingLength() { final VCFHeader header = getHiSeqVCFHeader(); final VCFContigHeaderLine contigLine = new VCFContigHeaderLine( - "", VCFHeaderVersion.VCF4_0, VCFHeader.CONTIG_KEY, 0); + "", VCFHeaderVersion.VCF4_0, 0); header.addMetaDataLine(contigLine); Assert.assertTrue(header.getContigLines().contains(contigLine), "Test 
contig line not found in contig header lines"); Assert.assertTrue(header.getMetaDataInInputOrder().contains(contigLine), "Test contig line not found in set of all header lines"); @@ -252,58 +277,66 @@ public void testVCFHeaderContigLineMissingLength() { final SAMSequenceDictionary sequenceDictionary = header.getSequenceDictionary(); Assert.assertNotNull(sequenceDictionary); Assert.assertEquals(sequenceDictionary.getSequence("chr1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); - } - @Test - public void testVCFHeaderHonorContigLineOrder() throws IOException { + @Test + public void testGetContigLinesHonorsSortOrder() { + // NOTE: this test file has *lexicographically* ordered contigs try (final VCFFileReader vcfReader = new VCFFileReader(new File(variantTestDataRoot + "dbsnp_135.b37.1000.vcf"), false)) { // start with a header with a bunch of contig lines final VCFHeader header = vcfReader.getFileHeader(); - final List originalHeaderList = header.getContigLines(); - Assert.assertTrue(originalHeaderList.size() > 0); - - // copy the contig lines to a new list, sticking an extra contig line in the middle - final List orderedList = new ArrayList<>(); - final int splitInTheMiddle = originalHeaderList.size() / 2; - orderedList.addAll(originalHeaderList.subList(0, splitInTheMiddle)); - final VCFContigHeaderLine outrageousContigLine = new VCFContigHeaderLine( - "", + final List originalContigsInSortedOrder = header.getContigLines(); + Assert.assertTrue(originalContigsInSortedOrder.size() > 0); + + // copy the contig lines to a new list + final List confoundedList = new ArrayList<>(); + final int midPoint = originalContigsInSortedOrder.size() / 2; + confoundedList.addAll(originalContigsInSortedOrder.subList(0, midPoint)); + + // deliberately stick an extra contig line in the middle of the list, but using a contig index + // that will cause the line to sort to the end + final String newContigID = "newContigID"; + final int newContigIndex = 
originalContigsInSortedOrder.size(); + final VCFContigHeaderLine newContigLine = new VCFContigHeaderLine( + String.format( + "", newContigID), VCFHeaderVersion.VCF4_2, - VCFHeader.CONTIG_KEY, - 0); - orderedList.add(outrageousContigLine); - // make sure the extra contig line is outrageous enough to not collide with a real contig ID - Assert.assertTrue(orderedList.contains(outrageousContigLine)); - orderedList.addAll(originalHeaderList.subList(splitInTheMiddle, originalHeaderList.size())); - Assert.assertEquals(originalHeaderList.size() + 1, orderedList.size()); - - // crete a new header from the ordered list, and test that getContigLines honors the input order - final VCFHeader orderedHeader = new VCFHeader(); - orderedList.forEach(hl -> orderedHeader.addMetaDataLine(hl)); - Assert.assertEquals(orderedList, orderedHeader.getContigLines()); + newContigIndex); + confoundedList.add(newContigLine); + confoundedList.addAll(originalContigsInSortedOrder.subList(midPoint, originalContigsInSortedOrder.size())); + + // make sure the new contig line was actually added + Assert.assertEquals(originalContigsInSortedOrder.size() + 1, confoundedList.size()); + Assert.assertTrue(confoundedList.contains(newContigLine)); + + // create a new header from the confounded list, call getContigLines() on the header, and validate + // that the new line is included in the resulting list, and is at the end + final VCFHeader newHeader = new VCFHeader(); + confoundedList.forEach(hl -> newHeader.addMetaDataLine(hl)); + final List roundTrippedLines = newHeader.getContigLines(); + Assert.assertEquals(roundTrippedLines.size(), originalContigsInSortedOrder.size() + 1); + Assert.assertEquals(roundTrippedLines.get(roundTrippedLines.size() - 1), newContigLine); + + // make sure the sequence dictionary has the contig with the correct contig index, and in + // the same relative location in the dictionary (at the end of the list) + final SAMSequenceDictionary orderedSeqDict = 
newHeader.getSequenceDictionary(); + Assert.assertEquals( + orderedSeqDict.getSequence(newContigID).getSequenceIndex(), + roundTrippedLines.size() - 1); + Assert.assertEquals( + orderedSeqDict.getSequences().get(newHeader.getContigLines().size() - 1).getSequenceName(), + newContigID); } } @Test - public void testVCFSimpleHeaderLineGenericFieldGetter() { - VCFHeader header = createHeader(VCF4headerStrings); - List filters = header.getFilterLines(); - VCFFilterHeaderLine filterHeaderLine = filters.get(0); - Map genericFields = filterHeaderLine.getGenericFields(); - Assert.assertEquals(genericFields.get("ID"),"NoQCALL"); - Assert.assertEquals(genericFields.get("Description"),"Variant called by Dindel but not confirmed by QCALL"); - } - - @Test - public void testVCFHeaderAddOtherLine() { + public void testAddOtherLine() { final VCFHeader header = getHiSeqVCFHeader(); final VCFHeaderLine otherLine = new VCFHeaderLine("TestOtherLine", "val"); header.addMetaDataLine(otherLine); Assert.assertTrue(header.getOtherHeaderLines().contains(otherLine), "TestOtherLine not found in other header lines"); Assert.assertTrue(header.getMetaDataInInputOrder().contains(otherLine), "TestOtherLine not found in set of all header lines"); - Assert.assertNotNull(header.getOtherHeaderLine("TestOtherLine"), "Lookup for TestOtherLine by key failed"); Assert.assertFalse(asCollectionOfVCFHeaderLine(header.getInfoHeaderLines()).contains(otherLine), "TestOtherLine present in info header lines"); Assert.assertFalse(asCollectionOfVCFHeaderLine(header.getFormatHeaderLines()).contains(otherLine), "TestOtherLine present in format header lines"); @@ -312,15 +345,16 @@ public void testVCFHeaderAddOtherLine() { } @Test - public void testVCFHeaderAddMetaDataLineDoesNotDuplicateContigs() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + public void testAddMetaDataLineDoesNotDuplicateContigs() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - VCFFileReader 
reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); final int numContigLinesBefore = header.getContigLines().size(); - VCFInfoHeaderLine newInfoField = new VCFInfoHeaderLine("test", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info field"); + final VCFInfoHeaderLine newInfoField = new VCFInfoHeaderLine( + "test", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "test info field"); header.addMetaDataLine(newInfoField); // getting the sequence dictionary was failing due to duplicating contigs in issue #214, @@ -333,109 +367,280 @@ public void testVCFHeaderAddMetaDataLineDoesNotDuplicateContigs() { } @Test - public void testVCFHeaderAddDuplicateContigLine() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - - VCFFileReader reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); - - - final int numContigLinesBefore = header.getContigLines().size(); - // try to readd the first contig line - header.addMetaDataLine(header.getContigLines().get(0)); - final int numContigLinesAfter = header.getContigLines().size(); - - // assert that we have the same number of contig lines before and after - Assert.assertEquals(numContigLinesBefore, numContigLinesAfter); - } - - @Test - public void testVCFHeaderAddDuplicateHeaderLine() { - File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); + public void testAddDuplicateKeyValueHeaderLine() { + final File input = new File("src/test/resources/htsjdk/variant/ex2.vcf"); - VCFFileReader reader = new VCFFileReader(input, false); - VCFHeader header = reader.getFileHeader(); + final VCFFileReader reader = new VCFFileReader(input, false); + final VCFHeader header = reader.getFileHeader(); - VCFHeaderLine newHeaderLine = new VCFHeaderLine("key", "value"); + final VCFHeaderLine newHeaderLine = new 
VCFHeaderLine("key", "value"); // add this new header line header.addMetaDataLine(newHeaderLine); final int numHeaderLinesBefore = header.getOtherHeaderLines().size(); - // readd the same header line + // add the same header line again header.addMetaDataLine(newHeaderLine); final int numHeaderLinesAfter = header.getOtherHeaderLines().size(); - // assert that we have the same number of other header lines before and after + // Note: we don't allow duplicate unstructured lines with the same key unless they have + // different content + // assert that we have the same number of other header lines before and after Assert.assertEquals(numHeaderLinesBefore, numHeaderLinesAfter); } + @Test + public void testSimpleHeaderLineGenericFieldGetter() { + final VCFHeader header = VCFHeaderUnitTestData.createHeaderFromString(VCFHeaderUnitTestData.getVCFV42TestHeaderString()); + final List filters = header.getFilterLines(); + final VCFFilterHeaderLine filterHeaderLine = filters.get(0); + final Map genericFields = filterHeaderLine.getGenericFields(); + Assert.assertEquals(genericFields.get("ID"),"NoQCALL"); + Assert.assertEquals(genericFields.get("Description"),"Variant called by Dindel but not confirmed by QCALL"); + } + + @Test + public void testSerialization() throws Exception { + final VCFFileReader reader = new VCFFileReader(new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"), false); + final VCFHeader originalHeader = reader.getFileHeader(); + reader.close(); + + final VCFHeader deserializedHeader = TestUtil.serializeAndDeserialize(originalHeader); + + Assert.assertEquals(deserializedHeader.getMetaDataInInputOrder(), originalHeader.getMetaDataInInputOrder(), "Header metadata does not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getContigLines(), originalHeader.getContigLines(), "Contig header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getFilterLines(), originalHeader.getFilterLines(), "Filter header lines do 
not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getFormatHeaderLines(), originalHeader.getFormatHeaderLines(), "Format header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getIDHeaderLines(), originalHeader.getIDHeaderLines(), "ID header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getInfoHeaderLines(), originalHeader.getInfoHeaderLines(), "Info header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getOtherHeaderLines(), originalHeader.getOtherHeaderLines(), "Other header lines do not match before/after serialization"); + Assert.assertEquals(deserializedHeader.getGenotypeSamples(), originalHeader.getGenotypeSamples(), "Genotype samples not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.samplesWereAlreadySorted(), originalHeader.samplesWereAlreadySorted(), "Sortedness of samples not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.getSampleNamesInOrder(), originalHeader.getSampleNamesInOrder(), "Sorted list of sample names in header not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.getSampleNameToOffset(), originalHeader.getSampleNameToOffset(), "Sample name to offset map not the same before/after serialization"); + Assert.assertEquals(deserializedHeader.toString(), originalHeader.toString(), "String representation of header not the same before/after serialization"); + } + @DataProvider(name="validHeaderVersionTransitions") public Object[][] validHeaderVersionTransitions() { - // v4.3 can never transition, all other version transitions are allowed + // all (forward) version transitions are allowed return new Object[][] { + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF3_2, 
VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_2, VCFHeaderVersion.VCF4_3}, + + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_0}, {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_1}, {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_1}, {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_3} }; } @DataProvider(name="invalidHeaderVersionTransitions") public Object[][] invalidHeaderVersionTransitions() { - // v4.3 can never transition with, all other version transitions are allowed return new Object[][] { - {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0}, - {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1}, + //reject any attempt to go backwards in time {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_2}, - {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF4_3}, - {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_3}, - {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_3, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_2, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_1, 
VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0, VCFHeaderVersion.VCF3_2}, + + {VCFHeaderVersion.VCF3_3, VCFHeaderVersion.VCF3_2}, }; } @Test(dataProvider="validHeaderVersionTransitions") - public void testValidHeaderVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - doHeaderTransition(fromVersion, toVersion); + public void testAddVersionLineValidTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(fromVersion), Collections.emptySet()); + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(toVersion)); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), toVersion); } @Test(dataProvider="invalidHeaderVersionTransitions", expectedExceptions = TribbleException.class) - public void testInvalidHeaderVersionTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - doHeaderTransition(fromVersion, toVersion); + public void testAddVersionInvalidTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { + new VCFHeader(VCFHeader.makeHeaderVersionLineSet(fromVersion), Collections.emptySet()) + .addMetaDataLine(VCFHeader.makeHeaderVersionLine(toVersion)); + } + + @DataProvider(name = "vcfVersions") + public Object[][] vcfVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @Test(expectedExceptions = TribbleException.class) + public void testVersionUpgradeWithValidationFailure() { + // test mixing header versions where the old version header has a line that fails validation + // using the resulting (newer) version + + // create a 4.2 header with a 4.2 style pedigree line (one that has no ID) + final VCFHeader vcfHeader = new 
VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + + // now try to force a version upgrade to 4.3, old style pedigree line should cause a failure + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_3)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddLineWithValidationFailure() { + // create a 4.3 header, and then try to add an old-style pedigree line (one that has no ID) + // which should cause a failure + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + } + + + @Test(expectedExceptions = TribbleException.class) + public void testConstructorRequiresFileFormatLine() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions + // create a new header from this set (containing no fileformat line), no requested version in constructor + new VCFHeader(metaDataSet, Collections.emptySet()); //defaults to v4.2 } - private void doHeaderTransition(final VCFHeaderVersion fromVersion, final VCFHeaderVersion toVersion) { - final VCFHeader vcfHeader = - fromVersion == null ? 
- new VCFHeader() : - new VCFHeader(fromVersion, Collections.EMPTY_SET, Collections.EMPTY_SET); - vcfHeader.setVCFHeaderVersion(toVersion); + @Test(dataProvider = "vcfVersions") + public void testConstructorWithSingleFileFormatLine(final VCFHeaderVersion vcfVersion) { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions + + // add in the corresponding fileformat line; create a new versioned header + // since the version requested in the constructor and the format lines are in sync, there is + // no conflict, and the resulting header's version should always match the requested version + metaDataSet.add(VCFHeader.makeHeaderVersionLine(vcfVersion)); + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), vcfVersion); } @Test - public void testVCFHeaderSerialization() throws Exception { - final VCFFileReader reader = new VCFFileReader(new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"), false); - final VCFHeader originalHeader = reader.getFileHeader(); - reader.close(); + public void testConstructorWithMultipleFileFormatLines() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + final int beforeSize = metaDataSet.size(); - final VCFHeader deserializedHeader = TestUtil.serializeAndDeserialize(originalHeader); + // multiple version lines will be ignored, with only the last one retained + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + Assert.assertEquals(metaDataSet.size(), beforeSize + 2); - Assert.assertEquals(deserializedHeader.getMetaDataInInputOrder(), originalHeader.getMetaDataInInputOrder(), "Header metadata does not match before/after serialization"); - 
Assert.assertEquals(deserializedHeader.getContigLines(), originalHeader.getContigLines(), "Contig header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getFilterLines(), originalHeader.getFilterLines(), "Filter header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getFormatHeaderLines(), originalHeader.getFormatHeaderLines(), "Format header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getIDHeaderLines(), originalHeader.getIDHeaderLines(), "ID header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getInfoHeaderLines(), originalHeader.getInfoHeaderLines(), "Info header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getOtherHeaderLines(), originalHeader.getOtherHeaderLines(), "Other header lines do not match before/after serialization"); - Assert.assertEquals(deserializedHeader.getGenotypeSamples(), originalHeader.getGenotypeSamples(), "Genotype samples not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.samplesWereAlreadySorted(), originalHeader.samplesWereAlreadySorted(), "Sortedness of samples not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.getSampleNamesInOrder(), originalHeader.getSampleNamesInOrder(), "Sorted list of sample names in header not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.getSampleNameToOffset(), originalHeader.getSampleNameToOffset(), "Sample name to offset map not the same before/after serialization"); - Assert.assertEquals(deserializedHeader.toString(), originalHeader.toString(), "String representation of header not the same before/after serialization"); + // create a new versioned header from this set + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + 
Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_1); + } + + @Test(expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testConstructorWithInvalidLineForVersion() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", "id"); + metaDataSet.add(new VCFPedigreeHeaderLine(attributes)); + new VCFHeader(metaDataSet, Collections.emptySet()); + } + + @Test(expectedExceptions = TribbleException.VersionValidationFailure.class) + public void testAddMetaDataLineInvalidForVersion() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + final VCFHeader header = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(header.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_2); + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", "id"); + header.addMetaDataLine(new VCFPedigreeHeaderLine(attributes)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddMetaDataLineFileFormat() { + final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions + final int beforeSize = metaDataSet.size(); + + metaDataSet.add(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + Assert.assertEquals(metaDataSet.size(), beforeSize + 1); + + // create a new versioned header from this set + final VCFHeader vcfHeader = new VCFHeader(metaDataSet, Collections.emptySet()); + Assert.assertEquals(vcfHeader.getVCFHeaderVersion(), VCFHeaderVersion.VCF4_1); + + // add a new line that uses the same header version 
already established + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + + // add a new line that tries to move the version forward + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_2)); + + // now try to go backwards (throws) + vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); + } + + @Test + public void testPreserveSequenceDictionaryAttributes() { + // Round trip a SAMSequenceDictionary with attributes, through a VCFHeader, and back + // to a SAMSequenceDictionary with the same attributes. + // https://github.com/samtools/htsjdk/issues/730 + + final String assemblyString = "hg37"; + final String md5String = "68b329da9893e34099c7d8ad5cb9c940"; + final String speciesString = "Home Sapiens"; + final String urlString = "http://www.refserve.org:8080/path/"; + + final SAMSequenceDictionary samDict = new SAMSequenceDictionary(); + + final SAMSequenceRecord seqRec1 = new SAMSequenceRecord("1", 1); + seqRec1.setAssembly(assemblyString); + seqRec1.setMd5(md5String); + seqRec1.setAttribute(SAMSequenceRecord.URI_TAG, urlString); + seqRec1.setSpecies(speciesString); + final SAMSequenceRecord seqRec2 = new SAMSequenceRecord("2", 1); + samDict.addSequence(seqRec1); + samDict.addSequence(seqRec2); + + final VCFHeader vcfHeader = new VCFHeader(); + vcfHeader.setSequenceDictionary(samDict); + final SAMSequenceDictionary roundTrippedDict = vcfHeader.getSequenceDictionary(); + + final SAMSequenceRecord rtRec1 = roundTrippedDict.getSequence("1"); + Assert.assertEquals(assemblyString, rtRec1.getAssembly()); + Assert.assertEquals(md5String, rtRec1.getMd5()); + Assert.assertEquals(urlString, rtRec1.getAttribute(SAMSequenceRecord.URI_TAG)); + Assert.assertEquals(speciesString, rtRec1.getSpecies()); + + Assert.assertEquals(seqRec1, roundTrippedDict.getSequence("1")); // somewhat redundant check on full record + Assert.assertEquals(seqRec2, roundTrippedDict.getSequence("2")); } + 
///////////////////////////////////////////////////////////////// + ////////////////************************* End new tests block... + ///////////////////////////////////////////////////////////////// + @Test public void testVCFHeaderQuoteEscaping() throws Exception { // this test ensures that the end-to-end process of quote escaping is stable when headers are @@ -449,10 +654,9 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFHeader originalHeader = originalFileReader.getFileHeader(); // add a header line with quotes to the header - final Map attributes = new LinkedHashMap<>(); - attributes.put("ID", "VariantFiltration"); - attributes.put("CommandLineOptions", "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); - final VCFSimpleHeaderLine addedHeaderLine = new VCFSimpleHeaderLine("GATKCommandLine.Test", attributes); + final VCFSimpleHeaderLine addedHeaderLine = new VCFFilterHeaderLine( + "FakeFilter", + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); originalHeader.addMetaDataLine(addedHeaderLine); final VCFFilterHeaderLine originalCopyAnnotationLine1 = originalHeader.getFilterHeaderLine("ANNOTATION"); @@ -487,7 +691,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { firstCopyWriter.writeHeader(originalHeader); final CloseableIterator firstCopyVariantIterator = originalFileReader.iterator(); while (firstCopyVariantIterator.hasNext()) { - VariantContext variantContext = firstCopyVariantIterator.next(); + final VariantContext variantContext = firstCopyVariantIterator.next(); firstCopyWriter.add(variantContext); } originalFileReader.close(); @@ -496,7 +700,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { // read the copied file back in final VCFFileReader firstCopyReader = new VCFFileReader(firstCopyVCFFile, false); final VCFHeader firstCopyHeader = firstCopyReader.getFileHeader(); - final VCFHeaderLine firstCopyNewHeaderLine = 
firstCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine firstCopyNewHeaderLine = firstCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(firstCopyNewHeaderLine); final VCFFilterHeaderLine firstCopyAnnotationLine1 = firstCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -530,7 +734,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { secondCopyWriter.writeHeader(firstCopyHeader); final CloseableIterator secondCopyVariantIterator = firstCopyReader.iterator(); while (secondCopyVariantIterator.hasNext()) { - VariantContext variantContext = secondCopyVariantIterator.next(); + final VariantContext variantContext = secondCopyVariantIterator.next(); secondCopyWriter.add(variantContext); } secondCopyWriter.close(); @@ -539,7 +743,7 @@ public void testVCFHeaderQuoteEscaping() throws Exception { final VCFFileReader secondCopyReader = new VCFFileReader(secondCopyVCFFile, false); final VCFHeader secondCopyHeader = secondCopyReader.getFileHeader(); - final VCFHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getOtherHeaderLine("GATKCommandLine.Test"); + final VCFFilterHeaderLine secondCopyNewHeaderLine = secondCopyHeader.getFilterHeaderLine("FakeFilter"); Assert.assertNotNull(secondCopyNewHeaderLine); final VCFFilterHeaderLine secondCopyAnnotationLine1 = secondCopyHeader.getFilterHeaderLine("ANNOTATION"); @@ -549,8 +753,8 @@ public void testVCFHeaderQuoteEscaping() throws Exception { Assert.assertNotNull(secondCopyAnnotationLine2); Assert.assertEquals(firstCopyNewHeaderLine, secondCopyNewHeaderLine); - Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); - Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), "GATKCommandLine.Test="); + Assert.assertEquals(firstCopyNewHeaderLine.toStringEncoding(), "FILTER="); + Assert.assertEquals(secondCopyNewHeaderLine.toStringEncoding(), "FILTER="); Assert.assertEquals(firstCopyAnnotationLine1, secondCopyAnnotationLine1); 
Assert.assertEquals(secondCopyAnnotationLine1.getGenericFieldValue("Description"), "ANNOTATION != \"NA\" || ANNOTATION <= 0.01"); @@ -574,136 +778,153 @@ public void testVCFHeaderQuoteEscaping() throws Exception { } - @Test - public void testVcf42Roundtrip() throws Exception { - // this test ensures that source/version fields are round-tripped properly + ///////////////////////////////////////////////////////////////////// + // Private helper methods + ///////////////////////////////////////////////////////////////////// - // read an existing VCF - File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); + // Serialize/encode the header to a file, read metaData back in + private Set getRoundTripEncoded(final VCFHeader header) throws IOException { + final File myTempFile = File.createTempFile("VCFHeader", "vcf"); + try (final VariantContextWriter vcfWriter = + new VariantContextWriterBuilder() + .setOutputFile(myTempFile) + .setOutputFileType(VariantContextWriterBuilder.OutputType.VCF) + .setOptions(VariantContextWriterBuilder.NO_OPTIONS) + .build()) { + vcfWriter.writeHeader(header); + } + final VCFHeader vcfHeader = (VCFHeader) new VCFCodec().readActualHeader(new LineIteratorImpl( + new SynchronousLineReader(new FileReader(myTempFile.getAbsolutePath())))); + return vcfHeader.getMetaDataInSortedOrder(); + } - // write the file out into a new copy - final File actualFile = File.createTempFile("testVcf4.2roundtrip.", FileExtensions.VCF); - actualFile.deleteOnExit(); - try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile, false); - final VariantContextWriter copyWriter = new VariantContextWriterBuilder() - .setOutputFile(actualFile) - .setReferenceDictionary(createArtificialSequenceDictionary()) - .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) - .build() - ) { - final VCFHeader originalHeader = originalFileReader.getFileHeader(); - - 
copyWriter.writeHeader(originalHeader); - for (final VariantContext variantContext : originalFileReader) { - copyWriter.add(variantContext); - } - } + private VCFHeader getHiSeqVCFHeader() { + final File vcf = new File("src/test/resources/htsjdk/variant/HiSeq.10000.vcf"); + final VCFFileReader reader = new VCFFileReader(vcf, false); + final VCFHeader header = reader.getFileHeader(); + reader.close(); + return header; + } - final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8); - final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8); - Assert.assertEquals(actualContents, expectedContents); + private static Collection asCollectionOfVCFHeaderLine(final Collection headers) { + // create a collection of VCFHeaderLine so that contains tests work correctly + return headers.stream().map(h -> (VCFHeaderLine) h).collect(Collectors.toList()); } + @DataProvider(name="duplicateHeaderLineCases") + private Object[][] getDuplicateHeaderLineCases() { + return new Object[][] { - /** - * a little utility function for all tests to md5sum a file - * Shameless taken from: - *

- * http://www.javalobby.org/java/forums/t84420.html - * - * @param file the file - * @return a string - */ - private static String md5SumFile(File file) { - MessageDigest digest; - try { - digest = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new RuntimeException("Unable to find MD5 digest"); - } - InputStream is; - try { - is = new FileInputStream(file); - } catch (FileNotFoundException e) { - throw new RuntimeException("Unable to open file " + file); - } - byte[] buffer = new byte[8192]; - int read; - try { - while ((read = is.read(buffer)) > 0) { - digest.update(buffer, 0, read); - } - byte[] md5sum = digest.digest(); - BigInteger bigInt = new BigInteger(1, md5sum); - return bigInt.toString(16); - - } catch (IOException e) { - throw new RuntimeException("Unable to process file for MD5", e); - } finally { - try { - is.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close input stream for MD5 calculation", e); - } - } + // these tests use VCFAltHeaderLine to test structured/ID lines, but the behavior should be the same + // for any header ID line + + // duplicate IDs, duplicate description; line is dropped due to duplicate ID + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("X", "description1"), false }, + // duplicate IDs, different descriptions; line is dropped due to duplicate ID + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("X", "description2"), false }, + // different IDs, different descriptions; line is retained + { new VCFAltHeaderLine("X", "description1"), + new VCFAltHeaderLine("Y", "description2"), true }, + // different IDs, duplicate descriptions; line is retained + { new VCFAltHeaderLine("X", "description"), + new VCFAltHeaderLine("Y", "description"), true }, + + // .......unstructured header lines........ 
+ + // duplicate key, duplicate value, line is dropped + { new VCFHeaderLine("CommandLine", "command"), new VCFHeaderLine("CommandLine", "command"), false }, + // duplicate key, different value, line is retained + { new VCFHeaderLine("CommandLine", "command1"), new VCFHeaderLine("CommandLine", "command2"), true }, + + /////////////////////////////////////////////////////////////////////////////////////////// + // since the VCFHeaderLine constructor is public, it can be used erroneously to model header + // lines that have structured syntax, but which will not obey structured header line rules, + // since those are enabled via VCFSimpleHeaderLine, and VCFHeaderLine is intended to be used + // for non-structured lines. so include some tests that simulate this + + // duplicate key, duplicate value (...duplicate ID), line is dropped + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), false }, + // duplicate key, different value (different ID), line is retained + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), true }, + + //NOTE: this case illustrates how its possible to use the API to cause two structured lines + // with duplicate IDs to be retained if they are not modeled as VCFStructuredHeaderLines + // duplicate key, different value (but IDENTICAL ID), line is RETAINED + { new VCFHeaderLine("KEY", ""), new VCFHeaderLine("KEY", ""), true }, + + // different key, duplicate value, line is retained + { new VCFHeaderLine("KEY1", ""), new VCFHeaderLine("KEY2", ""), true }, + // different key, different value, line is retained + { new VCFHeaderLine("KEY1", ""), new VCFHeaderLine("KEY2", ""), true }, + }; } - private void checkMD5ofHeaderFile(VCFHeader header, String md5sum) { - File myTempFile = null; - PrintWriter pw = null; - try { - myTempFile = File.createTempFile("VCFHeader", "vcf"); - myTempFile.deleteOnExit(); - pw = new PrintWriter(myTempFile); - } catch (IOException e) { - Assert.fail("Unable to make a temp file!"); - } - for 
(VCFHeaderLine line : header.getMetaDataInSortedOrder()) - pw.println(line); - pw.close(); - Assert.assertEquals(md5SumFile(myTempFile), md5sum); - } - - public static final int VCF4headerStringCount = 16; - - public static final String VCF4headerStrings = - "##fileformat=VCFv4.2\n" + - "##filedate=2010-06-21\n" + - "##reference=NCBI36\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##FILTER=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; - - - public static final String VCF4headerStrings_with_negativeOne = - "##fileformat=VCFv4.2\n" + - "##filedate=2010-06-21\n" + - "##reference=NCBI36\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##INFO=\n" + - "##FILTER=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "##FORMAT=\n" + - "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + @Test(dataProvider = "duplicateHeaderLineCases") + private void testDuplicateHeaderLine(final VCFHeaderLine hl1, final VCFHeaderLine hl2, final boolean expectHL2Retained) { + final Set lineSet = VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_2); + lineSet.add(hl1); + lineSet.add(hl2); + final VCFHeader vcfHeader = new VCFHeader(lineSet); + + Assert.assertEquals(vcfHeader.getMetaDataInInputOrder().size(), expectHL2Retained ? 
3 : 2); + } + + @Test + public void testAddOtherHeaderLineUnique() { + final String TEST_KEY = "testKey"; + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFHeaderLine otherLine1 = new VCFHeaderLine(TEST_KEY, "Test Value 1"); + vcfHeader.addMetaDataLine(otherLine1); + final List otherLines1 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines1.size(), 1); + Assert.assertTrue(otherLines1.contains(otherLine1)); + + // now add a second line + final VCFHeaderLine otherLine2 = new VCFHeaderLine(TEST_KEY, "Test Value 2"); + vcfHeader.addMetaDataLine(otherLine2); + final List otherLines2 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines2.size(), 2); + Assert.assertTrue(otherLines2.contains(otherLine1)); + Assert.assertTrue(otherLines2.contains(otherLine2)); + + // now call addOtherHeaderLineUnique with a 3rd line, the first two should be removed + final VCFHeaderLine otherLine3= new VCFHeaderLine(TEST_KEY, "Test Value 3"); + vcfHeader.addOtherHeaderLineUnique(otherLine3); + final List otherLines3 = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines3.size(), 1); + Assert.assertFalse(otherLines3.contains(otherLine1)); + Assert.assertFalse(otherLines3.contains(otherLine2)); + Assert.assertTrue(otherLines3.contains(otherLine3)); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddOtherHeaderLineUniqueRejectsIDLines() { + final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFSimpleHeaderLine simpleHeaderLine = new VCFSimpleHeaderLine("testKey", "testID","test description"); + vcfHeader.addOtherHeaderLineUnique(simpleHeaderLine); + } + + @Test(expectedExceptions = TribbleException.class) + public void testGetOtherHeaderLineUnique() { + final String TEST_KEY = "testKey"; + final VCFHeader vcfHeader = new 
VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeader.DEFAULT_VCF_VERSION)); + + // now add two lines with the same key + final VCFHeaderLine otherLine1 = new VCFHeaderLine(TEST_KEY, "Test Value 1"); + vcfHeader.addMetaDataLine(otherLine1); + final VCFHeaderLine otherLine2 = new VCFHeaderLine(TEST_KEY, "Test Value 2"); + vcfHeader.addMetaDataLine(otherLine2); + + final List otherLines = vcfHeader.getOtherHeaderLines(TEST_KEY); + Assert.assertEquals(otherLines.size(), 2); + Assert.assertTrue(otherLines.contains(otherLine1)); + Assert.assertTrue(otherLines.contains(otherLine2)); + + // now call getOtherHeaderLineUnique, should throw + vcfHeader.getOtherHeaderLineUnique(TEST_KEY); + } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java new file mode 100644 index 0000000000..7b57a19b5a --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java @@ -0,0 +1,203 @@ +package htsjdk.variant.vcf; + +import htsjdk.tribble.TribbleException; +import htsjdk.tribble.readers.LineIteratorImpl; +import htsjdk.tribble.readers.SynchronousLineReader; +import org.testng.Assert; + +import java.io.StringReader; +import java.util.*; + +// Unit test data used by unit tests for VCFHeader, VCFMetaDataLines, and VCFHeaderLine hierarchy. 
+public class VCFHeaderUnitTestData { + public final static VCFHeaderVersion TEST_VERSION = VCFHeader.DEFAULT_VCF_VERSION; + + // fileformat line + public static List getTestDefaultFileFormatLine() { + return new ArrayList() {{ + add(VCFHeader.makeHeaderVersionLine(TEST_VERSION)); + }}; + } + + // FILTER lines + public static List getTestFilterLines() { + return new ArrayList() {{ + add(new VCFFilterHeaderLine("LowQual", "Description=\"Low quality\"")); + add(new VCFFilterHeaderLine("highDP", "Description=\"DP < 8\"")); + add(new VCFFilterHeaderLine("TruthSensitivityTranche98.50to98.80", "Truth sensitivity tranche level at VSQ Lod: -0.1106 <= x < 0.6654")); + }}; + } + + // FORMAT lines + public static List getTestFormatLines() { + return new ArrayList() {{ + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_KEY, 1, VCFHeaderLineType.String, "Genotype")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_QUALITY_KEY, 1, VCFHeaderLineType.Integer, "Genotype Quality")); + add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth (reads with MQ=255 or with bad mates are filtered)")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, VCFHeaderLineCount.R, VCFHeaderLineType.Integer, "Allelic depths for the ref and alt alleles in the order listed")); + add(new VCFFormatHeaderLine(VCFConstants.PHASE_QUALITY_KEY, 1, VCFHeaderLineType.Float, "Read-backed phasing quality")); + add(new VCFFormatHeaderLine("MLPSAF", VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Maximum likelihood expectation (MLE) for the alternate allele fraction")); + add(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Genotype-level filter")); + }}; + } + + // INFO lines + 
public static List getTestInfoLines() { + return new ArrayList() {{ + add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + add(new VCFInfoHeaderLine(VCFConstants.DBSNP_KEY, 0, VCFHeaderLineType.Flag, "dbSNP Membership")); + add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Integer, "Approximate read depth; some reads may have been filtered")); + add(new VCFInfoHeaderLine(VCFConstants.STRAND_BIAS_KEY, 1, VCFHeaderLineType.Float, "Strand Bias")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_FREQUENCY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele Frequency, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_COUNT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Allele count in genotypes, for each ALT allele, in the same order as listed")); + add(new VCFInfoHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Total number of alleles in called genotypes")); + add(new VCFInfoHeaderLine(VCFConstants.MAPPING_QUALITY_ZERO_KEY, 1, VCFHeaderLineType.Integer, "Total Mapping Quality Zero Reads")); + add(new VCFInfoHeaderLine(VCFConstants.RMS_MAPPING_QUALITY_KEY, 1, VCFHeaderLineType.Float, "RMS Mapping Quality")); + add(new VCFInfoHeaderLine(VCFConstants.SOMATIC_KEY, 0, VCFHeaderLineType.Flag, "Somatic event")); + }}; + } + + // CONTIG lines + public static List getTestContigLines() { + return new ArrayList() {{ + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "1"), 0)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "2"), 1)); + add(new VCFContigHeaderLine(Collections.singletonMap("ID", "3"), 2)); + }}; + } + + //misc lines + public static List getTestMiscellaneousLines() { + return new ArrayList() {{ + add(new VCFHeaderLine("reference", "g37")); + add(new VCFHeaderLine("GATKCommandLine", "SelectVariants and such.")); + }}; + } + + //Return a full set of metadata lines, 
retaining order in a LinkedHashSet. + public static LinkedHashSet getTestMetaDataLinesSet() { + final LinkedHashSet allHeaderLines = new LinkedHashSet() {{ //preserve order + addAll(getTestDefaultFileFormatLine()); + addAll(getTestFilterLines()); + addAll(getTestFormatLines()); + addAll(getTestInfoLines()); + addAll(getTestContigLines()); + addAll(getTestMiscellaneousLines()); + }}; + Assert.assertEquals(allHeaderLines.size(), + 1 + // file format line + getTestFilterLines().size() + getTestFormatLines().size() + + getTestInfoLines().size() + getTestContigLines().size() + getTestMiscellaneousLines().size()); + return allHeaderLines; + } + + //Return a full set of metadata lines as a VCFMetaDataLines. + public static VCFMetaDataLines getTestMetaDataLines() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + md.addMetaDataLines(getTestMetaDataLinesSet()); + return md; + } + + private static final int VCF_4_HEADER_STRING_COUNT = 16; // 17 -1 for the #CHROM... line + + public static String getVCFV42TestHeaderString() { + return "##fileformat=VCFv4.2\n" + + "##filedate=2010-06-21\n" + + "##reference=NCBI36\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##FILTER=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + } + + public static final String VCF42headerStrings_with_negativeOne = + "##fileformat=VCFv4.2\n" + + "##filedate=2010-06-21\n" + + "##reference=NCBI36\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##INFO=\n" + + "##FILTER=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "##FORMAT=\n" + + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + + public static Set getV42HeaderLinesWITHOUTFormatString() { + // precondition - create a v42 VCFMetaDataLines and make sure its v42 + final Set metaDataSet = 
getV42HeaderLinesWITHFormatString(); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(metaDataSet); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals( + VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), + VCFHeaderVersion.VCF4_2); + + // remove the 4.2 version line from the original set, verify, and return the set with no fileformat string + metaDataSet.remove(versionLine); + Assert.assertNull(getVersionLineFromHeaderLineSet(metaDataSet)); + return metaDataSet; + } + + public static Set getV42HeaderLinesWITHFormatString() { + // precondition - create a v42 header and make sure its v42 + final VCFHeader header = createHeaderFromString(getVCFV42TestHeaderString()); + Assert.assertEquals( + header.getVCFHeaderVersion(), + VCFHeaderVersion.VCF4_2); + + // return a mutable set for test use + return new LinkedHashSet<>(header.getMetaDataInInputOrder()); + } + + public static VCFHeader createHeaderFromString(final String headerStr) { + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader( + new LineIteratorImpl(new SynchronousLineReader(new StringReader(headerStr)))); + Assert.assertEquals(header.getMetaDataInInputOrder().size(), VCF_4_HEADER_STRING_COUNT); + return header; + } + + /** + * Find and return the VCF fileformat/version line + * + * Return null if no fileformat/version lines are found + */ + private static VCFHeaderLine getVersionLineFromHeaderLineSet(final Set metaDataLines) { + VCFHeaderLine versionLine = null; + final List formatLines = new ArrayList<>(); + for (final VCFHeaderLine headerLine : metaDataLines) { + if (VCFHeaderVersion.isFormatString(headerLine.getKey())) { + formatLines.add(headerLine); + } + } + + if (!formatLines.isEmpty()) { + if (formatLines.size() > 1) { + //throw if there are duplicate version lines + throw new TribbleException("Multiple version header lines found in header line list"); + } 
+ return formatLines.get(0); + } + + return versionLine; + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java new file mode 100644 index 0000000000..9e2a82f15a --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java @@ -0,0 +1,86 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Test conditions that are unique to INFO lines (not covered by VCFCompoundHeaderLineUnitTest). + */ +public class VCFInfoHeaderLineUnitTest extends HtsjdkTest { + + @Test + public void testRepairInfoLineFlagTypeWithNonzeroCount() { + final VCFInfoHeaderLine infoLine = new VCFInfoHeaderLine("", VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(0, infoLine.getCount()); + } + + @DataProvider(name = "mergeCompatibleInfoLines") + public Object[][] getMergeCompatibleInfoLines() { + return new Object[][]{ + { + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", VCFHeader.DEFAULT_VCF_VERSION) + } + }; + } + + @Test(dataProvider = "mergeCompatibleInfoLines") + public void testMergeCompatibleInfoLines( + final VCFInfoHeaderLine infoHeaderLine1, + final VCFInfoHeaderLine infoHeaderLine2, + final VCFInfoHeaderLine expectedHeaderLine) { + Assert.assertEquals( + VCFInfoHeaderLine.getMergedInfoHeaderLine( + infoHeaderLine1, + infoHeaderLine2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(true)), + expectedHeaderLine); + } + + @DataProvider(name = "mergeIncompatibleInfoLines") + public Object[][] getMergeIncompatibleInfoLines() { + return new Object[][]{ + // 2 lines to merge, expected result + { + // mixed number AND number type (multiple different attributes) + new VCFInfoHeaderLine("INFO=", + 
VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION) + }, + { + // mixed number AND number type (multiple different attributes), reverse direction + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION), + new VCFInfoHeaderLine("INFO=", + VCFHeader.DEFAULT_VCF_VERSION) + } + }; + } + + @Test + public void testAllow1000GKey() { + final VCFInfoHeaderLine line = new VCFInfoHeaderLine( + "INFO=", + VCFHeader.DEFAULT_VCF_VERSION + ); + + // TODO change to VCFHeader.DEFAULT_VCF_VERSION + Assert.assertFalse(line.getValidationFailure(VCFHeaderVersion.VCF4_3).isPresent()); + } + + @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions= TribbleException.class) + public void testMergeIncompatibleInfoLines( + final VCFInfoHeaderLine infoHeaderLine1, + final VCFInfoHeaderLine infoHeaderLine2) { + VCFInfoHeaderLine.getMergedInfoHeaderLine( + infoHeaderLine1, + infoHeaderLine2, + new VCFHeaderMerger.HeaderMergeConflictWarnings(true)); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java new file mode 100644 index 0000000000..2e41536abe --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java @@ -0,0 +1,354 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class VCFMetaDataLinesUnitTest extends HtsjdkTest { + + @DataProvider(name="keyCollisions") + public Object[][] keyCollisions() { + return new Object[][] { + // line 1, line 2, expected to collide + + // Unstructured key collisions + { // same key, same value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value"), true + }, + { // same key, different value + new VCFHeaderLine("key", "value"), + new VCFHeaderLine("key", "value1"), 
false + }, + { // different key, same value + new VCFHeaderLine("key1", "value"), + new VCFHeaderLine("key2", "value"), false + }, + { // different key, different value + new VCFHeaderLine("key1", "value1"), + new VCFHeaderLine("key2", "value2"), false + }, + + // Structured key collisions + { // same key, same ID, same (base VCFSimpleHeaderLine) class + new VCFSimpleHeaderLine("FILTER", Collections.singletonMap("ID", "id")), + new VCFSimpleHeaderLine("FILTER", Collections.singletonMap("ID", "id")), true + }, + { // same key, same ID, same (derived-VCFSimpleHeaderLine) class, same attributes + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName", "unused description"), true + }, + { // same key, same ID, same class, different attributes + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName", "different unused description"), true + }, + { // same key, different ID + new VCFFilterHeaderLine("filterName", "unused description"), + new VCFFilterHeaderLine("filterName2", "unused description"), false + }, + { // This is an unfortunate case that is allowed by the existing permissive VCFHeader + // APIs; two header lines that have identical content, one of which is modeled by the + // VCFSimpleHeaderLine base class, and one of which is modeled by the specialized , + // derived VCFFilterHeaderLine class + new VCFFilterHeaderLine("id", "unused description"), + new VCFSimpleHeaderLine("FILTER", new LinkedHashMap() {{ + put("ID", "id"); + put("Description", "unused description"); + }}), true } + }; + } + + @Test(dataProvider="keyCollisions") + public void testKeyCollisions(final VCFHeaderLine line1, final VCFHeaderLine line2, final boolean expectCollision) { + final VCFMetaDataLines mdLines = new VCFMetaDataLines(); + mdLines.addMetaDataLine(line1); + mdLines.addMetaDataLine(line2); + Assert.assertEquals(mdLines.getMetaDataInInputOrder().size(), expectCollision ? 
1 : 2); + } + + @Test + public void testRetainFullHeaderLines() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), unitTestData.getTestMetaDataLinesSet().size()); + Assert.assertEquals(md.getMetaDataInSortedOrder().size(), unitTestData.getTestMetaDataLinesSet().size()); + + Assert.assertEquals(unitTestData.getTestFormatLines(), md.getFormatHeaderLines()); + Assert.assertEquals(unitTestData.getTestFilterLines(), md.getFilterLines()); + Assert.assertEquals(unitTestData.getTestInfoLines(), md.getInfoHeaderLines()); + Assert.assertEquals(unitTestData.getTestContigLines(), md.getContigLines()); + Assert.assertEquals(unitTestData.getTestFilterLines(), md.getFilterLines()); + + final Set otherLines = new LinkedHashSet<>(); + otherLines.addAll(unitTestData.getTestDefaultFileFormatLine()); + otherLines.addAll(unitTestData.getTestMiscellaneousLines()); + Assert.assertEquals(otherLines, md.getOtherHeaderLines()); + } + + @Test + public void testAddRemoveOtherMetaDataLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + int beforeAllSize = md.getMetaDataInInputOrder().size(); + int beforeStructuredSize = md.getIDHeaderLines().size(); + int beforeOtherSize = md.getOtherHeaderLines().size(); + + final VCFHeaderLine newLine = new VCFHeaderLine("foo", "bar"); + + // add one other line + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); // remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize + 1); + + // remove the other line and we're back to original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + 
Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); // still remains the same + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); + } + + @Test + public void testAddRemoveUniqueStructuredLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + final int beforeAllSize = md.getMetaDataInInputOrder().size(); + final int beforeStructuredSize = md.getIDHeaderLines().size(); + final int beforeFilterSize = md.getFilterLines().size(); + final int beforeOtherSize = md.getOtherHeaderLines().size(); + + // add a new, unique, structured line + final VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + md.addMetaDataLine(newLine); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // remains the same + + // remove the new line and we're back to original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + Assert.assertEquals(md.getOtherHeaderLines().size(), beforeOtherSize); // still remains the same + } + + @Test + public void testAddRemoveDuplicateStructuredLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + + final int beforeAllSize = md.getMetaDataInInputOrder().size(); + final int beforeStructuredSize = md.getIDHeaderLines().size(); + final int 
beforeFilterSize = md.getFilterLines().size(); + + // add a new, unique, structured (filter) line + final VCFFilterHeaderLine newLine = new VCFFilterHeaderLine("filterID", "unused desc"); + md.addMetaDataLine(newLine); + + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + + // now try to re-add the same structured filter line again, this second one is rejected, count remains the same + md.addMetaDataLine(newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize + 1); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize + 1); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize + 1); + Assert.assertEquals(md.getFilterHeaderLine("filterID"), newLine); + + // remove the first structured line and we're back to the original size + Assert.assertEquals(md.removeMetaDataLine(newLine), newLine); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), beforeAllSize); + Assert.assertEquals(md.getIDHeaderLines().size(), beforeStructuredSize); + Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); + } + +// @Test +// public void testAddRemoveContigLine() { +// final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); +// } + + @Test + public void testHasEquivalentHeaderLinePositive() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines sourceMetaDataLines = unitTestData.getTestMetaDataLines(); + + // for each headerLine in the set, make sure findEquivalentHeaderLine returns it + for (final VCFHeaderLine headerLine : sourceMetaDataLines.getMetaDataInInputOrder()) { + final VCFHeaderLine equivalentLine = sourceMetaDataLines.findEquivalentHeaderLine(headerLine); + Assert.assertTrue(equivalentLine.equals(headerLine)); + } + } + + @Test + public void 
testHasEquivalentHeaderLineNegative() { + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + // add a few test lines + metaDataLines.addMetaDataLine(new VCFHeaderLine("testkey1", "test value")); + metaDataLines.addMetaDataLine(new VCFHeaderLine("testkey1", "other value")); + metaDataLines.addMetaDataLine(new VCFHeaderLine("reference", "assembly37")); + + // for each other headerLine in the starting set, make another header line with the same key but a different + // value, and ensure findEquivalentHeaderLine does NOT return it + for (final VCFHeaderLine headerLine : metaDataLines.getMetaDataInInputOrder()) { + final VCFHeaderLine equivalentLine = metaDataLines.findEquivalentHeaderLine(headerLine); + Assert.assertTrue(equivalentLine.equals(headerLine)); + + final VCFHeaderLine modifiedHeaderLine = new VCFHeaderLine(headerLine.getKey(), headerLine.getValue() + "zzz"); + Assert.assertNull(metaDataLines.findEquivalentHeaderLine(modifiedHeaderLine)); + } + } + + @Test + public void testGetFilterHeaderLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getFilterHeaderLine(unitTestData.getTestFilterLines().get(0).getID()), unitTestData.getTestFilterLines().get(0)); + } + + @Test + public void testGetInfoHeaderLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getInfoHeaderLine(unitTestData.getTestInfoLines().get(0).getID()), unitTestData.getTestInfoLines().get(0)); + } + + @Test + public void testGetFormatHeaderLine() { + final VCFHeaderUnitTestData testData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = testData.getTestMetaDataLines(); + Assert.assertEquals(md.getFormatHeaderLine(testData.getTestFormatLines().get(0).getID()), testData.getTestFormatLines().get(0)); + } + + @Test + public void 
testAddRemoveVersionLine() { + final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); + final VCFMetaDataLines md = unitTestData.getTestMetaDataLines(); + Assert.assertEquals(md.getVCFVersion(), unitTestData.TEST_VERSION); + + final int originalMetaDataLineCount = md.getMetaDataInInputOrder().size(); + + // now, remove the version line, make sure the removed line is actually the version line, that the + // resulting metadataLines version is now null, and the line count drops by 1 + final VCFHeaderLine queryVersionLine = VCFHeader.makeHeaderVersionLine(unitTestData.TEST_VERSION); + final VCFHeaderLine oldVersionLine = md.removeMetaDataLine(queryVersionLine); + Assert.assertEquals(oldVersionLine, queryVersionLine); + Assert.assertNull(md.getVCFVersion()); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), originalMetaDataLineCount - 1); + + // now put it back... + md.addMetaDataLine(oldVersionLine); + Assert.assertEquals(md.getVCFVersion(), unitTestData.TEST_VERSION); + Assert.assertEquals(md.getMetaDataInInputOrder().size(), originalMetaDataLineCount); + } + + @Test + public void testAddContigLineExactDuplicate() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + final Set contigLines = new LinkedHashSet<>(); + + final VCFContigHeaderLine vcfContigLine1 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 0); + final VCFContigHeaderLine vcfContigLine2 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 1); + + contigLines.add(vcfContigLine1); + contigLines.add(vcfContigLine2); + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + // add in the duplicate line + md.addMetaDataLine(vcfContigLine1); + Assert.assertEquals(md.getContigLines(), contigLines); + } + + @Test(expectedExceptions = TribbleException.class) + public void testAddContigLineConflicting() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + + final Set contigLines 
= new LinkedHashSet<>(); + contigLines.add(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 0)); + contigLines.add(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 1)); + + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + // try to add a contg line with a duplicate index, but with a different name than the existing line with that index + md.addMetaDataLine(new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig3"); + }}, 0)); + } + + @Test + public void testRemoveAndReplaceContigLines() { + final VCFMetaDataLines md = new VCFMetaDataLines(); + final Set contigLines = new LinkedHashSet<>(); + + final VCFContigHeaderLine vcfContigLine1 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig1"); + }}, 1); + final VCFContigHeaderLine vcfContigLine2 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig2"); + }}, 2); + + contigLines.add(vcfContigLine1); + contigLines.add(vcfContigLine2); + md.addMetaDataLines(contigLines); + Assert.assertEquals(md.getContigLines(), contigLines); + + //make sure the initial contig index order is honored; it happens to be the same as the input + // order a this point, but check anyway + final List sortedLines1 = md.getContigLines(); + Assert.assertEquals(sortedLines1.get(0), vcfContigLine1); + Assert.assertEquals(sortedLines1.get(1), vcfContigLine2); + + // now remove the first contig line; only one should remain + final VCFHeaderLine removedContigLine = md.removeMetaDataLine(vcfContigLine1); + Assert.assertEquals(removedContigLine, vcfContigLine1); + final List sortedContigHeaderLines = md.getContigLines(); + Assert.assertEquals(sortedContigHeaderLines.size(), 1); + + // now add the first line back in, so the input order is different than the sorted order, + // and make sure the order is honored + md.addMetaDataLine(vcfContigLine1); + final List sortedLines2 = md.getContigLines(); + 
Assert.assertEquals(sortedLines2.get(0), vcfContigLine1); + Assert.assertEquals(sortedLines2.get(1), vcfContigLine2); + + // now add in ANOTHER contig line at the end that has an index that puts it BEFORE the existing lines + final VCFContigHeaderLine vcfContigLine3 = new VCFContigHeaderLine( + new LinkedHashMap() {{ + put("ID", "contig3"); + }}, 0); + md.addMetaDataLine(vcfContigLine3); + final List sortedLines3 = md.getContigLines(); + Assert.assertEquals(sortedLines3.size(), 3); + Assert.assertEquals(sortedLines3.get(0), vcfContigLine3); + Assert.assertEquals(sortedLines3.get(1), vcfContigLine1); + Assert.assertEquals(sortedLines3.get(2), vcfContigLine2); + } + +} + diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java new file mode 100644 index 0000000000..518f6a6928 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaHeaderLineUnitTest.java @@ -0,0 +1,44 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFMetaHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + }; + } + + private static final String META_STRING = ""; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFMetaHeaderLine vcfLine = new VCFMetaHeaderLine(META_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + 
Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFMetaHeaderLine(META_STRING, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java new file mode 100644 index 0000000000..43179c6862 --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFPedigreeHeaderLineUnitTest.java @@ -0,0 +1,50 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFPedigreeHeaderLineUnitTest extends HtsjdkTest { + + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String PEDIGREE_STRING_4_2 = "PEDIGREE="; + private static final String PEDIGREE_STRING_4_3 = "PEDIGREE="; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFPedigreeHeaderLine vcfLine = new VCFPedigreeHeaderLine( + vcfAllowedVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3) ? 
+ PEDIGREE_STRING_4_3 : + PEDIGREE_STRING_4_2, + vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFPedigreeHeaderLine(PEDIGREE_STRING_4_2, vcfAllowedVersion); + } + + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java new file mode 100644 index 0000000000..355827e27b --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFSampleHeaderLineUnitTest.java @@ -0,0 +1,43 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class VCFSampleHeaderLineUnitTest extends HtsjdkTest { + + @DataProvider(name = "allowedVCFVersions") + public Object[][] allowedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF4_0}, + {VCFHeaderVersion.VCF4_1}, + {VCFHeaderVersion.VCF4_2}, + {VCFHeaderVersion.VCF4_3} + }; + } + + @DataProvider(name = "rejectedVCFVersions") + public Object[][] rejectedVCFVersions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3}, + }; + } + + private static final String SAMPLE_STRING = "SAMPLE="; + + @Test(dataProvider="allowedVCFVersions") + public void testAllowedVersions(final VCFHeaderVersion vcfAllowedVersion) { + final VCFSampleHeaderLine vcfLine = new VCFSampleHeaderLine(SAMPLE_STRING, vcfAllowedVersion); + Assert.assertEquals("id", vcfLine.getID()); + Assert.assertEquals("desc", vcfLine.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + } + + @Test(dataProvider="rejectedVCFVersions",expectedExceptions=TribbleException.class) + public 
void testRejectedVersions(final VCFHeaderVersion vcfAllowedVersion) { + new VCFSampleHeaderLine(SAMPLE_STRING, vcfAllowedVersion); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java new file mode 100644 index 0000000000..c9f8841d3d --- /dev/null +++ b/src/test/java/htsjdk/variant/vcf/VCFSimpleHeaderLineUnitTest.java @@ -0,0 +1,151 @@ +package htsjdk.variant.vcf; + +import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import java.util.LinkedHashMap; + +public class VCFSimpleHeaderLineUnitTest extends HtsjdkTest { + + private VCFSimpleHeaderLine getStructuredHeaderLine() { + return new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put("attr1", "value1"); + put("attr2", "value2"); + }} + ); + } + + @Test + public void testConstructorFromStrings() { + final VCFSimpleHeaderLine hl = new VCFSimpleHeaderLine("testKey", "testId", "test description"); + Assert.assertEquals("testKey", hl.getKey()); + Assert.assertEquals("testId", hl.getID()); + Assert.assertEquals("test description", hl.getGenericFieldValue(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE)); + Assert.assertEquals("testKey=", hl.toStringEncoding()); + } + + @Test + public void testConstructorFromEncodedLine() { + final VCFSimpleHeaderLine hLine = new VCFSimpleHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testConstructorFromAttributeMap() { + final VCFSimpleHeaderLine hLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put("attr1", "value1"); + put("attr2", "value2"); + 
}}); + + Assert.assertEquals(hLine.getKey(), "key"); + Assert.assertEquals(hLine.getID(), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("ID"), "id"); + Assert.assertEquals(hLine.getGenericFieldValue("attr1"), "value1"); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromEncodedLine() { + new VCFSimpleHeaderLine("key", "", VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test(expectedExceptions=TribbleException.class) + public void testRejectIdMissingFromAttributeMap() { + new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("attr1", "value1"); + put("attr2", "value2"); + }}); + } + + @DataProvider(name = "violateIDRequirements") + public Object[][] getViolateIDRequirements() { + return new Object[][]{ + {""}, + {""}, + {""}, + {""} + }; + } + + @Test(dataProvider="violateIDRequirements",expectedExceptions=TribbleException.class) + public void testViolateIDRequirements(final String headerLine) { + new VCFSimpleHeaderLine("key", headerLine, VCFHeader.DEFAULT_VCF_VERSION); + } + + @Test + public void testGetID() { + Assert.assertEquals(getStructuredHeaderLine().getID(), "id"); + } + + @Test + public void testIsIDLine() { + Assert.assertTrue(getStructuredHeaderLine().isIDHeaderLine()); + } + + @Test + public void testGetGenericFieldValue() { + Assert.assertEquals(getStructuredHeaderLine().getGenericFieldValue("attr1"), "value1"); + } + + @Test + public void testStringEncoding() { + final VCFSimpleHeaderLine structuredHL = getStructuredHeaderLine(); + Assert.assertEquals(structuredHL.toStringEncoding(),"key="); + } + + @Test + public void testUnescapedQuotedStringEncoding() { + final VCFSimpleHeaderLine unescapedHeaderLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + put(VCFSimpleHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] 
filterExpression=[ANNOTATION == \"NA\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + Assert.assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + Assert.assertEquals(encodedAttributes, expectedEncoding); + } + + @Test + public void testEscapedQuotedStringEncoding() { + // test Source and Version attributes + final VCFSimpleHeaderLine unescapedHeaderLine = new VCFSimpleHeaderLine( + "key", + new LinkedHashMap() {{ + put("ID", "id"); + put(VCFSimpleHeaderLine.DESCRIPTION_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + put(VCFSimpleHeaderLine.SOURCE_ATTRIBUTE, + "filterName=[ANNOTATION] filterExpression=[ANNOTATION == \\\"NA\\\" || ANNOTATION <= 2.0]"); + }} + ); + + final String encodedAttributes = unescapedHeaderLine.toStringEncoding(); + Assert.assertNotNull(encodedAttributes); + + final String expectedEncoding = "key="; + Assert.assertEquals(encodedAttributes, expectedEncoding); + } + +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java index c9efaa59ef..45009ce211 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java @@ -31,8 +31,9 @@ import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Collections; +import java.util.LinkedHashSet; import java.util.List; +import java.util.Set; /** * Created by IntelliJ IDEA. 
@@ -188,7 +189,11 @@ public Object[][] makeRepairHeaderTest() { @Test(dataProvider = "RepairHeaderTest") public void testRepairHeaderTest(final RepairHeaderTest cfg) { - final VCFHeader toRepair = new VCFHeader(Collections.singleton((VCFHeaderLine)cfg.original)); + final Set headerLines = new LinkedHashSet<>(); + headerLines.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + headerLines.add(cfg.original); + + final VCFHeader toRepair = new VCFHeader(headerLines); final VCFHeader repaired = VCFStandardHeaderLines.repairStandardHeaderLines(toRepair); VCFCompoundHeaderLine repairedLine = (VCFCompoundHeaderLine)repaired.getFormatHeaderLine(cfg.original.getID()); diff --git a/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java b/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java index ed943feac1..5629798c61 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java @@ -1,6 +1,7 @@ package htsjdk.variant.vcf; import htsjdk.HtsjdkTest; +import htsjdk.tribble.TribbleException; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -11,45 +12,55 @@ public class VCFUtilsTest extends HtsjdkTest { @DataProvider(name="validHeaderVersionMerger") public Object[][] validHeaderMergerVersions() { - // v4.3 can only merge with v4.3, all other version mergers are allowed + + // header version must be at least v4.2 to merge, result is always highest version return new Object[][] { - {Arrays.asList("VCFv4.0", "VCFv4.0")}, - {Arrays.asList("VCFv4.1", "VCFv4.1")}, - {Arrays.asList("VCFv4.2", "VCFv4.2")}, - {Arrays.asList("VCFv4.3", "VCFv4.3")}, - {Arrays.asList("VCFv4.2", "VCFv4.2")}, - {Arrays.asList("VCFv4.2", "VCFv4.2", "VCFv4.2")}, + // headers to merge, expected result version + {Arrays.asList("VCFv4.2", "VCFv4.2"), VCFHeaderVersion.VCF4_2}, + {Arrays.asList("VCFv4.3", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.2", "VCFv4.3"), 
VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.2"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.2", "VCFv4.2"), VCFHeaderVersion.VCF4_2 }, + {Arrays.asList("VCFv4.2", "VCFv4.2", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.3", "VCFv4.2"), VCFHeaderVersion.VCF4_3}, + {Arrays.asList("VCFv4.3", "VCFv4.2", "VCFv4.3"), VCFHeaderVersion.VCF4_3}, }; } @DataProvider(name="invalidHeaderVersionMerger") public Object[][] invalidHeaderVersionMerger() { - // v4.3 can only merge with v4.3, all other version mergers are allowed + // header version must be at least v4.2 to merge return new Object[][] { - {Arrays.asList("VCFv4.0", "VCFv4.3")}, - {Arrays.asList("VCFv4.1", "VCFv4.3")}, - {Arrays.asList("VCFv4.2", "VCFv4.3")}, - {Arrays.asList("VCFv4.0", "VCFv4.0", "VCFv4.2", "VCFv4.3")}, - {Arrays.asList("VCFv4.3", "VCFv4.0", "VCFv4.1", "VCFv4.2")}, + {Arrays.asList("VCFv4.0", "VCFv4.2")}, + {Arrays.asList("VCFv4.1", "VCFv4.2")}, + {Arrays.asList("VCFv4.0", "VCFv4.1", "VCFv4.2", "VCFv4.3")}, + {Arrays.asList("VCFv4.3", "VCFv4.2", "VCFv4.1", "VCFv4.0")}, }; } @Test(dataProvider="validHeaderVersionMerger") - public void testValidHeaderVersionMerger(final List headerVersions) { - final List headersToMerge = new ArrayList<>(headerVersions.size()); - headerVersions.forEach(hv -> headersToMerge.add( - new VCFHeader(VCFHeaderVersion.toHeaderVersion(hv), Collections.emptySet(), Collections.emptySet())) - ); - final Set resultHeaders = VCFUtils.smartMergeHeaders(headersToMerge, true); + public void testValidHeaderVersionMerger(final List headerVersions, final VCFHeaderVersion expectedVersion) { + final Set mergedHeaderLines = doHeaderMerge(headerVersions); + + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(mergedHeaderLines); + final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); + Assert.assertEquals(VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), expectedVersion); } - 
@Test(dataProvider="invalidHeaderVersionMerger", expectedExceptions = IllegalArgumentException.class) + @Test(dataProvider="invalidHeaderVersionMerger", expectedExceptions = TribbleException.class) public void testInvalidHeaderVersionMerger(final List headerVersions) { + doHeaderMerge(headerVersions); + } + + private Set doHeaderMerge(final List headerVersions) { final List headersToMerge = new ArrayList<>(headerVersions.size()); headerVersions.forEach(hv -> headersToMerge.add( - new VCFHeader(VCFHeaderVersion.toHeaderVersion(hv), Collections.emptySet(), Collections.emptySet())) + new VCFHeader( + VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.toHeaderVersion(hv)), + Collections.emptySet())) ); - VCFUtils.smartMergeHeaders(headersToMerge, true); + return VCFUtils.smartMergeHeaders(headersToMerge, true); } @DataProvider(name = "caseIntolerantDoubles") diff --git a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf index a304ba24da..75c9f9b537 100644 --- a/src/test/resources/htsjdk/variant/HiSeq.10000.vcf +++ b/src/test/resources/htsjdk/variant/HiSeq.10000.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FORMAT= ##FORMAT= diff --git a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf index 9af0cb3e64..097d0b034f 100644 --- a/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf +++ b/src/test/resources/htsjdk/variant/VCF4HeaderTest.vcf @@ -9,7 +9,6 @@ ##FILTER= ##FILTER= ##FILTER= -##FILTER= ##FILTER= ##FILTER= ##FILTER= From 210adb2d9d609a9647fc3339d948f9fb8529e3b6 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 15 Nov 2021 08:42:06 -0500 Subject: [PATCH 03/12] Eliminate redundant modeling of VCFHeaderVersion in VCFHeader. 
--- .../java/htsjdk/variant/vcf/VCFHeader.java | 42 ++++++++----------- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 5 ++- .../htsjdk/variant/vcf/VCFHeaderUnitTest.java | 17 ++++---- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index 637c04c4fc..1dcb5e07f9 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -65,9 +65,6 @@ public enum HEADER_FIELDS { CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO } - // the VCF version for this header - private VCFHeaderVersion vcfHeaderVersion; - // header meta data private final VCFMetaDataLines mMetaData = new VCFMetaDataLines(); @@ -163,7 +160,7 @@ public VCFHeader(final Set metaData, final List genotypeS // lines are presented in the set, a warning will be issued, only the last one will be retained, // and the header version will be established using the last version line encountered mMetaData.addMetaDataLines(metaData); - vcfHeaderVersion = initializeHeaderVersion(); + final VCFHeaderVersion vcfHeaderVersion = initializeHeaderVersion(); mMetaData.validateMetaDataLines(vcfHeaderVersion); checkForDeprecatedGenotypeLikelihoodsKey(); @@ -180,7 +177,7 @@ public VCFHeader(final Set metaData, final List genotypeS * @return the VCFHeaderVersion for this header. 
will not be null */ public VCFHeaderVersion getVCFHeaderVersion() { - return vcfHeaderVersion; + return mMetaData.getVCFVersion(); } /** @@ -191,16 +188,12 @@ public VCFHeaderVersion getVCFHeaderVersion() { * @param headerLine header line to attempt to add */ public void addMetaDataLine(final VCFHeaderLine headerLine) { - // propagate the new line to the metadata lines object + // propagate the new line to the metadata lines object, and if the version changed, validate + // the lines against the new version + final VCFHeaderVersion oldHeaderVersion = mMetaData.getVCFVersion(); mMetaData.addMetaDataLine(headerLine); - - // update the current version in case this line triggered a version change final VCFHeaderVersion newHeaderVersion = mMetaData.getVCFVersion(); - if (!newHeaderVersion.equals(vcfHeaderVersion)) { - validateVersionTransition(vcfHeaderVersion, newHeaderVersion); - } - vcfHeaderVersion = newHeaderVersion; - headerLine.validateForVersion(vcfHeaderVersion); + validateVersionTransition(headerLine, oldHeaderVersion, newHeaderVersion); checkForDeprecatedGenotypeLikelihoodsKey(); } @@ -574,7 +567,6 @@ public boolean equals(final Object o) { if (samplesWereAlreadySorted != vcfHeader.samplesWereAlreadySorted) return false; if (writeEngineHeaders != vcfHeader.writeEngineHeaders) return false; if (writeCommandLine != vcfHeader.writeCommandLine) return false; - if (vcfHeaderVersion != vcfHeader.vcfHeaderVersion) return false; if (!mMetaData.equals(vcfHeader.mMetaData)) return false; if (mGenotypeSampleNames != null ? !mGenotypeSampleNames.equals(vcfHeader.mGenotypeSampleNames) : vcfHeader.mGenotypeSampleNames != null) @@ -588,8 +580,7 @@ public boolean equals(final Object o) { @Override public int hashCode() { - int result = vcfHeaderVersion.hashCode(); - result = 31 * result + mMetaData.hashCode(); + int result = mMetaData.hashCode(); result = 31 * result + (mGenotypeSampleNames != null ? 
mGenotypeSampleNames.hashCode() : 0); result = 31 * result + (samplesWereAlreadySorted ? 1 : 0); result = 31 * result + (sampleNamesInOrder != null ? sampleNamesInOrder.hashCode() : 0); @@ -614,26 +605,29 @@ private VCFHeaderVersion initializeHeaderVersion() { } private void validateVersionTransition( - final VCFHeaderVersion previousVersion, + final VCFHeaderLine newHeaderLine, + final VCFHeaderVersion currentVersion, final VCFHeaderVersion newVersion) { - final int compareTo = newVersion.compareTo(previousVersion); + final int compareTo = newVersion.compareTo(currentVersion); + + // We only allow going forward to a newer version, not backwards to an older one, since there + // is really no way to validate old header lines (pre vcfV4.2). If the version moved forward, + // revalidate all the lines, otherwise only validate the new header line. if (compareTo < 0) { - // We only allow going forward to a newer version, not backwards to an older one, since there - // is really no way to validate old header lines (pre vcfV4.2). The only way to create a header with - // an old version is to create it that way from the start. - // to be created with the old version from the start. 
throw new TribbleException(String.format( "When changing a header version, the new header version %s must be > the previous version %s", newVersion, - previousVersion)); + currentVersion)); } else if (compareTo > 0) { logger.debug(() -> String.format("Updating VCFHeader version from %s to %s", - previousVersion.getVersionString(), + currentVersion.getVersionString(), newVersion.getVersionString())); // the version moved forward, so validate ALL of the existing lines in the list to ensure // that the transition is valid mMetaData.validateMetaDataLines(newVersion); + } else { + newHeaderLine.validateForVersion(newVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java index 843fdf98cc..5f68a61113 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -144,10 +144,11 @@ public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) { } /** - * Validate all metadata lines except the file format line against a target version. + * Validate all metadata lines, excluding the file format line against a target version. * Throws {@link TribbleException.VersionValidationFailure} if any line is incompatible with the given version. 
* @param targetVersion the target version to validate against - * @throws TribbleException if any existing line fails to validate against {@code targetVersion} + * @throws {@link TribbleException.VersionValidationFailure} if any existing line fails to validate against + * {@code targetVersion} */ //TODO: we need to tell users how to resolve the case where this fails due to version validation //i.e, use a custom upgrade tool diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index 8ee9ccab26..b604b91899 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -515,15 +515,6 @@ public void testVersionUpgradeWithValidationFailure() { vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_3)); } - @Test(expectedExceptions = TribbleException.class) - public void testAddLineWithValidationFailure() { - // create a 4.3 header, and then try to add an old-style pedigree line (one that has no ID) - // which should cause a failure - final VCFHeader vcfHeader = new VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); - vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); - } - - @Test(expectedExceptions = TribbleException.class) public void testConstructorRequiresFileFormatLine() { final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // 4.2 header is compatible with all 4.x versions @@ -579,6 +570,14 @@ public void testAddMetaDataLineInvalidForVersion() { header.addMetaDataLine(new VCFPedigreeHeaderLine(attributes)); } + @Test(expectedExceptions = TribbleException.class) + public void testAddMetaDataLineWithValidationFailure() { + // create a 4.3 header, and then try to add an old-style pedigree line (one that has no ID) + // which should cause a failure + final VCFHeader vcfHeader = new 
VCFHeader(VCFHeader.makeHeaderVersionLineSet(VCFHeaderVersion.VCF4_3)); + vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFConstants.PEDIGREE_HEADER_KEY, "")); + } + @Test(expectedExceptions = TribbleException.class) public void testAddMetaDataLineFileFormat() { final Set metaDataSet = VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString(); // this (4.2) header is compatible with all 4.x versions From f3b9001148e2f72fd1e762716622632f5b856146 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 15 Nov 2021 11:16:20 -0500 Subject: [PATCH 04/12] Eliminate redundant modeling of file format lines in VCFMetaDataLines. --- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 131 ++++++++++-------- .../variant/vcf/VCFHeaderMergerUnitTest.java | 3 +- .../htsjdk/variant/vcf/VCFHeaderUnitTest.java | 20 +++ .../variant/vcf/VCFHeaderUnitTestData.java | 2 +- .../variant/vcf/VCFMetaDataLinesUnitTest.java | 26 +++- 5 files changed, 113 insertions(+), 69 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java index 5f68a61113..97f208e7b4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -13,16 +13,16 @@ * Class for managing the set of VCFHeaderLines maintained by a VCFHeader. * * Since this class is used to incrementally build up a set of header lines for use with a VCFHeader, - * it does not require that the list always contain a fileformat line (its VCFHeader's job to enforce + * it does not require that the list always contain a file format line (its VCFHeader's job to enforce * that condition). * * This class maintains several invariants: * - * - The list keeps track of the "current version" by tracking whether a version line (a line that - * establishes the VCFHeaderVersion, such as format/fileformat line) is contained in the list. 
If - * no version line has been added, the list will have a null current version, and contain 0 version - * lines. If a version line has been added, it will have a non-null version, and contain 1 version line. - * If the version line is manually removed, the "current version" is reset to null. + * - The "current version" of the lines is tracked by recording whether a version line (a line that + * establishes the VCFHeaderVersion, such as format/fileformat line) has been added to the list. If + * no version line has been added, the list will have a null current version; if a version line has + * been added, it will have a non-null version. If the version line is manually removed, the "current + * version" is reset to null. * * - Each contig line that is retained is guaranteed to have a unique contig index. This does * NOT guarantee that the contig indices are contiguous, or ordered, only that they are unique. @@ -34,7 +34,7 @@ * getInfoHeaderLines(), but would still be serialized on write.) * * This class does NOT validate that the lines contained are valid for the current version (that is - * the caller's responsibilty). + * the caller's responsibility). */ //Visible to allow disq Kryo registration for serialization @InternalAPI @@ -53,9 +53,10 @@ final class VCFMetaDataLines implements Serializable { private VCFHeaderVersion vcfVersion; /** - * Add all metadata lines from Set. If a duplicate line is encountered (duplicate content for - * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only - * the new line will be retained. + * Add all metadata lines from Set. If an equivalent line already exists (any existing file format + * line if the new line is an unstructured file format line; any existing identical line if the new + * line is an unstructured non-file format line; or any existing line with a duplicate key/ID pair + * if the new line is a structured line), only the new line will be retained. 
* * @param newMetaData Set of lines to be added to the list. * @throws IllegalArgumentException if a version is established or if any line fails validation for that version @@ -65,9 +66,10 @@ public void addMetaDataLines(final Set newMetaData) { } /** - * Add a metadata line to the list. If a duplicate line is encountered (duplicate content for - * unstructured lines with identical keys, or duplicate key/ID pair for structured lines), only - * the newest line will be retained. + * Add a metadata line to the list. If an equivalent line already exists (any existing file format + * line if the new line is an unstructured file format line; any existing identical line if the new + * line is an unstructured non-file format line; or any existing line with a duplicate key/ID pair + * if the new line is a structured line), only the new line will be retained. * * @param newMetaDataLine header line to attempt to add * @returns an existing (equivalent) header line that was replaced by newMetaDataLine, if any, @@ -94,25 +96,32 @@ public VCFHeaderLine addMetaDataLine(final VCFHeaderLine newMetaDataLine) { } /** - * Remove a metadata line from the list. This is the inverse of addMetaDataLine - it removes a - * line that has an identical key and value as lineToRemove if lineToRemove is an unstructured (non-ID) - * but if lineToRemove is a structured line, it will remove the line that has the same key/ID pair as - * lineToRemove, regardless of other content. + * Remove an equivalent metadata line from the list. This is the inverse of addMetaDataLine, and removes + * any equivalent line that already exists (any existing file format line if the line to be removed is + * an unstructured file format line; any existing identical line if the line to be removed is an unstructured + * non-file format line, or any existing line with a duplicate key/ID pair if the line to be removed is a + * structured line). 
* * The removed value is returned, and can be used by the caller to determine if the removed line has a * different value than the line presented. * * @param lineToRemove the header line to remove - * @return The actual headerline removed, or null of no equivalent headerline was found to remove + * @return The actual header line removed, or null if no equivalent header line was found to remove */ public VCFHeaderLine removeMetaDataLine(final VCFHeaderLine lineToRemove) { - final VCFHeaderLine removedLine = mMetaData.remove(makeKeyForLine(lineToRemove)); - if (removedLine != null) { - // only synchronize the dependent version and contig map variables if a line was ACTUALLY removed - if (VCFHeaderVersion.isFormatString(removedLine.getKey())) { + VCFHeaderLine removedLine = null; + if (VCFHeaderVersion.isFormatString(lineToRemove.getKey()) && vcfVersion != null) { + final VCFHeaderVersion versionToRemove = VCFHeaderVersion.toHeaderVersion(lineToRemove.getValue()); + if (versionToRemove.equals(vcfVersion)) { + // simulate "removal" of the line by recreating the line that we're dropping as the return value + removedLine = VCFHeader.makeHeaderVersionLine(versionToRemove); vcfVersion = null; - } else if (lineToRemove.isIDHeaderLine() && lineToRemove.getKey().equals(VCFHeader.CONTIG_KEY)) { - removeFromContigIndexMap((VCFContigHeaderLine) lineToRemove); + } + } else { + removedLine = mMetaData.remove(makeKeyForLine(lineToRemove)); + // only synchronize the dependent contig map variables if a line was ACTUALLY removed + if (removedLine != null && lineToRemove.isIDHeaderLine() && lineToRemove.getKey().equals(VCFHeader.CONTIG_KEY)) { + removeFromContigIndexMap((VCFContigHeaderLine) removedLine); } } return removedLine; @@ -128,19 +137,29 @@ public VCFHeaderVersion getVCFVersion() { /** * Return the existing line from the list that is "equivalent" to the query line, where - * equivalent is defined as having the same key and value for unstructured header lines, or the - * same
key and ID, but not necessarily the same value (for structured header lines). The - * "equivalent" line returned by this method is not guaranteed to be equal to the queryLine, - * in the case where the queryLine is an ID line. + * equivalent is defined as having the same key and value for unstructured header lines, + * or the same key and ID, but not necessarily the same value, for structured header lines. + * The "equivalent" line returned by this method is not guaranteed to be equal to the + * queryLine, in the case where the queryLine is an ID line. * - * The method is a way to ask "if the queryLine were added to this object via addMetaDataLine, what - * line, if any, would it replace". + * The method is a way to ask "if the queryLine were added to this object via addMetaDataLine, + * what line, if any, would it replace". + * + * Note that for file format (VCF version) lines, this returns an existing file format line + * if there is one, even if the key is different than the query line (since that behavior + * mirrors the behavior of addMetaDataLine and removeMetaDataLine). * * @param queryLine the source line to use to check for equivalents * @return The existing header line of the type/key provided, otherwise NULL. */ public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) { - return mMetaData.get(makeKeyForLine(queryLine)); + if (VCFHeaderVersion.isFormatString(queryLine.getKey())) { + return vcfVersion == null ? 
+ null : + VCFHeader.makeHeaderVersionLine(vcfVersion); + } else { + return mMetaData.get(makeKeyForLine(queryLine)); + } } /** @@ -183,7 +202,7 @@ public Collection getValidationErrors(final VCFHeaderVersi * @return a set of the meta data */ public Set getMetaDataInInputOrder() { - return Collections.unmodifiableSet(new LinkedHashSet<>(mMetaData.values())); + return makeMetaDataLineSet(mMetaData.values()); } /** @@ -197,7 +216,7 @@ public Set getMetaDataInSortedOrder() { // `contains` implementation based on comparator equality that can lead to inconsistent // results for header line types like VCFContigHeaderLine that have a compareTo // implementation that is inconsistent with equals. - return Collections.unmodifiableSet(new LinkedHashSet<>(new TreeSet<>(mMetaData.values()))); + return makeMetaDataLineSet(new TreeSet<>(mMetaData.values())); } /** @@ -286,7 +305,7 @@ public VCFFilterHeaderLine getFilterHeaderLine(final String id) { * VCFHeaderLine that is not a contig, info, format or filter header line. */ public Collection getOtherHeaderLines() { - return mMetaData.values().stream().filter( + return getMetaDataInInputOrder().stream().filter( hl -> !hl.getKey().equals(VCFConstants.CONTIG_HEADER_KEY) && !hl.getKey().equals(VCFConstants.INFO_HEADER_KEY) && @@ -297,31 +316,11 @@ public Collection getOtherHeaderLines() { } /** - * The version/fileformat header line if one exists, otherwise null. - * @return The version/fileformat header line if one exists, otherwise null. + * A version/fileformat header line representing the version for these lines, otherwise null. + * @return The version file format header line if a version has been established, otherwise null. */ public VCFHeaderLine getFileFormatLine() { - // find any existing version line(s). 
since there are multiple possible keys that - // represent version lines (old V3 specs used "format" instead of "fileformat") - final List existingVersionLines = mMetaData.values() - .stream() - .filter(line -> VCFHeaderVersion.isFormatString(line.getKey())) - .collect(Collectors.toList()); - - // This class doesn't mandate that the list it maintains always contains a fileformat line - // (its VCFHeader's job to maintain that condition for the header). - if (!existingVersionLines.isEmpty()) { - if (existingVersionLines.size() > 1) { - throw new IllegalStateException( - String.format("The metadata lines class contains more than one version line (%s)", - existingVersionLines.stream() - .map(VCFHeaderLine::toString) - .collect(Collectors.joining(",")))); - } - return existingVersionLines.get(0); - } else { - return null; - } + return vcfVersion == null ? null : VCFHeader.makeHeaderVersionLine(vcfVersion); } @Override @@ -469,7 +468,6 @@ private final VCFHeaderLine updateVersion(final VCFHeaderLine newMetaDataLine) { VCFHeaderVersion.isFormatString(newMetaDataLine.getKey()), "a file format line is required"); - final VCFHeaderLine currentVersionLine = getFileFormatLine(); final VCFHeaderVersion newVCFVersion = VCFHeaderVersion.toHeaderVersion(newMetaDataLine.getValue()); if (vcfVersion == null) { @@ -480,12 +478,23 @@ private final VCFHeaderLine updateVersion(final VCFHeaderLine newMetaDataLine) { vcfVersion + " to " + newVCFVersion); - removeFromMapOrThrow(currentVersionLine); } - mMetaData.put(makeKeyForLine(newMetaDataLine), newMetaDataLine); + final VCFHeaderLine oldVersionLine = getFileFormatLine(); vcfVersion = newVCFVersion; - return currentVersionLine; + return oldVersionLine; + } + + // make a new metadata line set to hand out to callers that includes + private Set makeMetaDataLineSet(final Collection orderedLines) { + if (vcfVersion != null) { + final Set orderedSet = new LinkedHashSet<>(orderedLines.size() + 1); + 
orderedSet.add(VCFHeader.makeHeaderVersionLine(vcfVersion)); + orderedSet.addAll(orderedLines); + return Collections.unmodifiableSet(orderedSet); + } else { + return Collections.unmodifiableSet(new LinkedHashSet<>(orderedLines)); + } } // composite keys used by the metadata lines map diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java index 1be8bdf085..818aae84a0 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderMergerUnitTest.java @@ -92,8 +92,7 @@ public void testMergeValidVersions(final List headerVersions, final Set mergedHeaderLines = doHeaderMergeForVersions(headerVersions); final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); metaDataLines.addMetaDataLines(mergedHeaderLines); - final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); - Assert.assertEquals(VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), expectedVersion); + Assert.assertEquals(metaDataLines.getVCFVersion(), expectedVersion); // now create a new header using the merged VersionLines, and make sure *it* has the expected version final VCFHeader mergedHeader = new VCFHeader(mergedHeaderLines); diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index b604b91899..9f51901f91 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -600,6 +600,26 @@ public void testAddMetaDataLineFileFormat() { vcfHeader.addMetaDataLine(VCFHeader.makeHeaderVersionLine(VCFHeaderVersion.VCF4_1)); } + @Test + public void testFileFormatLineFirstInSet() { + final Set orderedLineSet = new LinkedHashSet<>(); + orderedLineSet.addAll(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); + orderedLineSet.stream().forEach(l -> 
Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); + // add the file format line last + orderedLineSet.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFHeader vcfHeader = new VCFHeader(orderedLineSet, Collections.EMPTY_SET); + + final Collection inputOrderLines = vcfHeader.getMetaDataInInputOrder(); + final Optional optFirstInputOrderLine = inputOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstInputOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstInputOrderLine.get().getKey())); + + final Collection sortedOrderLines = vcfHeader.getMetaDataInSortedOrder(); + final Optional optFirstSortedOrderLine = sortedOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstSortedOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstSortedOrderLine.get().getKey())); + } + @Test public void testPreserveSequenceDictionaryAttributes() { // Round trip a SAMSequenceDictionary with attributes, through a VCFHeader, and back diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java index 7b57a19b5a..286fcecfa6 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTestData.java @@ -147,7 +147,7 @@ public static Set getV42HeaderLinesWITHOUTFormatString() { metaDataLines.addMetaDataLines(metaDataSet); final VCFHeaderLine versionLine = metaDataLines.getFileFormatLine(); Assert.assertEquals( - VCFHeaderVersion.toHeaderVersion(versionLine.getValue()), + metaDataLines.getVCFVersion(), VCFHeaderVersion.VCF4_2); // remove the 4.2 version line from the original set, verify, and return the set with no fileformat string diff --git a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java index 2e41536abe..f79331a7eb 100644 --- 
a/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFMetaDataLinesUnitTest.java @@ -172,11 +172,6 @@ public void testAddRemoveDuplicateStructuredLine() { Assert.assertEquals(md.getFilterLines().size(), beforeFilterSize); } -// @Test -// public void testAddRemoveContigLine() { -// final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); -// } - @Test public void testHasEquivalentHeaderLinePositive() { final VCFHeaderUnitTestData unitTestData = new VCFHeaderUnitTestData(); @@ -350,5 +345,26 @@ public void testRemoveAndReplaceContigLines() { Assert.assertEquals(sortedLines3.get(2), vcfContigLine2); } + @Test + public void testFileFormatLineFirstInSet() { + final Set orderedLineSet = new LinkedHashSet<>(); + orderedLineSet.addAll(VCFHeaderUnitTestData.getV42HeaderLinesWITHOUTFormatString()); + orderedLineSet.stream().forEach(l -> Assert.assertFalse(VCFHeaderVersion.isFormatString(l.getKey()))); + // add the file format line last + orderedLineSet.add(VCFHeader.makeHeaderVersionLine(VCFHeader.DEFAULT_VCF_VERSION)); + final VCFMetaDataLines metaDataLines = new VCFMetaDataLines(); + metaDataLines.addMetaDataLines(orderedLineSet); + + final Collection inputOrderLines = metaDataLines.getMetaDataInInputOrder(); + final Optional optFirstInputOrderLine = inputOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstInputOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstInputOrderLine.get().getKey())); + + final Collection sortedOrderLines = metaDataLines.getMetaDataInInputOrder(); + final Optional optFirstSortedOrderLine = sortedOrderLines.stream().findFirst(); + Assert.assertTrue(optFirstSortedOrderLine.isPresent()); + Assert.assertTrue(VCFHeaderVersion.isFormatString(optFirstSortedOrderLine.get().getKey())); + } + } From 2fe930cc6c2df9b313d717c4e520a2f1a30889ef Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 15 Nov 2021 15:16:54 -0500 Subject: [PATCH 05/12] More 
code review comments. --- src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java | 4 ++-- src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index 60eb4fc90f..7f0f255883 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -323,11 +323,11 @@ private int decodeCount(final String countString, final VCFHeaderLineCount reque // This check is here on behalf of INFO lines (which are the only header line type allowed to have Flag // type). A Flag type with a count value other than 0 violates the spec (at least v4.2 and v4.3), but // to retain backward compatibility with previous implementations, we accept (and repair) and the line here. - updateGenericField(NUMBER_ATTRIBUTE, "0"); - lineCount = 0; logger.warn(String.format("FLAG fields must have a count value of 0, but saw count %d for header line %s. 
A value of 0 will be used", lineCount, getID())); + updateGenericField(NUMBER_ATTRIBUTE, "0"); + lineCount = 0; } } else if (lineCount <= 0) { throw new TribbleException.InvalidHeader( diff --git a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java index 97e7493a6f..9709af8cc6 100644 --- a/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java +++ b/src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java @@ -83,7 +83,7 @@ public Object[][] otherHeaderLines() { { "key=<", new VCFHeaderLine("key", "<") }, // taken from Funcotator test file as ##ID= // technically, this is invalid due to the lack of an "ID" attribute, but it should still parse - // into a VCFHeaderLine (but noa VCFSimpleHeaderLine + // into a VCFHeaderLine (just not a VCFSimpleHeaderLine) { "ID=", new VCFHeaderLine("ID", "") }, }; From 88bdf78c13165a2e5d5f4c2d32ea64fa62ce1940 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 23 Nov 2021 14:28:22 -0500 Subject: [PATCH 06/12] Changes needed to port GATK over to htsjdk SAMSequenceDictionaryUtils. 
--- .../htsjdk/samtools/SAMSequenceDictionaryUtils.java | 4 ++-- .../samtools/SAMSequenceDictionaryUtilsTest.java | 11 ----------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java index 0d5073a0ba..4e2262fb26 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryUtils.java @@ -149,7 +149,7 @@ private static boolean commonContigsHaveSameLengths(Set commonContigs, S * @param dict2 * @return */ - private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + public static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { for ( String name : commonContigs ) { SAMSequenceRecord elt1 = dict1.getSequence(name); SAMSequenceRecord elt2 = dict2.getSequence(name); @@ -197,7 +197,7 @@ public static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord first * @param dict * @return */ - private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + public static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; for ( SAMSequenceRecord elt : dict.getSequences() ) { if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19, CHR1_B36, CHR1_B37) ) chr1 = elt; diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java index 7167fa8f12..51675664ba 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryUtilsTest.java @@ -210,17 +210,6 @@ public void testSequenceDictionaryComparison( final List firs dictionaryCompatibility, reportedCompatibility, testDescription)); } - @DataProvider(name = 
"StandardValidationIgnoresContigOrderData") - public Object[][] getStandardValidationIgnoresContigOrderData() { - return new Object[][] { - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19) }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHR10_HG19) }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR10_HG19, CHR2_HG19, CHR1_HG19) }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR10_HG19, CHR1_HG19) }, - - }; - } - private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { final List clonedContigs = new ArrayList<>(contigs.size()); From d93546f0d15a559d27718857e3ccd3ea1b1a3865 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Tue, 30 Nov 2021 14:18:00 -0500 Subject: [PATCH 07/12] One more code review comment fix. --- src/main/java/htsjdk/variant/vcf/VCFHeader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index 1dcb5e07f9..0c60e92fd9 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -478,7 +478,7 @@ public VCFHeaderLine getOtherHeaderLineUnique(final String key) { } else if (lineList.size() > 1) { throw new TribbleException( String.format( - "More than one \"other\" header line matches the key \"%s\". Use getOtherHeaderLines() to retrieve multiple lines:", + "More than one \"other\" header line matches the key \"%s\" (%s). Use getOtherHeaderLines() to retrieve multiple lines:", key, lineList.stream().map(VCFHeaderLine::toString).collect(Collectors.joining(",")))); } else { From f9a0c0811d27f3b4aa5d381b0f2ff7267b423272 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 6 Dec 2021 09:40:07 -0500 Subject: [PATCH 08/12] Make VCFMetaDataLines public/@InternalAPI to allow consumers access for customs serialization. 
--- src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java index 97f208e7b4..e88acf274e 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -36,9 +36,9 @@ * This class does NOT validate that the lines contained are valid for the current version (that is * the caller's responsibility). */ -//Visible to allow disq Kryo registration for serialization +//Visible to allow registration for custom serialization @InternalAPI -final class VCFMetaDataLines implements Serializable { +final public class VCFMetaDataLines implements Serializable { public static final long serialVersionUID = 1L; protected final static Log logger = Log.getInstance(VCFMetaDataLines.class); From 29c854f47e7ab3268c2f908accb5656266f7ad14 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 28 Feb 2022 10:06:41 -0500 Subject: [PATCH 09/12] Versioned header line validation framework. 
--- .../htsjdk/variant/vcf/VCFAltHeaderLine.java | 6 +- .../variant/vcf/VCFCompoundHeaderLine.java | 6 +- .../variant/vcf/VCFContigHeaderLine.java | 6 +- .../variant/vcf/VCFFilterHeaderLine.java | 2 +- .../variant/vcf/VCFFormatHeaderLine.java | 2 +- .../java/htsjdk/variant/vcf/VCFHeader.java | 6 +- .../htsjdk/variant/vcf/VCFHeaderLine.java | 78 +++++++++++-------- .../htsjdk/variant/vcf/VCFInfoHeaderLine.java | 2 +- .../htsjdk/variant/vcf/VCFMetaDataLines.java | 6 +- .../htsjdk/variant/vcf/VCFMetaHeaderLine.java | 6 +- .../variant/vcf/VCFPedigreeHeaderLine.java | 6 +- .../variant/vcf/VCFSampleHeaderLine.java | 6 +- .../variant/vcf/VCFSimpleHeaderLine.java | 36 ++++----- .../variant/vcf/VCFHeaderLineUnitTest.java | 8 +- .../vcf/VCFInfoHeaderLineUnitTest.java | 2 +- 15 files changed, 92 insertions(+), 86 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java index 37ac9874e9..d195dca6d8 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFAltHeaderLine.java @@ -22,7 +22,7 @@ public VCFAltHeaderLine(final String line, final VCFHeaderVersion version) { // Honor the requested version to choose the parser, and let validateForVersion figure out // whether that version is valid for this line (for example, if this is called with a pre-4.0 version) super(VCFConstants.ALT_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, expectedTags)); - validateForVersion(version); + validateForVersionOrThrow(version); } public VCFAltHeaderLine(final String id, final String description) { @@ -35,7 +35,7 @@ public VCFAltHeaderLine(final String id, final String description) { } @Override - public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + public Optional> validateForVersion(final VCFHeaderVersion vcfTargetVersion) { //TODO: Should we validate/constrain these to match the 4.3 spec constraints ? 
if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { final VCFValidationFailure validationFailure = new VCFValidationFailure<>( @@ -49,6 +49,6 @@ public Optional> getValidationFailure(final } } - return super.getValidationFailure(vcfTargetVersion); + return super.validateForVersion(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index 7f0f255883..c970dd7da5 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -126,7 +126,7 @@ protected VCFCompoundHeaderLine(final String key, final Map mapp final String countString = getGenericFieldValue(NUMBER_ATTRIBUTE); this.countType = decodeCountType(countString, vcfVersion); this.count = decodeCount(countString, this.countType); - validateForVersion(vcfVersion); + validateForVersionOrThrow(vcfVersion); } /** @@ -174,7 +174,7 @@ public String getVersion() { } @Override - public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + public Optional> validateForVersion(final VCFHeaderVersion vcfTargetVersion) { // The VCF 4.3 spec does not phrase this restriction as one on the form of the ID value of // INFO/FORMAT lines but instead on the INFO/FORMAT fixed field key values (c.f. section 1.6.1). 
// However, the key values correspond to INFO/FORMAT header lines defining the attribute and its type, @@ -194,7 +194,7 @@ public Optional> getValidationFailure(final } } - return super.getValidationFailure(vcfTargetVersion); + return super.validateForVersion(vcfTargetVersion); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index d8a19e2fa5..01794bd51e 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -97,7 +97,7 @@ public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, fi if (contigIndex < 0) { throw new TribbleException(String.format("The contig index (%d) is less than zero.", contigIndex)); } - validateForVersion(version); + validateForVersionOrThrow(version); } public VCFContigHeaderLine(final Map mapping, final int contigIndex) { @@ -186,7 +186,7 @@ public SAMSequenceRecord getSAMSequenceRecord() { } @Override - public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + public Optional> validateForVersion(final VCFHeaderVersion vcfTargetVersion) { if (vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { if (!VALID_CONTIG_ID_PATTERN.matcher(getID()).matches()) { return Optional.of(new VCFValidationFailure<>( @@ -196,7 +196,7 @@ public Optional> getValidationFailure(final } } - return super.getValidationFailure(vcfTargetVersion); + return super.validateForVersion(vcfTargetVersion); } public Integer getContigIndex() { diff --git a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java index 1b890db1b1..f0a601b10f 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java @@ -78,7 +78,7 @@ public VCFFilterHeaderLine(final String name) { public VCFFilterHeaderLine(final String line, final 
VCFHeaderVersion version) { super(VCFConstants.FILTER_HEADER_KEY, VCFHeaderLineTranslator.parseLine(version, line, requiredTagOrder)); validate(); - validateForVersion(version); + validateForVersionOrThrow(version); } private void validate() { diff --git a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java index fc75ee5291..6d25b0c6e4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFormatHeaderLine.java @@ -57,7 +57,7 @@ public VCFFormatHeaderLine(String line, VCFHeaderVersion version) { VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), version); validate(); - validateForVersion(version); + validateForVersionOrThrow(version); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeader.java b/src/main/java/htsjdk/variant/vcf/VCFHeader.java index 0c60e92fd9..520cc0497c 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeader.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeader.java @@ -161,7 +161,7 @@ public VCFHeader(final Set metaData, final List genotypeS // and the header version will be established using the last version line encountered mMetaData.addMetaDataLines(metaData); final VCFHeaderVersion vcfHeaderVersion = initializeHeaderVersion(); - mMetaData.validateMetaDataLines(vcfHeaderVersion); + mMetaData.validateMetaDataLinesOrThrow(vcfHeaderVersion); checkForDeprecatedGenotypeLikelihoodsKey(); if ( genotypeSampleNames.size() != new HashSet<>(genotypeSampleNames).size() ) @@ -625,9 +625,9 @@ private void validateVersionTransition( // the version moved forward, so validate ALL of the existing lines in the list to ensure // that the transition is valid - mMetaData.validateMetaDataLines(newVersion); + mMetaData.validateMetaDataLinesOrThrow(newVersion); } else { - newHeaderLine.validateForVersion(newVersion); + newHeaderLine.validateForVersionOrThrow(newVersion); } } diff --git 
a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index 94a3a0849e..3e3e03e990 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -50,10 +50,13 @@ public class VCFHeaderLine implements Comparable, Serializable { * @param key the key for this header line * @param value the value for this header line */ - public VCFHeaderLine(String key, String value) { + public VCFHeaderLine(final String key, final String value) { + final Optional validationFailure = validateAttributeName(key, "header line key"); + if (validationFailure.isPresent()) { + throw new TribbleException(validationFailure.get()); + } mKey = key; mValue = value; - validate(); } /** @@ -86,15 +89,20 @@ public String getValue() { public String getID() { return null; } /** - * Validates this header line against {@code vcfTargetVersion}. - * Subclasses can override this to provide line type-specific version validation, and the - * overrides should also call super.getValidationFailure to allow each class in the class hierarchy - * to do class-level validation. + * Validates this header line against {@code vcfTargetVersion} and returns a {@link VCFValidationFailure} + * describing the reason for the failure, if one exists. This method is used to report the reason for a + * version upgrade failure. + * + * Subclasses can override this to provide line type-specific version validation. Overrides should + * call super.validateForVersion to allow each class in the hierarchy to do class-level validation. * + * @param vcfTargetVersion * @return Optional containing a {@link VCFValidationFailure} describing validation failure if this * line fails validation, otherwise Optional.empty(). 
*/ - public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + public Optional> validateForVersion(final VCFHeaderVersion vcfTargetVersion) { + ValidationUtils.nonNull(vcfTargetVersion); + // If this header line is itself a fileformat/version line, // make sure it doesn't clash with the requested vcfTargetVersion. if (VCFHeaderVersion.isFormatString(getKey())) { @@ -124,33 +132,42 @@ public Optional> getValidationFailure(final } /** - * Validate that the header line conforms to {@code vcfTargetVersion. - * @param vcfTargetVersion + * Validate that the header line conforms to {@code vcfTargetVersion}. Throws if the line fails to + * validate for the target version. + * + * @param vcfTargetVersion the version against which to validate the line * @throws {@link TribbleException.VersionValidationFailure} if this header line fails to conform */ - public void validateForVersion(final VCFHeaderVersion vcfTargetVersion) { - final Optional> error = getValidationFailure(vcfTargetVersion); - if (error.isPresent()) { - throw new TribbleException.VersionValidationFailure(error.get().getSourceMessage()); + public void validateForVersionOrThrow(final VCFHeaderVersion vcfTargetVersion) { + ValidationUtils.nonNull(vcfTargetVersion); + final Optional> versionValidationFailure = validateForVersion(vcfTargetVersion); + if (versionValidationFailure.isPresent()) { + throw new TribbleException.VersionValidationFailure(versionValidationFailure.get().getSourceMessage()); } } /** - * Validate a string that is to be used as a unique id or key field. + * Validate a string that is to be used as a unique id or key field, and return an Optional String describing + * the validation failure. + * + * @param targetString the string to validate + * @param targetContext the context for which the {@code targetString} is used. Used when reporting validation + * failures. May not be null. 
+ * @return an Optional String containing an error message */ - protected static void validateKeyOrID(final String keyString, final String sourceName) { - ValidationUtils.nonNull(sourceName); - if (keyString == null) { - throw new TribbleException( - String.format("VCFHeaderLine: %s cannot be null or empty", sourceName)); - } - if ( keyString.contains("<") || keyString.contains(">") ) { - throw new TribbleException( - String.format("VCFHeaderLine: %s cannot contain angle brackets", sourceName)); - } - if ( keyString.contains("=") ) { - throw new TribbleException( - String.format("VCFHeaderLine: %s cannot contain an equals sign", sourceName)); + protected static Optional validateAttributeName(final String targetString, final String targetContext) { + ValidationUtils.nonNull(targetContext); + + if (targetString == null) { + return Optional.of(String.format("VCFHeaderLine: %s is null", targetContext)); + } else if (targetString.length() < 1) { + return Optional.of(String.format("VCFHeaderLine: %s has zero length", targetContext)); + } else if ( targetString.contains("<") || targetString.contains(">") ) { + return Optional.of(String.format("VCFHeaderLine: angle brackets not allowed in \"%s\" value", targetContext)); + } else if ( targetString.contains("=") ) { + return Optional.of(String.format("VCFHeaderLine: equals sign not allowed in %s value \"%s\"", targetContext, targetString)); + } else { + return Optional.empty(); } } @@ -239,13 +256,6 @@ public static String toStringEncoding(Map keyValues) { return builder.toString(); } - /** - * Validate the state of this header line. Require the key be valid as an "id". 
- */ - private void validate() { - validateKeyOrID(mKey, "key"); - } - private static String escapeQuotes(final String value) { // java escaping in a string literal makes this harder to read than it should be // without string literal escaping and quoting the regex would be: replaceAll( ([^\])" , $1\" ) diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index 12a29a1f6c..6bb2264ecf 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -67,7 +67,7 @@ public VCFInfoHeaderLine(String line, VCFHeaderVersion version) { VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder), version ); - validateForVersion(version); + validateForVersionOrThrow(version); } /** diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java index e88acf274e..3a811f84c4 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaDataLines.java @@ -171,10 +171,10 @@ public VCFHeaderLine findEquivalentHeaderLine(final VCFHeaderLine queryLine) { */ //TODO: we need to tell users how to resolve the case where this fails due to version validation //i.e, use a custom upgrade tool - public void validateMetaDataLines(final VCFHeaderVersion targetVersion) { + public void validateMetaDataLinesOrThrow(final VCFHeaderVersion targetVersion) { mMetaData.values().forEach(headerLine -> { if (!VCFHeaderVersion.isFormatString(headerLine.getKey())) { - headerLine.validateForVersion(targetVersion); + headerLine.validateForVersionOrThrow(targetVersion); } }); } @@ -190,7 +190,7 @@ public void validateMetaDataLines(final VCFHeaderVersion targetVersion) { public Collection getValidationErrors(final VCFHeaderVersion targetVersion) { return mMetaData.values().stream() .filter(line -> !VCFHeaderVersion.isFormatString(line.getKey())) - 
.map(l -> l.getValidationFailure(targetVersion)) + .map(l -> l.validateForVersion(targetVersion)) .filter(o -> o.isPresent()) .map(o -> o.get()) .collect(Collectors.toList()); diff --git a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java index d8cd83b8bb..019ab27c1f 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFMetaHeaderLine.java @@ -15,7 +15,7 @@ public VCFMetaHeaderLine(final String line, final VCFHeaderVersion version) { // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever // called with a V3 version. super(VCFConstants.META_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); - validateForVersion(version); + validateForVersionOrThrow(version); } public VCFMetaHeaderLine(final Map mapping) { @@ -23,7 +23,7 @@ public VCFMetaHeaderLine(final Map mapping) { } @Override - public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + public Optional> validateForVersion(final VCFHeaderVersion vcfTargetVersion) { if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { return Optional.of( new VCFValidationFailure<>( @@ -35,7 +35,7 @@ public Optional> getValidationFailure(final ))); } - return super.getValidationFailure(vcfTargetVersion); + return super.validateForVersion(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java index f5bd71c474..5e3b1c1748 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFPedigreeHeaderLine.java @@ -21,7 +21,7 @@ public VCFPedigreeHeaderLine(String line, VCFHeaderVersion version) { // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever // called with a V3 version. 
super(VCFConstants.PEDIGREE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); - validateForVersion(version); + validateForVersionOrThrow(version); } public VCFPedigreeHeaderLine(final Map mapping) { @@ -29,7 +29,7 @@ public VCFPedigreeHeaderLine(final Map mapping) { } @Override - public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + public Optional> validateForVersion(final VCFHeaderVersion vcfTargetVersion) { if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { // previous to VCFv4.3, the PEDIGREE line did not have an ID. Such lines are not modeled by this // class (since it is derived from VCFSimpleHeaderLine). Therefore instances of this class always @@ -45,7 +45,7 @@ public Optional> getValidationFailure(final } } - return super.getValidationFailure(vcfTargetVersion); + return super.validateForVersion(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java index 7c45e9a1b2..81c032b8ed 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSampleHeaderLine.java @@ -15,7 +15,7 @@ public VCFSampleHeaderLine(String line, VCFHeaderVersion version) { // other tags. So let validateForVersion detect any version incompatibility, ie., if this is ever // called with a V3 version. 
super(VCFConstants.SAMPLE_HEADER_KEY, new VCF4Parser().parseLine(line, expectedTagOrder)); - validateForVersion(version); + validateForVersionOrThrow(version); } public VCFSampleHeaderLine(final Map mapping) { @@ -23,7 +23,7 @@ public VCFSampleHeaderLine(final Map mapping) { } @Override - public Optional> getValidationFailure(final VCFHeaderVersion vcfTargetVersion) { + public Optional> validateForVersion(final VCFHeaderVersion vcfTargetVersion) { if (!vcfTargetVersion.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0)) { final String message = String.format("%s header lines are not allowed in VCF version %s headers", getKey(), @@ -36,7 +36,7 @@ public Optional> getValidationFailure(final } } - return super.getValidationFailure(vcfTargetVersion); + return super.validateForVersion(vcfTargetVersion); } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java index c0a3abce5c..e40a544a66 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java @@ -29,11 +29,7 @@ import htsjdk.tribble.TribbleException; import htsjdk.utils.ValidationUtils; -import java.util.ArrayList; -import java.util.Collections; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; /** @@ -75,8 +71,7 @@ public class VCFSimpleHeaderLine extends VCFHeaderLine implements VCFIDHeaderLin */ public VCFSimpleHeaderLine(final String key, final String line, final VCFHeaderVersion version) { this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrder)); - validate(); - validateForVersion(version); + validateForVersionOrThrow(version); } /** @@ -87,10 +82,10 @@ public VCFSimpleHeaderLine(final String key, final String line, final VCFHeaderV * @param description string that will be added as a "Description" tag to this line */ public VCFSimpleHeaderLine(final String key, 
final String id, final String description) { - super(key, ""); - genericFields.put(ID_ATTRIBUTE, id); - genericFields.put(DESCRIPTION_ATTRIBUTE, description); - validate(); + this(key, new LinkedHashMap() {{ + put(ID_ATTRIBUTE, id); + put(DESCRIPTION_ATTRIBUTE, description); + }}); } /** @@ -105,9 +100,18 @@ public VCFSimpleHeaderLine(final String key, final String id, final String descr */ public VCFSimpleHeaderLine(final String key, final Map attributeMapping) { super(key, ""); + ValidationUtils.nonNull(attributeMapping, "An attribute map is required for structured header lines"); genericFields.putAll(attributeMapping); - validate(); + + if ( genericFields.isEmpty() || !genericFields.keySet().stream().findFirst().get().equals(ID_ATTRIBUTE)) { + throw new TribbleException( + String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); + } + final Optional validationFailure = validateAttributeName(getGenericFieldValue(ID_ATTRIBUTE), "ID"); + if (validationFailure.isPresent()) { + throw new TribbleException(validationFailure.get()); + } } /** @@ -201,14 +205,6 @@ protected boolean getIsQuotableAttribute(final String attributeName) { attributeName.equals(VERSION_ATTRIBUTE); } - private void validate() { - if ( genericFields.isEmpty() || !genericFields.keySet().stream().findFirst().get().equals(ID_ATTRIBUTE)) { - throw new TribbleException( - String.format("The required ID tag is missing or not the first attribute: key=%s", super.getKey())); - } - validateKeyOrID(getGenericFieldValue(ID_ATTRIBUTE), "ID"); - } - // Add quotes around any attribute value that contains a space or comma, or is supposed to be quoted by // definition per the spec (i.e., Description, Source, Version for INFO lines). 
private String quoteAttributeValueForSerialization(final String attribute, final String originalValue) { diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java index d5d7e47ec9..6cc0eb6443 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineUnitTest.java @@ -100,9 +100,9 @@ public void testInvalidKeys(final String testKey) { new VCFHeaderLine(testKey, ""); } - @Test(dataProvider = "invalidHeaderLineKeys", expectedExceptions=TribbleException.class) + @Test(dataProvider = "invalidHeaderLineKeys") public void testValidateAsIdInvalid(final String testKey) { - VCFHeaderLine.validateKeyOrID(testKey, "test"); + Assert.assertTrue(VCFHeaderLine.validateAttributeName(testKey, "test").isPresent()); } @DataProvider(name = "vcfVersions") @@ -120,7 +120,7 @@ public Object[][] vcfVersions() { @Test(dataProvider = "vcfVersions") public void testValidateForVersion(final VCFHeaderVersion vcfVersion) { VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); - headerLine.validateForVersion(vcfVersion); + headerLine.validateForVersionOrThrow(vcfVersion); } @DataProvider(name = "incompatibleVersions") @@ -139,7 +139,7 @@ public Object[][] incompatibleVersionPairs() { @Test(dataProvider="incompatibleVersions", expectedExceptions= TribbleException.VersionValidationFailure.class) public void testValidateForVersionFails(final VCFHeaderVersion vcfVersion, final VCFHeaderVersion incompatibleVersion) { VCFHeaderLine headerLine = new VCFHeaderLine(vcfVersion.getFormatString(), vcfVersion.getVersionString()); - headerLine.validateForVersion(incompatibleVersion); + headerLine.validateForVersionOrThrow(incompatibleVersion); } @Test(expectedExceptions = { TribbleException.InvalidHeader.class }, expectedExceptionsMessageRegExp = ".*For fixed count, the count number must be 1 or higher.") diff --git 
a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java index 9e2a82f15a..dc556a2315 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFInfoHeaderLineUnitTest.java @@ -70,7 +70,7 @@ public void testAllow1000GKey() { ); // TODO change to VCFHeader.DEFAULT_VCF_VERSION - Assert.assertFalse(line.getValidationFailure(VCFHeaderVersion.VCF4_3).isPresent()); + Assert.assertFalse(line.validateForVersion(VCFHeaderVersion.VCF4_3).isPresent()); } @Test(dataProvider = "mergeIncompatibleInfoLines", expectedExceptions= TribbleException.class) From 570095812e65e2dff8ef2cb25c8f669837d49d41 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 28 Feb 2022 12:19:58 -0500 Subject: [PATCH 10/12] Remove obsolete variable. --- src/main/java/htsjdk/variant/vcf/VCFUtils.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFUtils.java b/src/main/java/htsjdk/variant/vcf/VCFUtils.java index 3599da7edc..71776db364 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFUtils.java +++ b/src/main/java/htsjdk/variant/vcf/VCFUtils.java @@ -48,11 +48,6 @@ public class VCFUtils { private static final Pattern INF_OR_NAN_PATTERN = Pattern.compile("^(?[-+]?)((?(INF|INFINITY))|(?NAN))$", Pattern.CASE_INSENSITIVE); - private static final boolean DEFAULT_VCF_STRICT_VERSION_VALIDATION = true; - - // a global mutable static - is there an alternative ? - // there isn't any other reasonable place to keep this state - private static boolean vcfStrictVersionValidation = true; /** * Determine if strict VCF version validation is enabled. Defaults to true. Strict version validation From 5e09eb38e2ff2bb8897795737ee6a30f91f5e058 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Thu, 1 Jun 2023 16:36:32 -0400 Subject: [PATCH 11/12] Properly handle info fields with embedded spaces by VCF version. 
--- .../java/htsjdk/variant/vcf/VCFCodec.java | 2 +- .../variant/vcf/VCFCodec42FeaturesTest.java | 19 ++++++ .../variant/vcf/VCFCodec43FeaturesTest.java | 13 +++- .../resources/htsjdk/variant/infoSpace42.vcf | 62 +++++++++++++++++++ .../htsjdk/variant/vcf43/infoSpace43.vcf | 62 +++++++++++++++++++ 5 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 src/test/resources/htsjdk/variant/infoSpace42.vcf create mode 100644 src/test/resources/htsjdk/variant/vcf43/infoSpace43.vcf diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 3ebf47c02a..844a07688a 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -104,7 +104,7 @@ protected void reportDuplicateInfoKeyValue(final String duplicateKey, final Stri * @return a mapping of keys to objects */ protected Map parseInfo(String infoField) { - if (infoField.indexOf(' ') != -1) { + if ((infoField.indexOf(' ') != -1) && !version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { generateException( String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s", version == null ? 
diff --git a/src/test/java/htsjdk/variant/vcf/VCFCodec42FeaturesTest.java b/src/test/java/htsjdk/variant/vcf/VCFCodec42FeaturesTest.java index 9f39228d5a..bbc66d58bc 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCodec42FeaturesTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCodec42FeaturesTest.java @@ -1,11 +1,15 @@ package htsjdk.variant.vcf; import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.Tuple; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.variantcontext.VariantContext; import org.testng.Assert; import org.testng.annotations.Test; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.List; public class VCFCodec42FeaturesTest extends HtsjdkTest { private static final Path TEST_PATH = Paths.get("src/test/resources/htsjdk/variant/"); @@ -21,4 +25,19 @@ public void testV42PedigreeParsing() { Assert.assertEquals(vcf42PedigreeLine.getClass(), VCFHeaderLine.class); Assert.assertEquals(vcf42PedigreeLine.getValue(), ""); } + + @Test(expectedExceptions = TribbleException.class) + public void testVCF42RejectsInfoFieldWithSpaces() { + // 1st variant has an info field with a value containing an embedded space + final Path infoSpace42File = TEST_PATH.resolve("infoSpace42.vcf"); + + try ( final VCFFileReader vcfReader = new VCFFileReader(infoSpace42File, false) ){ + for (final VariantContext vc : vcfReader) { + + } + } catch (final TribbleException e) { + Assert.assertTrue(e.getMessage().contains("Whitespace is not allowed")); + throw e; + } + } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java index 8dbf6dd30d..52f8dd9e32 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFCodec43FeaturesTest.java @@ -1,5 +1,7 @@ package htsjdk.variant.vcf; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.io.IOPath; import htsjdk.samtools.util.CloseableIterator; import 
htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.Interval; @@ -19,8 +21,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.List; +import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; @@ -203,6 +204,14 @@ public void testVCF43IndexRoundTripQuery(final Path testFile) throws IOException } } + @Test + public void testVCF43AcceptsInfoFieldWithSpaces() { + // 1st variant has an info field with a value containing an embedded space + final Path infoSpaceFile = TEST_PATH.resolve("infoSpace43.vcf"); + final Tuple> infoSpace43 = readEntireVCFIntoMemory(infoSpaceFile); + Assert.assertTrue(infoSpace43.b.get(0).getAttribute("set").toString().contains(" ")); + } + // // UTF8-specific tests // diff --git a/src/test/resources/htsjdk/variant/infoSpace42.vcf b/src/test/resources/htsjdk/variant/infoSpace42.vcf new file mode 100644 index 0000000000..089063193f --- /dev/null +++ b/src/test/resources/htsjdk/variant/infoSpace42.vcf @@ -0,0 +1,62 @@ +##fileformat=VCFv4.2 +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . 
T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredIn Both GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 diff --git a/src/test/resources/htsjdk/variant/vcf43/infoSpace43.vcf b/src/test/resources/htsjdk/variant/vcf43/infoSpace43.vcf new file mode 100644 index 0000000000..9040c6af1f --- /dev/null +++ b/src/test/resources/htsjdk/variant/vcf43/infoSpace43.vcf @@ -0,0 +1,62 @@ +##fileformat=VCFv4.3 +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19238 NA19239 NA19240 +1 327 . T <*> 666.18 GATK_STANDARD;HARD_TO_VALIDATE AB=0.74;AC=3;AF=0.50;AN=6;DB=0;DP=936;Dels=0.00;HRun=3;MQ=34.66;MQ0=728;QD=0.71;SB=-268.74;set=filteredIn Both GT:DP:GQ 1/0:10:62 1/0:37:99 1/0:53:99 From 860cab69ce791d9d5cd3e283cca5577010c3df09 Mon Sep 17 00:00:00 2001 From: Chris Norman Date: Mon, 5 Jun 2023 15:47:30 -0400 Subject: [PATCH 12/12] Remove unnecessary null test. 
--- src/main/java/htsjdk/variant/vcf/VCFCodec.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/main/java/htsjdk/variant/vcf/VCFCodec.java b/src/main/java/htsjdk/variant/vcf/VCFCodec.java index 844a07688a..b03af9f8b2 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCodec.java @@ -105,13 +105,10 @@ protected void reportDuplicateInfoKeyValue(final String duplicateKey, final Stri */ protected Map parseInfo(String infoField) { if ((infoField.indexOf(' ') != -1) && !version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { - generateException( - String.format("Whitespace is not allowed in the INFO field in VCF version %s: %s", - version == null ? - "unknown" : - version.getVersionString(), - infoField) - ); + generateException(String.format( + "Whitespace is not allowed in the INFO field in VCF version %s: %s", + version.getVersionString(), + infoField)); } return super.parseInfo(infoField); }