Skip to content

Commit

Permalink
Mn tag validation (#1448)
Browse files Browse the repository at this point in the history
* Base modifications -- validate seq length vs MN tag.   Also, in the absence of MN tag validate # of required bases from MM tag vs seq length.
* Add user preference to validation base count for MM modifications (default is true)
  • Loading branch information
jrobinso authored Dec 5, 2023
1 parent 17981f8 commit 95e90f1
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 38 deletions.
1 change: 1 addition & 0 deletions src/main/java/org/broad/igv/prefs/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ private Constants() {
public static final String BASEMOD_GROUP_BY_STRAND = "BASEMOD.GROUP_BY_STRAND";
public static final String BASEMOD_SKIPPED_BASES = "BASEMOD.SKIPPED_BASES";
public static final String SMRT_KINETICS_SHOW_OPTIONS = "SMRT_KINETICS.SHOW_OPTIONS";
public static final String BASEMOD_VALIDATE_BASE_COUNT = "BASEMOD.VALIDATE_BASE_COUNT";

// Sequence track settings
public static final String SEQUENCE_TRANSLATION_STRAND = "SEQUENCE_TRANSLATION_STRAND";
Expand Down
109 changes: 96 additions & 13 deletions src/main/java/org/broad/igv/sam/SAMAlignment.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.util.SequenceUtil;
import org.broad.igv.logging.*;
import org.broad.igv.Globals;
import org.broad.igv.feature.Strand;
Expand Down Expand Up @@ -120,21 +121,24 @@ public class SAMAlignment implements Alignment {
*/
private List<BaseModificationSet> baseModificationSets;
private SMRTKinetics smrtKinetics;
private Boolean mmValidated;

private enum CacheKey {CLIPPING_COUNTS, SA_GROUP};
private enum CacheKey {CLIPPING_COUNTS, SA_GROUP}

;

/**
* Use this to cache an expensive to compute value on this record and retrieve it as necessary.
*/
@SuppressWarnings("unchecked")
private <T> T getCachedOrCompute(CacheKey key, Supplier<T> supplier){
private <T> T getCachedOrCompute(CacheKey key, Supplier<T> supplier) {
final Object value = record.getTransientAttribute(key);
if (value == null) {
final T newValue = supplier.get();
record.setTransientAttribute(key, newValue);
return newValue;
} else {
return (T)value;
return (T) value;
}
}

Expand Down Expand Up @@ -287,9 +291,9 @@ public Cigar getCigar() {
}

@Override
public ClippingCounts getClippingCounts(){
return getCachedOrCompute(CacheKey.CLIPPING_COUNTS,
() -> ClippingCounts.fromCigar(record.getCigar()));
public ClippingCounts getClippingCounts() {
return getCachedOrCompute(CacheKey.CLIPPING_COUNTS,
() -> ClippingCounts.fromCigar(record.getCigar()));
}

public String getReadSequence() {
Expand Down Expand Up @@ -367,12 +371,23 @@ public List<BaseModificationSet> getBaseModificationSets() {

Object mm = record.hasAttribute("Mm") ? record.getAttribute("Mm") : record.getAttribute("MM");
byte[] ml = (byte[]) (record.hasAttribute("Ml") ? record.getAttribute("Ml") : record.getAttribute("ML"));
Integer mn = record.getIntegerAttribute("MN");

// Minimal tag validation
if(mm instanceof String && (ml == null || ml instanceof byte [])) {
// Minimal tag validation -- 10X uses MM and/or ML for other purposes
if (mm instanceof String && (mm.toString().length() > 0) && (ml == null || ml instanceof byte[])) {

byte[] sequence = record.getReadBases();

// Sequence length validation -- if MN tag is present use it, otherwise do a partial validation
if (mn != null) {
if (mn != record.getReadBases().length) {
return null;
}
} else if (!validateMMTag(mm.toString(), ml, sequence)) { //record.getCigarString().indexOf("H") > 0 &&
return null;
}


if (mm.toString().length() == 0) { // TODO -- more extensive validation?
baseModificationSets = Collections.EMPTY_LIST;
} else {
Expand All @@ -383,6 +398,74 @@ public List<BaseModificationSet> getBaseModificationSets() {
return baseModificationSets;
}

/**
* Minimally validate an MM tag. This will not catch all problems, but will many. Validation proceeds as follows
* 1. Validate types of MM and ML tags. This catches missues of the tags, for example in certain 10X files.
* 2. If available, validate sequence length vs MN tag.
* 3. If MN tag is not available, validate implied minimum count of base nucleotide vs actual count.
*
* @return
*/
boolean validateMMTag(String mm, byte[] ml, byte[] sequence) {

if (mmValidated != null) {
return mmValidated;
}

// Minimal tag validation -- 10X uses MM and/or ML for other purposes
if (!(mm instanceof String && mm.toString().length() > 0 && (ml == null || ml instanceof byte[]))) {
mmValidated = false;
return mmValidated;
}

// Test sequence length vs mn if avaliable
Integer mn = record.getIntegerAttribute("MN");
if (mn != null) {
if (mn == sequence.length) {
mmValidated = true;
} else {
mmValidated = false;
}
return mmValidated;
}

// Finally, test implied minimum base count vs actual base count in sequence. The minimum base count is
// equal to the number of modified bases + the number of skipped bases as codified in the MM tag.
// e.g. C+m,5,12,0 => at least 20 "Cs", 3 with modifications and 17 skipped
if (PreferencesManager.getPreferences().getAsBoolean(Constants.BASEMOD_VALIDATE_BASE_COUNT)) {
String[] mmTokens = mm.split(";");
for (String mmi : mmTokens) {
String[] tokens = mmi.split(","); //Globals.commaPattern.split(mm);
int baseCount;
if (tokens[0].charAt(0) == 'N') {
baseCount = sequence.length;
} else {
byte base = (byte) tokens[0].charAt(0);
char strand = tokens[0].charAt(1);
if (strand == '-') {
base = SequenceUtil.complement(base);
}
baseCount = 0;
for (int i = 0; i < sequence.length; i++) if (sequence[i] == base) baseCount++;
}

// Count # of bases implied by tag
int modified = tokens.length - 1; // All tokens but the first are "skip" numbers
int skipped = 0;
for (int i = 1; i < tokens.length; i++) skipped += Integer.parseInt(tokens[i]);
if (modified + skipped > baseCount) {
log.warn(this.getReadName() + " MM base count validation failed: expected " + (modified + skipped) + "'" + (tokens[0].charAt(0) + "'s" + ", actual count = " + baseCount));
mmValidated = false;
return mmValidated;
}
}
}

// If we get here assume the tag is valide
mmValidated = true;
return mmValidated;
}

public SMRTKinetics getSmrtKinetics() {
if (smrtKinetics == null) {
smrtKinetics = new SMRTKinetics(this);
Expand Down Expand Up @@ -794,7 +877,7 @@ public String getAlignmentValueString(double position, int mouseX, AlignmentTrac
// Identify the number of hard and soft clipped bases.
ClippingCounts clipping = getClippingCounts();

if (!clipping.isClipped()){
if (!clipping.isClipped()) {
buf.append("None");
} else {
if (clipping.isLeftClipped()) {
Expand Down Expand Up @@ -978,8 +1061,8 @@ private String getSupplAlignmentString() {
final List<SupplementaryAlignment> supplementaryAlignments = getSupplementaryAlignments();
final int insertionIndex = SupplementaryAlignment.getInsertionIndex(this, supplementaryAlignments);
int i = 0;
for (SupplementaryAlignment sa: supplementaryAlignments) {
if(i == insertionIndex) { //Add this read into the list
for (SupplementaryAlignment sa : supplementaryAlignments) {
if (i == insertionIndex) { //Add this read into the list
sb.append(getThisReadDescriptionForSAList());
}
i++;
Expand All @@ -989,7 +1072,7 @@ private String getSupplAlignmentString() {
sb.append("<br>* Invalid SA entry (not listed) *");
}
}
if(i == insertionIndex){
if (i == insertionIndex) {
sb.append(getThisReadDescriptionForSAList());
}
return sb.toString();
Expand Down Expand Up @@ -1211,7 +1294,7 @@ public int getHapDistance() {
return hapDistance;
}

public List<SupplementaryAlignment> getSupplementaryAlignments(){
public List<SupplementaryAlignment> getSupplementaryAlignments() {
return getCachedOrCompute(CacheKey.SA_GROUP,
() -> {
Object rawSAValue = this.getAttribute(SAMTag.SA);
Expand Down
2 changes: 2 additions & 0 deletions src/main/resources/org/broad/igv/prefs/preferences.tab
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,8 @@ BASEMOD.NONE_N_COLOR G color 0,0,255
##SMRT Kinetics
SMRT_KINETICS.SHOW_OPTIONS Show visibility options for SMRT kinetics data boolean FALSE

##Validation
BASEMOD.VALIDATE_BASE_COUNT Validate base count required by MM tag vs actual base count in read sequence boolean TRUE

#Proxy
PROXY.DISABLE_CHECK Disable check for system proxy boolean FALSE
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package org.broad.igv.sam.mods;

import htsjdk.samtools.util.CloseableIterator;
import org.broad.igv.prefs.Constants;
import org.broad.igv.prefs.PreferencesManager;
import org.broad.igv.sam.Alignment;
import org.broad.igv.sam.SAMAlignment;
import org.broad.igv.sam.reader.BAMReader;
import org.broad.igv.util.ResourceLocator;
import org.broad.igv.util.TestUtils;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
Expand All @@ -15,6 +18,11 @@

public class BaseModificationCountsTest {

@Before
public void setup() {
PreferencesManager.getPreferences().put(Constants.BASEMOD_VALIDATE_BASE_COUNT, "false");
}

@Test
public void incrementCounts() throws IOException {

Expand Down
25 changes: 0 additions & 25 deletions test/sessions/base_mods/megladon_newstyle_session.xml

This file was deleted.

0 comments on commit 95e90f1

Please sign in to comment.