Skip to content

Commit

Permalink
Fix variant region for ins and dup on intron-exon boundary
Browse files Browse the repository at this point in the history
  • Loading branch information
b0d0nne11 committed Sep 13, 2024
1 parent e6dbd1e commit e406e44
Show file tree
Hide file tree
Showing 8 changed files with 252 additions and 48 deletions.
15 changes: 12 additions & 3 deletions src/hgvs/assemblymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,24 @@ def t_to_p(self, var_t):
)

def c_to_n(self, var_c):
var_out = super(AssemblyMapper, self).c_to_n(var_c)
alt_ac = self._alt_ac_for_tx_ac(var_c.ac)
var_out = super(AssemblyMapper, self).c_to_n(
var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def n_to_c(self, var_n):
var_out = super(AssemblyMapper, self).n_to_c(var_n)
alt_ac = self._alt_ac_for_tx_ac(var_n.ac)
var_out = super(AssemblyMapper, self).n_to_c(
var_n, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def c_to_p(self, var_c):
var_out = super(AssemblyMapper, self).c_to_p(var_c)
alt_ac = self._alt_ac_for_tx_ac(var_c.ac)
var_out = super(AssemblyMapper, self).c_to_p(
var_c, alt_ac=alt_ac, alt_aln_method=self.alt_aln_method
)
return self._maybe_normalize(var_out)

def relevant_transcripts(self, var_g):
Expand Down
4 changes: 2 additions & 2 deletions src/hgvs/sequencevariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __repr__(self):
", ".join((a.name + "=" + str(getattr(self, a.name))) for a in self.__attrs_attrs__),
)

def fill_ref(self, hdp):
def fill_ref(self, hdp, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
# TODO: Refactor. SVs should not operate on themselves when
# external resources are required
# replace_reference should be moved outside function
Expand All @@ -67,7 +67,7 @@ def fill_ref(self, hdp):
type in ["del", "delins", "identity", "dup", "repeat"]
and self.posedit.edit.ref_s is None
):
vm._replace_reference(self)
vm._replace_reference(self, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
if type == "identity" and isinstance(self.posedit.edit, hgvs.edit.NARefAlt):
self.posedit.edit.alt = self.posedit.edit.ref
return self
Expand Down
27 changes: 22 additions & 5 deletions src/hgvs/utils/altseqbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import logging
import math

from hgvs.exceptions import HGVSError

import six
from bioutils.sequences import reverse_complement, translate_cds

Expand Down Expand Up @@ -195,8 +197,20 @@ def _get_variant_region(self):
and self._var_c.posedit.pos.end.datum == Datum.CDS_END
):
result = self.WHOLE_GENE
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0:
# ins at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "ins" and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1:
# ins at exon-intron boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.end.offset == -1:
# dup at intron-exon boundary
result = self.EXON
elif self._var_c.posedit.edit.type == "dup" and self._var_c.posedit.pos.start.offset == 1:
# dup at exon-intron boundary
result = self.EXON
elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0:
# leave out anything intronic for now
# leave out anything else intronic for now
result = self.INTRON
else: # anything else that contains an exon
result = self.EXON
Expand Down Expand Up @@ -266,7 +280,10 @@ def _incorporate_dup(self):
"""Incorporate dup into sequence"""
seq, cds_start, cds_stop, start, end = self._setup_incorporate()

dup_seq = seq[start:end]
if not self._var_c.posedit.edit.ref:
raise HGVSError('Duplication variant is missing reference sequence')

dup_seq = self._var_c.posedit.edit.ref
seq[end:end] = dup_seq

is_frameshift = len(dup_seq) % 3 != 0
Expand Down Expand Up @@ -328,10 +345,10 @@ def _setup_incorporate(self):
if pos.base < 0: # 5' UTR
result = cds_start - 1
else: # cds/intron
if pos.offset <= 0:
result = (cds_start - 1) + pos.base - 1
if pos.offset < 0:
result = (cds_start - 1) + pos.base - 2
else:
result = (cds_start - 1) + pos.base
result = (cds_start - 1) + pos.base - 1
elif pos.datum == Datum.CDS_END: # 3' UTR
result = cds_stop + pos.base - 1
else:
Expand Down
85 changes: 58 additions & 27 deletions src/hgvs/variantmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def t_to_g(self, var_t, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a c. or n. variant; got " + str(var_t))
if self._validator:
self._validator.validate(var_t)
var_t.fill_ref(self.hdp)
var_t.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
if var_t.type == "c":
var_out = VariantMapper.c_to_g(
self, var_c=var_t, alt_ac=alt_ac, alt_aln_method=alt_aln_method
Expand Down Expand Up @@ -212,7 +212,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
Expand All @@ -234,7 +234,7 @@ def n_to_g(self, var_n, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g)
)
if self.replace_reference:
self._replace_reference(var_g)
self._replace_reference(var_g, alt_ac, alt_aln_method)
# No gene symbol for g. variants (actually, *should* for NG, but no way to distinguish)
return var_g

Expand Down Expand Up @@ -306,7 +306,7 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
Expand All @@ -331,12 +331,12 @@ def c_to_g(self, var_c, alt_ac, alt_aln_method=hgvs.global_config.mapping.alt_al
ac=alt_ac, type="g", posedit=hgvs.posedit.PosEdit(pos_g, edit_g)
)
if self.replace_reference:
self._replace_reference(var_g)
self._replace_reference(var_g, alt_ac, alt_aln_method)
return var_g

# ############################################################################
# c⟷n
def c_to_n(self, var_c):
def c_to_n(self, var_c, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed c. variant, return a n. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
Expand All @@ -351,7 +351,7 @@ def c_to_n(self, var_c):
raise HGVSInvalidVariantError("Expected a cDNA (c.); got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_c.ac, alt_ac=var_c.ac, alt_aln_method="transcript"
)
Expand All @@ -370,12 +370,12 @@ def c_to_n(self, var_c):
ac=var_c.ac, type="n", posedit=hgvs.posedit.PosEdit(pos_n, edit_n)
)
if self.replace_reference:
self._replace_reference(var_n)
self._replace_reference(var_n, alt_ac, alt_aln_method)
if self.add_gene_symbol:
self._update_gene_symbol(var_n, var_c.gene)
return var_n

def n_to_c(self, var_n):
def n_to_c(self, var_n, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""Given a parsed n. variant, return a c. variant on the specified
transcript using the specified alignment method (default is
"transcript" indicating a self alignment).
Expand All @@ -390,7 +390,7 @@ def n_to_c(self, var_n):
raise HGVSInvalidVariantError("Expected n. variant; got " + str(var_n))
if self._validator:
self._validator.validate(var_n)
var_n.fill_ref(self.hdp)
var_n.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
mapper = self._fetch_AlignmentMapper(
tx_ac=var_n.ac, alt_ac=var_n.ac, alt_aln_method="transcript"
)
Expand All @@ -409,14 +409,14 @@ def n_to_c(self, var_n):
ac=var_n.ac, type="c", posedit=hgvs.posedit.PosEdit(pos_c, edit_c)
)
if self.replace_reference:
self._replace_reference(var_c)
self._replace_reference(var_c, alt_ac, alt_aln_method)
if self.add_gene_symbol:
self._update_gene_symbol(var_c, var_n.gene)
return var_c

# ############################################################################
# c ⟶ p
def c_to_p(self, var_c, pro_ac=None):
def c_to_p(self, var_c, pro_ac=None, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""
Converts a c. SequenceVariant to a p. SequenceVariant on the specified protein accession
Author: Rudy Rico
Expand All @@ -431,6 +431,7 @@ def c_to_p(self, var_c, pro_ac=None):
raise HGVSInvalidVariantError("Expected a cDNA (c.) variant; got " + str(var_c))
if self._validator:
self._validator.validate(var_c)
var_c.fill_ref(self.hdp, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac)
builder = altseqbuilder.AltSeqBuilder(var_c, reference_data)

Expand All @@ -455,7 +456,7 @@ def c_to_p(self, var_c, pro_ac=None):
############################################################################
# Internal methods

def _replace_reference(self, var):
def _replace_reference(self, var, alt_ac=None, alt_aln_method=hgvs.global_config.mapping.alt_aln_method):
"""fetch reference sequence for variant and update (in-place) if necessary"""

if var.type not in "cgmnr":
Expand All @@ -465,24 +466,51 @@ def _replace_reference(self, var):
# these types have no reference sequence (zero-width), so return as-is
return var

pos = var.posedit.pos
if (isinstance(pos.start, hgvs.location.BaseOffsetPosition) and pos.start.offset != 0) or (
isinstance(pos.end, hgvs.location.BaseOffsetPosition) and pos.end.offset != 0
mapper = None
if (
var.type in "cnr"
and var.posedit.pos is not None
and (var.posedit.pos.start.offset != 0 or var.posedit.pos.end.offset != 0)
):
_logger.info("Can't update reference sequence for intronic variant {}".format(var))
return var

# For c. variants, we need coords on underlying sequences
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript"
)
pos = mapper.c_to_n(var.posedit.pos)
if var.type == "r":
_logger.info("Can't update reference sequence for intronic variant %s", var)
return var
if alt_ac is None:
_logger.info("Can't update reference sequence for intronic variant %s without alt_ac", var)
return var
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
pos = mapper.c_to_g(var.posedit.pos)
ac = alt_ac
_type = "g"
if var.type == "n":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
)
pos = mapper.n_to_g(var.posedit.pos)
ac = alt_ac
_type = "g"
else:
pos = var.posedit.pos
# For c. variants, we need coords on underlying sequences
if var.type == "c":
mapper = self._fetch_AlignmentMapper(
tx_ac=var.ac, alt_ac=var.ac, alt_aln_method="transcript"
)
pos = mapper.c_to_n(var.posedit.pos)
ac = var.ac
_type = var.type
else:
pos = var.posedit.pos
ac = var.ac
_type = var.type

seq_start = pos.start.base - 1
seq_end = pos.end.base
if _type in "cnr":
seq_start += pos.start.offset
seq_end += pos.end.offset

# When strict_bounds is False and an error occurs, return
# variant as-is
Expand All @@ -491,12 +519,15 @@ def _replace_reference(self, var):
# this is an out-of-bounds variant
return var

seq = self.hdp.get_seq(var.ac, seq_start, seq_end)
seq = self.hdp.get_seq(ac, seq_start, seq_end)

if len(seq) != seq_end - seq_start:
# tried to read beyond seq end; this is an out-of-bounds variant
return var

if _type == "g" and mapper and mapper.strand == -1:
seq = reverse_complement(seq)

edit = var.posedit.edit
if edit.ref != seq:
_logger.debug(
Expand Down
20 changes: 10 additions & 10 deletions tests/data/sanity_cp.tsv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
accession transcript_sequence cds_start_i cds_end_i
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40
accession transcript_sequence cds_start_i cds_end_i lengths
NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 [10,25,30]
NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 [10,25,30]
NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 [10,25,30]
NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 [10,25,30]
NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 [10,25,30]
NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 [10,25,30]
Loading

0 comments on commit e406e44

Please sign in to comment.