Version 0.2.9

MatthiasLienhard · Jan 18, 2022 · 3ca5fd1 · 3ca5fd1
1 parent 8282222
commit 3ca5fd1
Show file tree

Hide file tree

Showing 8 changed files with 129 additions and 38 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,9 @@
 * optimize add_qc_metrics for run after new samples have been added - should not recompute everything
 
 ## [0.2.9]
-* improved assigment of reference genes in case of equal number of matching splice sites to several reference genes. 
+* added DIE test
+* adjusted classification of novel exonic TSS/PAS to ISM
+* improved assignment of reference genes in case of equal number of matching splice sites to several reference genes. 
 * added parameter to control for minimal exonic overlap to reference genes in add_sample_from_bam.
 * changed computation of direct repeats. Added wobble and max_mm parameters.
 * exposed parameters to end user in the add_qc_metrics function. 
@@ -14,7 +16,7 @@
 
 ## [0.2.8]
 * fix: version information lost when pickeling reference.
-* fix missing genen name
+* fix missing gene name
 * added pt_size parameter to plot_embedding and plot_diff_results function
 * added colors parameter to plotting functions
 * various fixes of command line script run_isotools.py

diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-0.2.9rc22
+0.2.9
diff --git a/src/isotools/_transcriptome_io.py b/src/isotools/_transcriptome_io.py
@@ -8,7 +8,7 @@
 from tqdm import tqdm
 from contextlib import ExitStack
 from .short_read import Coverage
-from ._utils import junctions_from_cigar, splice_identical, is_same_gene, overlap, pairwise, cigar_string2tuples, rc, get_intersects
+from ._utils import junctions_from_cigar, splice_identical, is_same_gene, has_overlap, get_overlap, pairwise, cigar_string2tuples, rc, get_intersects
 from .gene import Gene
 from .decorators import experimental
 import logging
@@ -84,7 +84,7 @@ def remove_samples(self, sample_names):
 
 
 def add_sample_from_bam(self, fn, sample_name=None, barcode_file=None, fuzzy_junction=5, add_chromosomes=True,
-                        min_align_fraction=.75, chimeric_mincov=2, min_exonic_ref_coverage=.5, use_satag=False, save_readnames=False, progress_bar=True,
+                        min_align_fraction=.75, chimeric_mincov=2, min_exonic_ref_coverage=.25, use_satag=False, save_readnames=False, progress_bar=True,
                         **kwargs):
     '''Imports expressed transcripts from bam and adds it to the 'Transcriptome' object.
 
@@ -579,7 +579,7 @@ def _add_novel_genes(t, novel, chrom, sa, spj_iou_th=0, reg_iou_th=.5, gene_pref
     t.infos['novel_counter'] = n_novel
 
 
-def _find_matching_gene(genes_ol, exons, min_exon_coverage=.5):
+def _find_matching_gene(genes_ol, exons, min_exon_coverage):
     '''check the splice site intersect of all overlapping genes and return
             1) the gene with most shared splice sites,
             2) names of genes that cover additional splice sites, and
@@ -629,7 +629,7 @@ def _find_matching_gene(genes_ol, exons, min_exon_coverage=.5):
             trlen = exons[0][1]-exons[0][0]
             ol = []
             for g in genes_ol:
-                ol_g = [overlap(exons[0], tr['exons'][0]) for tr in g.transcripts if len(tr['exons']) == 1]
+                ol_g = [get_overlap(exons[0], tr['exons'][0]) for tr in g.ref_transcripts if len(tr['exons']) == 1]
                 ol.append(max(ol_g, default=0))
             best_idx = np.argmax(ol)
             if ol[best_idx] >= min_exon_coverage * trlen:
@@ -1322,7 +1322,7 @@ def overlap(self, begin, end):
         except IndexError:
             logger.error('requesting interval between %s and %s, but array is allocated only until position %s', begin, end, len(self.data)*self.bin_size)
             raise
-        return (self.obj[obj_id] for obj_id in candidates if overlap((begin, end), self.obj[obj_id]))  # this assumes object has range obj[0] to obj[1]
+        return (self.obj[obj_id] for obj_id in candidates if has_overlap((begin, end), self.obj[obj_id]))  # this assumes object has range obj[0] to obj[1]
 
     def add(self, obj):
         self.obj[id(obj)] = obj

diff --git a/src/isotools/_transcriptome_stats.py b/src/isotools/_transcriptome_stats.py
@@ -7,7 +7,6 @@
 import pandas as pd
 import itertools
 
-# from ._utils import overlap
 # from .decorators import deprecated, debug, experimental
 from ._utils import _filter_function
 
@@ -132,6 +131,28 @@ def betabinom_ll(x, n, a, b):
          'proportions': proportion_test}
 
 
+def _check_groups(transcriptome, groups):
+    assert len(groups) == 2, "length of groups should be 2, but found %i" % len(groups)
+    # find groups and sample indices
+    if isinstance(groups, dict):
+        groupnames = list(groups)
+        groups = list(groups.values())
+    elif all(isinstance(gn, str) and gn in transcriptome.groups() for gn in groups):
+        groupnames = list(groups)
+        groups = [transcriptome.groups()[gn] for gn in groupnames]
+    elif all(isinstance(grp, list) for grp in groups):
+        groupnames = ['group{i+1}' for i in range(len(groups))]
+    else:
+        raise ValueError('groups not found in dataset')
+    notfound = [sa for grp in groups for sa in grp if sa not in transcriptome.samples]
+    if notfound:
+        raise ValueError(f"Cannot find the following samples: {notfound}")
+    assert not (groupnames[0] in groupnames[1] or groupnames[1] in groupnames[0]), 'group names must not be contained in other group names'
+    sa_idx = {sa: idx[0] for sa, idx in transcriptome._get_sample_idx().items()}
+    grp_idx = [[sa_idx[sa] for sa in grp] for grp in groups]
+    return groupnames, groups, grp_idx
+
+
 def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, min_sa=.51, test='auto', padj_method='fdr_bh', types=None, progress_bar=True):
     '''Performs the alternative splicing event test.
 
@@ -145,22 +166,10 @@ def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, m
     :param test: The name of one of the implemented statistical tests ('betabinom_lr','binom_lr','proportions').
     :param padj_method: Specify the method for multiple testing correction.
     :param types: Restrict the analysis on types of events. If ommited, all types are tested.'''
-    # assert len(groups) == 2 , "length of groups should be 2, but found %i" % len(groups)
-    # find groups and sample indices
-    if isinstance(groups, dict):
-        groupnames = list(groups)
-        groups = list(groups.values())
-    elif all(isinstance(gn, str) and gn in self.groups() for gn in groups):
-        groupnames = list(groups)
-        groups = [self.groups()[gn] for gn in groupnames]
-    elif all(isinstance(grp, list) for grp in groups):
-        groupnames = ['group{i+1}' for i in range(len(groups))]
-    else:
-        raise ValueError('groups not found in dataset')
-    notfound = [sa for grp in groups for sa in grp if sa not in self.samples]
-    if notfound:
-        raise ValueError(f"Cannot find the following samples: {notfound}")
-    assert not (groupnames[0] in groupnames[1] or groupnames[1] in groupnames[0]), 'group names must not be contained in other group names'
+
+    groupnames, groups, grp_idx = _check_groups(self, groups)
+    sidx = grp_idx[0] + grp_idx[1]
+
     if isinstance(test, str):
         if test == 'auto':
             test = 'betabinom_lr' if min(len(g) for g in groups[:2]) > 1 else 'proportions'
@@ -173,9 +182,7 @@ def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, m
         test_name = 'custom'
 
     logger.info('testing differential splicing for %s using %s test', ' vs '.join(f'{groupnames[i]} ({len(groups[i])})' for i in range(2)), test_name)
-    sa_idx = {sa: idx[0] for sa, idx in self._get_sample_idx().items()}
-    grp_idx = [[sa_idx[sa] for sa in grp] for grp in groups]
-    sidx = grp_idx[0] + grp_idx[1]
+
     if min_sa < 1:
         min_sa *= sum(len(gr) for gr in groups[:2])
     res = []
@@ -236,6 +243,31 @@ def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, m
     return df
 
 
+def die_test(self, groups, min_cov=25, n_isoforms=10, padj_method='fdr_bh', progress_bar=True):
+    ''' Reimplementation of the DIE test, suggested by Joglekar et al in Nat Commun 12, 463 (2021):
+    "A spatially resolved brain region- and cell type-specific isoform atlas of the postnatal mouse brain"
+
+    Syntax and parameters follow the original implementation in
+    https://github.com/noush-joglekar/scisorseqr/blob/master/inst/RScript/IsoformTest.R
+    :param groups: Dict with groupnames as keys and lists of samplenames as values, defining the two groups for the test.
+    If more then two groups are provided, test is performed between first two groups, but maximum likelihood parameters
+    (expected PSI and dispersion) will be computet for the other groups as well.
+    :param min_cov: Minimal number of reads per group for each gene.
+    :param n_isoforms: Number of isoforms to consider in the test for each gene. All additional least expressed isoforms get summarized.'''
+
+    groupnames, groups, grp_idx = _check_groups(self, groups)
+    logger.info('testing differential isoform expression (DIE) for %s.', ' vs '.join(f'{groupnames[i]} ({len(groups[i])})' for i in range(2)))
+
+    result = [(g.id, g.name, g.chrom, g.strand, g.start, g.end)+g.die_test(grp_idx, min_cov, n_isoforms) for g in self.iter_genes(progress_bar=progress_bar)]
+    result = pd.DataFrame(result, columns=['gene_id', 'gene_name', 'chrom', 'strand', 'start', 'end', 'pvalue', 'deltaPI', 'transcript_ids'])
+    mask = np.isfinite(result['pvalue'])
+    padj = np.empty(mask.shape)
+    padj.fill(np.nan)
+    padj[mask] = multi.multipletests(result.loc[mask, 'pvalue'], method=padj_method)[1]
+    result.insert(6, 'padj', padj)
+    return result
+
+
 def alternative_splicing_events(self, min_total=100, min_alt_fraction=.1, samples=None, region=None, query=None, progress_bar=False):
     '''Finds alternative splicing events.
 

diff --git a/src/isotools/_utils.py b/src/isotools/_utils.py
@@ -117,7 +117,7 @@ def splice_identical(tr1, tr2):
     # all splice sites are equal
     if len(tr1) != len(tr2):  # different number of exons
         return False
-    if len(tr1) == 1 and overlap(tr1[0], tr2[0]):  # single exon genes
+    if len(tr1) == 1 and has_overlap(tr1[0], tr2[0]):  # single exon genes
         return True
     if tr1[0][1] != tr2[0][1] or tr1[-1][0] != tr2[-1][0]:  # check first and last exons
         return False
@@ -127,7 +127,7 @@ def splice_identical(tr1, tr2):
     return True
 
 
-def overlap(r1, r2):
+def has_overlap(r1, r2):
     "check the overlap of two intervals"
     # assuming start < end
     if r1[1] < r2[0] or r2[1] < r1[0]:
@@ -136,6 +136,12 @@ def overlap(r1, r2):
         return True
 
 
+def get_overlap(r1, r2):
+    "check the overlap of two intervals"
+    # assuming start < end
+    return min(0, min(r1[1], r2[1]) - max(r1[0], r2[0]))
+
+
 def get_intersects(tr1, tr2):
     "get the number of intersecting splice sites and intersecting bases of two transcripts"
     tr1_enum = enumerate(tr1)
@@ -151,7 +157,7 @@ def get_intersects(tr1, tr2):
                 sjintersect += 1
             if tr2_exon[1] == tr1_exon[1] and i < len(tr2)-1 and j < len(tr1)-1:
                 sjintersect += 1
-            if overlap(tr1_exon, tr2_exon):
+            if has_overlap(tr1_exon, tr2_exon):
                 # region intersect
                 i_end = min(tr1_exon[1], tr2_exon[1])
                 i_start = max(tr1_exon[0], tr2_exon[0])

diff --git a/src/isotools/gene.py b/src/isotools/gene.py
@@ -1,5 +1,6 @@
 from intervaltree import Interval
 from collections.abc import Iterable
+from scipy.stats import chi2_contingency
 from Bio.Seq import Seq
 import numpy as np
 import copy
@@ -415,6 +416,56 @@ def copy(self):
         'Returns a shallow copy of self.'
         return self.__copy__()
 
+    def die_test(self, groups, min_cov=25, n_isoforms=10):
+        ''' Reimplementation of the DIE test, suggested by Joglekar et al in Nat Commun 12, 463 (2021):
+        "A spatially resolved brain region- and cell type-specific isoform atlas of the postnatal mouse brain"
+
+        Syntax and parameters follow the original implementation in
+        https://github.com/noush-joglekar/scisorseqr/blob/master/inst/RScript/IsoformTest.R
+        :param groups: Define the columns for the groups.
+        :param min_cov: Minimal number of reads per group for the gene.
+        :param n_isoforms: Number of isoforms to consider in the test for the gene. All additional least expressed isoforms get summarized.'''
+        # select the samples and sum the group counts
+        try:
+            # Fast mode when testing several genes
+            cov = np.array([self.coverage[grp].sum(0) for grp in groups]).T
+        except IndexError:
+            # Fall back to looking up the sample indices
+            from isotools._transcriptome_stats import _check_groups
+            _, _, groups = _check_groups(self._transcriptome, groups)
+            cov = np.array([self.coverage[grp].sum(0) for grp in groups]).T
+
+        if np.any(cov.sum(0) < min_cov):
+            return np.nan, np.nan, []
+        # if there are more than 'numIsoforms' isoforms of the gene, all additional least expressed get summarized.
+        if cov.shape[0] > n_isoforms:
+            idx = np.argpartition(-cov.sum(1), n_isoforms)
+
+            additional = cov[idx[n_isoforms:]].sum(0)
+            cov = cov[idx[:n_isoforms]]
+            cov[n_isoforms-1] += additional
+            idx[n_isoforms-1] = -1  # this isoform gets all other - I give it index
+        elif cov.shape[0] < 2:
+            return np.nan, np.nan, []
+        else:
+            idx = np.array(range(cov.shape[0]))
+        try:
+            _, pval, _, _ = chi2_contingency(cov)
+        except ValueError:
+            logger.debug(f'chi2_contingency({cov})')
+            raise
+        iso_frac = cov/cov.sum(0)
+        deltaPI = iso_frac[..., 0]-iso_frac[..., 1]
+        order = np.argsort(deltaPI)
+        pos_idx = [order[-i] for i in range(1, 3) if deltaPI[order[-i]] > 0]
+        neg_idx = [order[i] for i in range(2) if deltaPI[order[i]] < 0]
+        deltaPI_pos = deltaPI[pos_idx].sum()
+        deltaPI_neg = deltaPI[neg_idx].sum()
+        if deltaPI_pos > -deltaPI_neg:
+            return pval, deltaPI_pos, idx[pos_idx]
+        else:
+            return pval, deltaPI_neg, idx[neg_idx]
+
 
 def _coding_len(exons, cds):
     coding_len = [0, 0, 0]

diff --git a/src/isotools/splice_graph.py b/src/isotools/splice_graph.py
@@ -1,7 +1,7 @@
 import numpy as np
 import logging
 from sortedcontainers import SortedDict  # for SpliceGraph
-from ._utils import pairwise, overlap
+from ._utils import pairwise, has_overlap
 from .decorators import deprecated, experimental
 from typing import Union
 
@@ -284,7 +284,7 @@ def _check_exon(self, j1, j2, is_first, is_reverse, e, e2=None):
             else:
                 altsplice = {'novel exon': [e]}
             j2 = j1
-        elif (is_first and is_last):  # mono-exon
+        elif (is_first and is_last):  # mono-exon TODO: add info wether there is a ref mono exon?
             altsplice['mono-exon'] = []
             category = 1
         else:  # check splice sites
@@ -298,7 +298,7 @@ def _check_exon(self, j1, j2, is_first, is_reverse, e, e2=None):
                 elif self[j1][0] > e[0] and not any(j in self._tss for j in range(j1, j2 + 1)):  # exon start is intronic in ref
                     site = 'PAS' if is_reverse else 'TSS'
                     altsplice.setdefault(f'novel exonic {site}', []).append((e[0], self[j1][0]))
-                    category = max(2, category)
+                    category = max(1, category)
             if self[j2][1] != e[1]:  # second splice site missmatch
                 if not is_last:
                     # pos="intronic" if self[j2][1]<e[1] else "exonic"
@@ -310,7 +310,7 @@ def _check_exon(self, j1, j2, is_first, is_reverse, e, e2=None):
                 elif self[j2][1] < e[1] and not any(j in self._pas for j in range(j1, j2 + 1)):  # exon end is intronic in ref & not overlapping tss
                     site = 'TSS' if is_reverse else 'PAS'
                     altsplice.setdefault(f'novel exonic {site}', []).append((self[j2][1], e[1]))
-                    category = max(2, category)
+                    category = max(1, category)
 
         # find intron retentions
         if j1 < j2 and any(self[ji + 1].start - self[ji].end > 0 for ji in range(j1, j2)):
@@ -519,14 +519,14 @@ def get_exon_support_matrix(self, exons):
         esm = np.zeros((len(self._tss), len(exons)), np.bool)  # the intron support matrix
         for tr_nr, tss in enumerate(self._tss):  # check overlap of first exon
             for j in range(tss, len(self)):
-                if overlap(self[j], exons[0]):
+                if has_overlap(self[j], exons[0]):
                     esm[tr_nr, 0] = True
                 elif self[j].suc.get(tr_nr, None) == j + 1 and j - 1 < len(self) and self[j].end == self[j + 1].start:
                     continue
                 break
         for tr_nr, pas in enumerate(self._pas):  # check overlap of last exon
             for j in range(pas, -1, -1):
-                if overlap(self[j], exons[-1]):
+                if has_overlap(self[j], exons[-1]):
                     esm[tr_nr, -1] = True
                 elif self[j].pre.get(tr_nr, None) == j - 1 and j > 0 and self[j].start == self[j - 1].end:
                     continue

diff --git a/src/isotools/transcriptome.py b/src/isotools/transcriptome.py
@@ -235,7 +235,7 @@ def __iter__(self):
     from ._transcriptome_filter import add_qc_metrics, add_filter, remove_filter, iter_genes, iter_transcripts, iter_ref_transcripts
 
     # statistic: differential splicing, alternative_splicing_events
-    from ._transcriptome_stats import altsplice_test, alternative_splicing_events, rarefaction
+    from ._transcriptome_stats import die_test, altsplice_test, alternative_splicing_events, rarefaction
 
     # statistic: summary tables (can be used as input to plot_bar / plot_dist)
     from ._transcriptome_stats import altsplice_stats, filter_stats, transcript_length_hist, transcript_coverage_hist,\