Skip to content

Commit

Permalink
Version 0.2.9
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthias Lienhard committed Jan 18, 2022
1 parent 8282222 commit 3ca5fd1
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 38 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
* optimize add_qc_metrics for run after new samples have been added - should not recompute everything

## [0.2.9]
* improved assigment of reference genes in case of equal number of matching splice sites to several reference genes.
* added DIE test
* adjusted classification of novel exonic TSS/PAS to ISM
* improved assignment of reference genes in case of equal number of matching splice sites to several reference genes.
* added parameter to control for minimal exonic overlap to reference genes in add_sample_from_bam.
* changed computation of direct repeats. Added wobble and max_mm parameters.
* exposed parameters to end user in the add_qc_metrics function.
Expand All @@ -14,7 +16,7 @@

## [0.2.8]
* fix: version information lost when pickeling reference.
* fix missing genen name
* fix missing gene name
* added pt_size parameter to plot_embedding and plot_diff_results function
* added colors parameter to plotting functions
* various fixes of command line script run_isotools.py
Expand Down
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.9rc22
0.2.9
10 changes: 5 additions & 5 deletions src/isotools/_transcriptome_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from tqdm import tqdm
from contextlib import ExitStack
from .short_read import Coverage
from ._utils import junctions_from_cigar, splice_identical, is_same_gene, overlap, pairwise, cigar_string2tuples, rc, get_intersects
from ._utils import junctions_from_cigar, splice_identical, is_same_gene, has_overlap, get_overlap, pairwise, cigar_string2tuples, rc, get_intersects
from .gene import Gene
from .decorators import experimental
import logging
Expand Down Expand Up @@ -84,7 +84,7 @@ def remove_samples(self, sample_names):


def add_sample_from_bam(self, fn, sample_name=None, barcode_file=None, fuzzy_junction=5, add_chromosomes=True,
min_align_fraction=.75, chimeric_mincov=2, min_exonic_ref_coverage=.5, use_satag=False, save_readnames=False, progress_bar=True,
min_align_fraction=.75, chimeric_mincov=2, min_exonic_ref_coverage=.25, use_satag=False, save_readnames=False, progress_bar=True,
**kwargs):
'''Imports expressed transcripts from bam and adds it to the 'Transcriptome' object.
Expand Down Expand Up @@ -579,7 +579,7 @@ def _add_novel_genes(t, novel, chrom, sa, spj_iou_th=0, reg_iou_th=.5, gene_pref
t.infos['novel_counter'] = n_novel


def _find_matching_gene(genes_ol, exons, min_exon_coverage=.5):
def _find_matching_gene(genes_ol, exons, min_exon_coverage):
'''check the splice site intersect of all overlapping genes and return
1) the gene with most shared splice sites,
2) names of genes that cover additional splice sites, and
Expand Down Expand Up @@ -629,7 +629,7 @@ def _find_matching_gene(genes_ol, exons, min_exon_coverage=.5):
trlen = exons[0][1]-exons[0][0]
ol = []
for g in genes_ol:
ol_g = [overlap(exons[0], tr['exons'][0]) for tr in g.transcripts if len(tr['exons']) == 1]
ol_g = [get_overlap(exons[0], tr['exons'][0]) for tr in g.ref_transcripts if len(tr['exons']) == 1]
ol.append(max(ol_g, default=0))
best_idx = np.argmax(ol)
if ol[best_idx] >= min_exon_coverage * trlen:
Expand Down Expand Up @@ -1322,7 +1322,7 @@ def overlap(self, begin, end):
except IndexError:
logger.error('requesting interval between %s and %s, but array is allocated only until position %s', begin, end, len(self.data)*self.bin_size)
raise
return (self.obj[obj_id] for obj_id in candidates if overlap((begin, end), self.obj[obj_id])) # this assumes object has range obj[0] to obj[1]
return (self.obj[obj_id] for obj_id in candidates if has_overlap((begin, end), self.obj[obj_id])) # this assumes object has range obj[0] to obj[1]

def add(self, obj):
self.obj[id(obj)] = obj
Expand Down
72 changes: 52 additions & 20 deletions src/isotools/_transcriptome_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pandas as pd
import itertools

# from ._utils import overlap
# from .decorators import deprecated, debug, experimental
from ._utils import _filter_function

Expand Down Expand Up @@ -132,6 +131,28 @@ def betabinom_ll(x, n, a, b):
'proportions': proportion_test}


def _check_groups(transcriptome, groups):
assert len(groups) == 2, "length of groups should be 2, but found %i" % len(groups)
# find groups and sample indices
if isinstance(groups, dict):
groupnames = list(groups)
groups = list(groups.values())
elif all(isinstance(gn, str) and gn in transcriptome.groups() for gn in groups):
groupnames = list(groups)
groups = [transcriptome.groups()[gn] for gn in groupnames]
elif all(isinstance(grp, list) for grp in groups):
groupnames = ['group{i+1}' for i in range(len(groups))]
else:
raise ValueError('groups not found in dataset')
notfound = [sa for grp in groups for sa in grp if sa not in transcriptome.samples]
if notfound:
raise ValueError(f"Cannot find the following samples: {notfound}")
assert not (groupnames[0] in groupnames[1] or groupnames[1] in groupnames[0]), 'group names must not be contained in other group names'
sa_idx = {sa: idx[0] for sa, idx in transcriptome._get_sample_idx().items()}
grp_idx = [[sa_idx[sa] for sa in grp] for grp in groups]
return groupnames, groups, grp_idx


def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, min_sa=.51, test='auto', padj_method='fdr_bh', types=None, progress_bar=True):
'''Performs the alternative splicing event test.
Expand All @@ -145,22 +166,10 @@ def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, m
:param test: The name of one of the implemented statistical tests ('betabinom_lr','binom_lr','proportions').
:param padj_method: Specify the method for multiple testing correction.
:param types: Restrict the analysis on types of events. If ommited, all types are tested.'''
# assert len(groups) == 2 , "length of groups should be 2, but found %i" % len(groups)
# find groups and sample indices
if isinstance(groups, dict):
groupnames = list(groups)
groups = list(groups.values())
elif all(isinstance(gn, str) and gn in self.groups() for gn in groups):
groupnames = list(groups)
groups = [self.groups()[gn] for gn in groupnames]
elif all(isinstance(grp, list) for grp in groups):
groupnames = ['group{i+1}' for i in range(len(groups))]
else:
raise ValueError('groups not found in dataset')
notfound = [sa for grp in groups for sa in grp if sa not in self.samples]
if notfound:
raise ValueError(f"Cannot find the following samples: {notfound}")
assert not (groupnames[0] in groupnames[1] or groupnames[1] in groupnames[0]), 'group names must not be contained in other group names'

groupnames, groups, grp_idx = _check_groups(self, groups)
sidx = grp_idx[0] + grp_idx[1]

if isinstance(test, str):
if test == 'auto':
test = 'betabinom_lr' if min(len(g) for g in groups[:2]) > 1 else 'proportions'
Expand All @@ -173,9 +182,7 @@ def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, m
test_name = 'custom'

logger.info('testing differential splicing for %s using %s test', ' vs '.join(f'{groupnames[i]} ({len(groups[i])})' for i in range(2)), test_name)
sa_idx = {sa: idx[0] for sa, idx in self._get_sample_idx().items()}
grp_idx = [[sa_idx[sa] for sa in grp] for grp in groups]
sidx = grp_idx[0] + grp_idx[1]

if min_sa < 1:
min_sa *= sum(len(gr) for gr in groups[:2])
res = []
Expand Down Expand Up @@ -236,6 +243,31 @@ def altsplice_test(self, groups, min_total=100, min_alt_fraction=.1, min_n=10, m
return df


def die_test(self, groups, min_cov=25, n_isoforms=10, padj_method='fdr_bh', progress_bar=True):
''' Reimplementation of the DIE test, suggested by Joglekar et al in Nat Commun 12, 463 (2021):
"A spatially resolved brain region- and cell type-specific isoform atlas of the postnatal mouse brain"
Syntax and parameters follow the original implementation in
https://github.com/noush-joglekar/scisorseqr/blob/master/inst/RScript/IsoformTest.R
:param groups: Dict with groupnames as keys and lists of samplenames as values, defining the two groups for the test.
If more then two groups are provided, test is performed between first two groups, but maximum likelihood parameters
(expected PSI and dispersion) will be computet for the other groups as well.
:param min_cov: Minimal number of reads per group for each gene.
:param n_isoforms: Number of isoforms to consider in the test for each gene. All additional least expressed isoforms get summarized.'''

groupnames, groups, grp_idx = _check_groups(self, groups)
logger.info('testing differential isoform expression (DIE) for %s.', ' vs '.join(f'{groupnames[i]} ({len(groups[i])})' for i in range(2)))

result = [(g.id, g.name, g.chrom, g.strand, g.start, g.end)+g.die_test(grp_idx, min_cov, n_isoforms) for g in self.iter_genes(progress_bar=progress_bar)]
result = pd.DataFrame(result, columns=['gene_id', 'gene_name', 'chrom', 'strand', 'start', 'end', 'pvalue', 'deltaPI', 'transcript_ids'])
mask = np.isfinite(result['pvalue'])
padj = np.empty(mask.shape)
padj.fill(np.nan)
padj[mask] = multi.multipletests(result.loc[mask, 'pvalue'], method=padj_method)[1]
result.insert(6, 'padj', padj)
return result


def alternative_splicing_events(self, min_total=100, min_alt_fraction=.1, samples=None, region=None, query=None, progress_bar=False):
'''Finds alternative splicing events.
Expand Down
12 changes: 9 additions & 3 deletions src/isotools/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def splice_identical(tr1, tr2):
# all splice sites are equal
if len(tr1) != len(tr2): # different number of exons
return False
if len(tr1) == 1 and overlap(tr1[0], tr2[0]): # single exon genes
if len(tr1) == 1 and has_overlap(tr1[0], tr2[0]): # single exon genes
return True
if tr1[0][1] != tr2[0][1] or tr1[-1][0] != tr2[-1][0]: # check first and last exons
return False
Expand All @@ -127,7 +127,7 @@ def splice_identical(tr1, tr2):
return True


def overlap(r1, r2):
def has_overlap(r1, r2):
"check the overlap of two intervals"
# assuming start < end
if r1[1] < r2[0] or r2[1] < r1[0]:
Expand All @@ -136,6 +136,12 @@ def overlap(r1, r2):
return True


def get_overlap(r1, r2):
"check the overlap of two intervals"
# assuming start < end
return min(0, min(r1[1], r2[1]) - max(r1[0], r2[0]))


def get_intersects(tr1, tr2):
"get the number of intersecting splice sites and intersecting bases of two transcripts"
tr1_enum = enumerate(tr1)
Expand All @@ -151,7 +157,7 @@ def get_intersects(tr1, tr2):
sjintersect += 1
if tr2_exon[1] == tr1_exon[1] and i < len(tr2)-1 and j < len(tr1)-1:
sjintersect += 1
if overlap(tr1_exon, tr2_exon):
if has_overlap(tr1_exon, tr2_exon):
# region intersect
i_end = min(tr1_exon[1], tr2_exon[1])
i_start = max(tr1_exon[0], tr2_exon[0])
Expand Down
51 changes: 51 additions & 0 deletions src/isotools/gene.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from intervaltree import Interval
from collections.abc import Iterable
from scipy.stats import chi2_contingency
from Bio.Seq import Seq
import numpy as np
import copy
Expand Down Expand Up @@ -415,6 +416,56 @@ def copy(self):
'Returns a shallow copy of self.'
return self.__copy__()

def die_test(self, groups, min_cov=25, n_isoforms=10):
''' Reimplementation of the DIE test, suggested by Joglekar et al in Nat Commun 12, 463 (2021):
"A spatially resolved brain region- and cell type-specific isoform atlas of the postnatal mouse brain"
Syntax and parameters follow the original implementation in
https://github.com/noush-joglekar/scisorseqr/blob/master/inst/RScript/IsoformTest.R
:param groups: Define the columns for the groups.
:param min_cov: Minimal number of reads per group for the gene.
:param n_isoforms: Number of isoforms to consider in the test for the gene. All additional least expressed isoforms get summarized.'''
# select the samples and sum the group counts
try:
# Fast mode when testing several genes
cov = np.array([self.coverage[grp].sum(0) for grp in groups]).T
except IndexError:
# Fall back to looking up the sample indices
from isotools._transcriptome_stats import _check_groups
_, _, groups = _check_groups(self._transcriptome, groups)
cov = np.array([self.coverage[grp].sum(0) for grp in groups]).T

if np.any(cov.sum(0) < min_cov):
return np.nan, np.nan, []
# if there are more than 'numIsoforms' isoforms of the gene, all additional least expressed get summarized.
if cov.shape[0] > n_isoforms:
idx = np.argpartition(-cov.sum(1), n_isoforms)

additional = cov[idx[n_isoforms:]].sum(0)
cov = cov[idx[:n_isoforms]]
cov[n_isoforms-1] += additional
idx[n_isoforms-1] = -1 # this isoform gets all other - I give it index
elif cov.shape[0] < 2:
return np.nan, np.nan, []
else:
idx = np.array(range(cov.shape[0]))
try:
_, pval, _, _ = chi2_contingency(cov)
except ValueError:
logger.debug(f'chi2_contingency({cov})')
raise
iso_frac = cov/cov.sum(0)
deltaPI = iso_frac[..., 0]-iso_frac[..., 1]
order = np.argsort(deltaPI)
pos_idx = [order[-i] for i in range(1, 3) if deltaPI[order[-i]] > 0]
neg_idx = [order[i] for i in range(2) if deltaPI[order[i]] < 0]
deltaPI_pos = deltaPI[pos_idx].sum()
deltaPI_neg = deltaPI[neg_idx].sum()
if deltaPI_pos > -deltaPI_neg:
return pval, deltaPI_pos, idx[pos_idx]
else:
return pval, deltaPI_neg, idx[neg_idx]


def _coding_len(exons, cds):
coding_len = [0, 0, 0]
Expand Down
12 changes: 6 additions & 6 deletions src/isotools/splice_graph.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import logging
from sortedcontainers import SortedDict # for SpliceGraph
from ._utils import pairwise, overlap
from ._utils import pairwise, has_overlap
from .decorators import deprecated, experimental
from typing import Union

Expand Down Expand Up @@ -284,7 +284,7 @@ def _check_exon(self, j1, j2, is_first, is_reverse, e, e2=None):
else:
altsplice = {'novel exon': [e]}
j2 = j1
elif (is_first and is_last): # mono-exon
elif (is_first and is_last): # mono-exon TODO: add info wether there is a ref mono exon?
altsplice['mono-exon'] = []
category = 1
else: # check splice sites
Expand All @@ -298,7 +298,7 @@ def _check_exon(self, j1, j2, is_first, is_reverse, e, e2=None):
elif self[j1][0] > e[0] and not any(j in self._tss for j in range(j1, j2 + 1)): # exon start is intronic in ref
site = 'PAS' if is_reverse else 'TSS'
altsplice.setdefault(f'novel exonic {site}', []).append((e[0], self[j1][0]))
category = max(2, category)
category = max(1, category)
if self[j2][1] != e[1]: # second splice site missmatch
if not is_last:
# pos="intronic" if self[j2][1]<e[1] else "exonic"
Expand All @@ -310,7 +310,7 @@ def _check_exon(self, j1, j2, is_first, is_reverse, e, e2=None):
elif self[j2][1] < e[1] and not any(j in self._pas for j in range(j1, j2 + 1)): # exon end is intronic in ref & not overlapping tss
site = 'TSS' if is_reverse else 'PAS'
altsplice.setdefault(f'novel exonic {site}', []).append((self[j2][1], e[1]))
category = max(2, category)
category = max(1, category)

# find intron retentions
if j1 < j2 and any(self[ji + 1].start - self[ji].end > 0 for ji in range(j1, j2)):
Expand Down Expand Up @@ -519,14 +519,14 @@ def get_exon_support_matrix(self, exons):
esm = np.zeros((len(self._tss), len(exons)), np.bool) # the intron support matrix
for tr_nr, tss in enumerate(self._tss): # check overlap of first exon
for j in range(tss, len(self)):
if overlap(self[j], exons[0]):
if has_overlap(self[j], exons[0]):
esm[tr_nr, 0] = True
elif self[j].suc.get(tr_nr, None) == j + 1 and j - 1 < len(self) and self[j].end == self[j + 1].start:
continue
break
for tr_nr, pas in enumerate(self._pas): # check overlap of last exon
for j in range(pas, -1, -1):
if overlap(self[j], exons[-1]):
if has_overlap(self[j], exons[-1]):
esm[tr_nr, -1] = True
elif self[j].pre.get(tr_nr, None) == j - 1 and j > 0 and self[j].start == self[j - 1].end:
continue
Expand Down
2 changes: 1 addition & 1 deletion src/isotools/transcriptome.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def __iter__(self):
from ._transcriptome_filter import add_qc_metrics, add_filter, remove_filter, iter_genes, iter_transcripts, iter_ref_transcripts

# statistic: differential splicing, alternative_splicing_events
from ._transcriptome_stats import altsplice_test, alternative_splicing_events, rarefaction
from ._transcriptome_stats import die_test, altsplice_test, alternative_splicing_events, rarefaction

# statistic: summary tables (can be used as input to plot_bar / plot_dist)
from ._transcriptome_stats import altsplice_stats, filter_stats, transcript_length_hist, transcript_coverage_hist,\
Expand Down

0 comments on commit 3ca5fd1

Please sign in to comment.