Skip to content

Commit

Permalink
add include_for_norm and exclude_for_norm to export_coverage
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Nov 27, 2024
1 parent 35f98b2 commit 8aa8516
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 34 deletions.
5 changes: 5 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## Nightly (unreleased)

### Features:

- Implement `BPM` normalization in `ex.export_coverage`.
- Add `include_for_norm` and `exclude_for_norm` to `ex.export_coverage`.

## Release 2.7.1 (released October 29, 2024)

### Features:
Expand Down
86 changes: 63 additions & 23 deletions snapatac2-core/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ pub trait Exporter: SnapData {
resolution: usize,
blacklist_regions: Option<&GIntervalMap<()>>,
normalization: Option<Normalization>,
ignore_for_norm: Option<&HashSet<&str>>,
include_for_norm: Option<&GIntervalMap<()>>,
exclude_for_norm: Option<&GIntervalMap<()>>,
min_fragment_length: Option<u64>,
max_fragment_length: Option<u64>,
smooth_length: Option<u16>,
Expand Down Expand Up @@ -206,7 +207,8 @@ pub trait Exporter: SnapData {
smooth_length,
blacklist_regions,
normalization,
ignore_for_norm,
include_for_norm,
exclude_for_norm,
);

match format {
Expand Down Expand Up @@ -276,15 +278,19 @@ impl std::str::FromStr for Normalization {
/// is considered (the total of 60 bp).
/// * `blacklist_regions` - Blacklist regions to be ignored.
/// * `normalization` - Normalization method.
/// * `ignore_for_norm` - Chromosomes to be ignored for normalization.
/// * `include_for_norm` - If specified, only the regions that overlap with these intervals will be used for normalization.
/// * `exclude_for_norm` - If specified, the regions that overlap with these intervals will be
/// excluded from normalization. If a region is in both "include_for_norm" and
/// "exclude_for_norm", it will be excluded.
fn create_bedgraph_from_fragments<I>(
fragments: I,
chrom_sizes: &ChromSizes,
bin_size: u64,
smooth_length: Option<u16>,
blacklist_regions: Option<&GIntervalMap<()>>,
normalization: Option<Normalization>,
ignore_for_norm: Option<&HashSet<&str>>,
include_for_norm: Option<&GIntervalMap<()>>,
exclude_for_norm: Option<&GIntervalMap<()>>,
) -> Vec<BedGraph<f32>>
where
I: Iterator<Item = Fragment>,
Expand All @@ -294,21 +300,23 @@ where
.map(|(k, v)| GenomicRange::new(k, 0, *v))
.collect();
let mut counter = SparseBinnedCoverage::new(&genome, bin_size);
let mut total_reads = 0.0;
let mut norm_factor = 0.0;
fragments.for_each(|frag| {
let not_in_blacklist = blacklist_regions.map_or(true, |bl| !bl.is_overlapped(&frag));
if not_in_blacklist {
if ignore_for_norm.map_or(true, |x| !x.contains(frag.chrom())) {
total_reads += 1.0;
if include_for_norm.map_or(true, |x| x.is_overlapped(&frag))
&& !exclude_for_norm.map_or(false, |x| x.is_overlapped(&frag))
{
norm_factor += 1.0;
}
counter.insert(&frag, 1.0);
}
});

let norm_factor = match normalization {
norm_factor = match normalization {
None => 1.0,
Some(Normalization::RPKM) => total_reads * bin_size as f32 / 1e9,
Some(Normalization::CPM) => total_reads / 1e6,
Some(Normalization::RPKM) => norm_factor * bin_size as f32 / 1e9,
Some(Normalization::CPM) => norm_factor / 1e6,
Some(Normalization::BPM) => counter.get_coverage().values().sum::<f32>() / 1e6,
Some(Normalization::RPGC) => todo!(),
};
Expand All @@ -321,15 +329,22 @@ where
let counts: Box<dyn Iterator<Item = _>> = if let Some(smooth_length) = smooth_length {
let smooth_left = (smooth_length - 1) / 2;
let smooth_right = smooth_left + (smooth_left - 1) % 2;
Box::new(smooth_bedgraph(counts, bin_size, smooth_left, smooth_right, chrom_sizes))
Box::new(smooth_bedgraph(
counts,
bin_size,
smooth_left,
smooth_right,
chrom_sizes,
))
} else {
Box::new(counts)
};

let chunks = counts
.map(|(region, count)| BedGraph::from_bed(&region, count))
.chunk_by(|x| x.value);
chunks.into_iter()
chunks
.into_iter()
.flat_map(|(_, groups)| {
merge_sorted_bed_with(groups, |beds| {
let mut iter = beds.into_iter();
Expand Down Expand Up @@ -502,7 +517,11 @@ mod tests {
None,
None,
None,
).into_iter().map(|x| x.value).collect();
None,
)
.into_iter()
.map(|x| x.value)
.collect();
let expected = vec![1.0, 3.0, 4.0, 3.0, 2.0, 4.0, 3.0, 2.0];
assert_eq!(output, expected);

Expand All @@ -514,7 +533,11 @@ mod tests {
None,
None,
None,
).into_iter().map(|x| x.value).collect();
None,
)
.into_iter()
.map(|x| x.value)
.collect();
let expected = vec![2.0, 4.0, 3.0, 4.0, 3.0, 2.0, 1.0];
assert_eq!(output, expected);
}
Expand All @@ -531,29 +554,45 @@ mod tests {

let output = create_bedgraph_from_fragments(
fragments.clone().into_iter(),
&[("chr1", 248956422), ("chr2", 242193529)].into_iter().collect(),
&[("chr1", 248956422), ("chr2", 242193529)]
.into_iter()
.collect(),
1,
None,
None,
None,
None,
None,
);
assert_eq!(output, expected);

let output = create_bedgraph_from_fragments(
fragments.into_iter(),
&[("chr1", 248956422), ("chr2", 242193529)].into_iter().collect(),
&[("chr1", 248956422), ("chr2", 242193529)]
.into_iter()
.collect(),
1,
None,
None,
Some(Normalization::BPM),
None,
None,
);
let scale_factor: f32 = expected
.iter()
.map(|x| x.value * x.len() as f32)
.sum::<f32>()
/ 1e6;
assert_eq!(
output,
expected
.into_iter()
.map(|mut x| {
x.value = x.value / scale_factor;
x
})
.collect::<Vec<_>>()
);
let scale_factor: f32 = expected.iter().map(|x| x.value * x.len() as f32).sum::<f32>() / 1e6;
assert_eq!(output, expected.into_iter().map(|mut x| {
x.value = x.value / scale_factor;
x
}).collect::<Vec<_>>());
}

#[test]
Expand All @@ -572,7 +611,8 @@ mod tests {
2,
2,
&[("chr1", 10000000)].into_iter().collect(),
).collect::<Vec<_>>(),
)
.collect::<Vec<_>>(),
vec![
(GenomicRange::new("chr1", 0, 100), 0.6),
(GenomicRange::new("chr1", 100, 200), 0.6),
Expand All @@ -590,4 +630,4 @@ mod tests {
],
);
}
}
}
1 change: 1 addition & 0 deletions snapatac2-core/src/preprocessing/qc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ pub fn read_tss<R: Read>(file: R) -> impl Iterator<Item = (String, u64, bool)> {
})
}

#[derive(Debug, Clone)]
pub struct TssRegions {
pub promoters: GIntervalMap<bool>,
window_size: u64,
Expand Down
20 changes: 14 additions & 6 deletions snapatac2-python/python/snapatac2/export/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ def export_coverage(
bin_size: int = 10,
blacklist: Path | None = None,
normalization: Literal["RPKM", "CPM", "BPM"] | None = "RPKM",
ignore_for_norm: list[str] | None = None,
include_for_norm: list[str] | Path = None,
exclude_for_norm: list[str] | Path = None,
min_frag_length: int | None = None,
max_frag_length: int | None = 2000,
smooth_length: int | None = None,
Expand All @@ -99,8 +100,9 @@ def export_coverage(
) -> dict[str, str]:
"""Export and save coverage in a bedgraph or bigwig format file.
This function generates coverage tracks (bigWig or bedGraph) for each group
of cells defined in `groupby`. The coverage is calculated as the number of reads
This function first divides cells into groups based on the `groupby` parameter.
It then independently generates the genome-wide coverage track (bigWig or bedGraph) for each group
of cells. The coverage is calculated as the number of reads
per bin, where bins are short consecutive counting windows of a defined size.
For paired-end data, the reads are extended to the fragment length and the
coverage is calculated as the number of fragments per bin.
Expand Down Expand Up @@ -130,8 +132,14 @@ def export_coverage(
- RPKM (per bin) = #reads per bin / (#mapped_reads (in millions) * bin length (kb)).
- CPM (per bin) = #reads per bin / #mapped_reads (in millions).
- BPM (per bin) = #reads per bin / sum of all reads per bin (in millions).
ignore_for_norm
A list of chromosomes to ignore for normalization.
include_for_norm
A list of string or a BED file containing the genomic loci to include for normalization.
If specified, only the reads that overlap with these loci will be used for normalization.
A typical use case is to include only the promoter regions for the normalization.
exclude_for_norm
A list of string or a BED file containing the genomic loci to exclude for normalization.
If specified, the reads that overlap with these loci will be excluded from normalization.
If a read overlaps with both `include_for_norm` and `exclude_for_norm`, it will be excluded.
min_frag_length
Minimum fragment length to be included in the computation.
max_frag_length
Expand Down Expand Up @@ -213,6 +221,6 @@ def export_coverage(
n_jobs = None if n_jobs <= 0 else n_jobs
return internal.export_coverage(
adata, list(groupby), bin_size, out_dir, prefix, suffix, output_format, selections,
blacklist, normalization, ignore_for_norm, min_frag_length,
blacklist, normalization, include_for_norm, exclude_for_norm, min_frag_length,
max_frag_length, smooth_length, compression, compression_level, tempdir, n_jobs,
)
14 changes: 9 additions & 5 deletions snapatac2-python/src/export.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::utils::AnnDataLike;
use crate::utils::{read_genomic_ranges, AnnDataLike};
use snapatac2_core::{export::{Exporter, Normalization, CoverageOutputFormat}, utils};

use pyo3::{prelude::*, pybacked::PyBackedStr};
Expand Down Expand Up @@ -51,7 +51,8 @@ pub fn export_coverage(
selections: Option<HashSet<PyBackedStr>>,
blacklist: Option<PathBuf>,
normalization: Option<&str>,
ignore_for_norm: Option<HashSet<PyBackedStr>>,
include_for_norm: Option<&Bound<'_, PyAny>>,
exclude_for_norm: Option<&Bound<'_, PyAny>>,
min_frag_length: Option<u64>,
max_frag_length: Option<u64>,
smooth_length: Option<u16>,
Expand All @@ -63,8 +64,10 @@ pub fn export_coverage(
let group_by = group_by.iter().map(|x| x.as_ref()).collect();
let selections = selections.as_ref()
.map(|s| s.iter().map(|x| x.as_ref()).collect());
let ignore_for_norm = ignore_for_norm.as_ref()
.map(|s| s.iter().map(|x| x.as_ref()).collect());
let include_for_norm = include_for_norm.as_ref()
.map(|s| read_genomic_ranges(s).unwrap().into_iter().map(|x| (x, ())).collect());
let exclude_for_norm = exclude_for_norm.as_ref()
.map(|s| read_genomic_ranges(s).unwrap().into_iter().map(|x| (x, ())).collect());

let black: Option<GIntervalMap<()>> = blacklist.map(|black| {
Reader::new(utils::open_file_for_read(black), None)
Expand All @@ -80,7 +83,8 @@ pub fn export_coverage(
($data:expr) => {
$data.export_coverage(
&group_by, selections, resolution, black.as_ref(), normalization,
ignore_for_norm.as_ref(), min_frag_length, max_frag_length, smooth_length, dir, prefix,
include_for_norm.as_ref(), exclude_for_norm.as_ref(),
min_frag_length, max_frag_length, smooth_length, dir, prefix,
suffix, output_format, compression.map(|x| utils::Compression::from_str(x).unwrap()),
compression_level, temp_dir, num_threads,
)
Expand Down
16 changes: 16 additions & 0 deletions snapatac2-python/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,22 @@ pub(crate) fn jm_regress(
Ok(lin_reg_imprecise(iter).unwrap())
}

/// Get a list of genomic ranges from a python object. Acceptable types are:
/// - file-like object
/// - list of strings
pub(crate) fn read_genomic_ranges(input: &Bound<'_, PyAny>) -> Result<Vec<GenomicRange>> {
if let Ok(list) = input.downcast::<pyo3::types::PyList>() {
list.iter().map(|str| {
let str: &str = str.extract()?;
Ok(GenomicRange::from_str(str).unwrap())
}).collect()
} else {
let file: PathBuf = input.extract()?;
let mut reader = bed::io::Reader::new(utils::open_file_for_read(file), None);
Ok(reader.records::<GenomicRange>().map(|x| x.unwrap()).collect())
}
}

/// Read genomic regions from a bed file.
/// Returns a list of strings
#[pyfunction]
Expand Down

0 comments on commit 8aa8516

Please sign in to comment.