add include_for_norm and exclude_for_norm to export_coverage

kaizhang · Nov 27, 2024 · 8aa8516 · 8aa8516
1 parent 35f98b2
commit 8aa8516
Show file tree

Hide file tree

Showing 6 changed files with 108 additions and 34 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -2,6 +2,11 @@
 
 ## Nightly (unreleased)
 
+### Features:
+
+  - Implement `BPM` normalization in `ex.export_coverage`.
+  - Add `include_for_norm` and `exclude_for_norm` to `ex.export_coverage`.
+
 ## Release 2.7.1 (released October 29, 2024)
 
 ### Features:

diff --git a/snapatac2-core/src/export.rs b/snapatac2-core/src/export.rs
@@ -133,7 +133,8 @@ pub trait Exporter: SnapData {
         resolution: usize,
         blacklist_regions: Option<&GIntervalMap<()>>,
         normalization: Option<Normalization>,
-        ignore_for_norm: Option<&HashSet<&str>>,
+        include_for_norm: Option<&GIntervalMap<()>>,
+        exclude_for_norm: Option<&GIntervalMap<()>>,
         min_fragment_length: Option<u64>,
         max_fragment_length: Option<u64>,
         smooth_length: Option<u16>,
@@ -206,7 +207,8 @@ pub trait Exporter: SnapData {
                         smooth_length,
                         blacklist_regions,
                         normalization,
-                        ignore_for_norm,
+                        include_for_norm,
+                        exclude_for_norm,
                     );
 
                     match format {
@@ -276,15 +278,19 @@ impl std::str::FromStr for Normalization {
 ///                     is considered (the total of 60 bp).
 /// * `blacklist_regions` - Blacklist regions to be ignored.
 /// * `normalization` - Normalization method.
-/// * `ignore_for_norm` - Chromosomes to be ignored for normalization.
+/// * `include_for_norm` - If specified, only the regions that overlap with these intervals will be used for normalization.
+/// * `exclude_for_norm` - If specified, the regions that overlap with these intervals will be
+///                        excluded from normalization. If a region is in both "include_for_norm" and
+///                        "exclude_for_norm", it will be excluded.
 fn create_bedgraph_from_fragments<I>(
     fragments: I,
     chrom_sizes: &ChromSizes,
     bin_size: u64,
     smooth_length: Option<u16>,
     blacklist_regions: Option<&GIntervalMap<()>>,
     normalization: Option<Normalization>,
-    ignore_for_norm: Option<&HashSet<&str>>,
+    include_for_norm: Option<&GIntervalMap<()>>,
+    exclude_for_norm: Option<&GIntervalMap<()>>,
 ) -> Vec<BedGraph<f32>>
 where
     I: Iterator<Item = Fragment>,
@@ -294,21 +300,23 @@ where
         .map(|(k, v)| GenomicRange::new(k, 0, *v))
         .collect();
     let mut counter = SparseBinnedCoverage::new(&genome, bin_size);
-    let mut total_reads = 0.0;
+    let mut norm_factor = 0.0;
     fragments.for_each(|frag| {
         let not_in_blacklist = blacklist_regions.map_or(true, |bl| !bl.is_overlapped(&frag));
         if not_in_blacklist {
-            if ignore_for_norm.map_or(true, |x| !x.contains(frag.chrom())) {
-                total_reads += 1.0;
+            if include_for_norm.map_or(true, |x| x.is_overlapped(&frag))
+                && !exclude_for_norm.map_or(false, |x| x.is_overlapped(&frag))
+            {
+                norm_factor += 1.0;
             }
             counter.insert(&frag, 1.0);
         }
     });
 
-    let norm_factor = match normalization {
+    norm_factor = match normalization {
         None => 1.0,
-        Some(Normalization::RPKM) => total_reads * bin_size as f32 / 1e9,
-        Some(Normalization::CPM) => total_reads / 1e6,
+        Some(Normalization::RPKM) => norm_factor * bin_size as f32 / 1e9,
+        Some(Normalization::CPM) => norm_factor / 1e6,
         Some(Normalization::BPM) => counter.get_coverage().values().sum::<f32>() / 1e6,
         Some(Normalization::RPGC) => todo!(),
     };
@@ -321,15 +329,22 @@ where
     let counts: Box<dyn Iterator<Item = _>> = if let Some(smooth_length) = smooth_length {
         let smooth_left = (smooth_length - 1) / 2;
         let smooth_right = smooth_left + (smooth_left - 1) % 2;
-        Box::new(smooth_bedgraph(counts, bin_size, smooth_left, smooth_right, chrom_sizes))
+        Box::new(smooth_bedgraph(
+            counts,
+            bin_size,
+            smooth_left,
+            smooth_right,
+            chrom_sizes,
+        ))
     } else {
         Box::new(counts)
     };
 
     let chunks = counts
         .map(|(region, count)| BedGraph::from_bed(&region, count))
         .chunk_by(|x| x.value);
-    chunks.into_iter()
+    chunks
+        .into_iter()
         .flat_map(|(_, groups)| {
             merge_sorted_bed_with(groups, |beds| {
                 let mut iter = beds.into_iter();
@@ -502,7 +517,11 @@ mod tests {
             None,
             None,
             None,
-        ).into_iter().map(|x| x.value).collect();
+            None,
+        )
+        .into_iter()
+        .map(|x| x.value)
+        .collect();
         let expected = vec![1.0, 3.0, 4.0, 3.0, 2.0, 4.0, 3.0, 2.0];
         assert_eq!(output, expected);
 
@@ -514,7 +533,11 @@ mod tests {
             None,
             None,
             None,
-        ).into_iter().map(|x| x.value).collect();
+            None,
+        )
+        .into_iter()
+        .map(|x| x.value)
+        .collect();
         let expected = vec![2.0, 4.0, 3.0, 4.0, 3.0, 2.0, 1.0];
         assert_eq!(output, expected);
     }
@@ -531,29 +554,45 @@ mod tests {
 
         let output = create_bedgraph_from_fragments(
             fragments.clone().into_iter(),
-            &[("chr1", 248956422), ("chr2", 242193529)].into_iter().collect(),
+            &[("chr1", 248956422), ("chr2", 242193529)]
+                .into_iter()
+                .collect(),
             1,
             None,
             None,
             None,
             None,
+            None,
         );
         assert_eq!(output, expected);
 
         let output = create_bedgraph_from_fragments(
             fragments.into_iter(),
-            &[("chr1", 248956422), ("chr2", 242193529)].into_iter().collect(),
+            &[("chr1", 248956422), ("chr2", 242193529)]
+                .into_iter()
+                .collect(),
             1,
             None,
             None,
             Some(Normalization::BPM),
             None,
+            None,
+        );
+        let scale_factor: f32 = expected
+            .iter()
+            .map(|x| x.value * x.len() as f32)
+            .sum::<f32>()
+            / 1e6;
+        assert_eq!(
+            output,
+            expected
+                .into_iter()
+                .map(|mut x| {
+                    x.value = x.value / scale_factor;
+                    x
+                })
+                .collect::<Vec<_>>()
         );
-        let scale_factor: f32 = expected.iter().map(|x| x.value * x.len() as f32).sum::<f32>() / 1e6;
-        assert_eq!(output, expected.into_iter().map(|mut x| {
-            x.value = x.value / scale_factor;
-            x
-        }).collect::<Vec<_>>());
     }
 
     #[test]
@@ -572,7 +611,8 @@ mod tests {
                 2,
                 2,
                 &[("chr1", 10000000)].into_iter().collect(),
-            ).collect::<Vec<_>>(),
+            )
+            .collect::<Vec<_>>(),
             vec![
                 (GenomicRange::new("chr1", 0, 100), 0.6),
                 (GenomicRange::new("chr1", 100, 200), 0.6),
@@ -590,4 +630,4 @@ mod tests {
             ],
         );
     }
-}
+}
diff --git a/snapatac2-core/src/preprocessing/qc.rs b/snapatac2-core/src/preprocessing/qc.rs
@@ -249,6 +249,7 @@ pub fn read_tss<R: Read>(file: R) -> impl Iterator<Item = (String, u64, bool)> {
     })
 }
 
+#[derive(Debug, Clone)]
 pub struct TssRegions {
     pub promoters: GIntervalMap<bool>,
     window_size: u64,

diff --git a/snapatac2-python/python/snapatac2/export/__init__.py b/snapatac2-python/python/snapatac2/export/__init__.py
@@ -84,7 +84,8 @@ def export_coverage(
     bin_size: int = 10,
     blacklist: Path | None = None,
     normalization: Literal["RPKM", "CPM", "BPM"] | None = "RPKM",
-    ignore_for_norm: list[str] | None = None,
+    include_for_norm: list[str] | Path = None,
+    exclude_for_norm: list[str] | Path = None,
     min_frag_length: int | None = None,
     max_frag_length: int | None = 2000,
     smooth_length: int | None = None,
@@ -99,8 +100,9 @@ def export_coverage(
 ) -> dict[str, str]:
     """Export and save coverage in a bedgraph or bigwig format file.
 
-    This function generates coverage tracks (bigWig or bedGraph) for each group
-    of cells defined in `groupby`. The coverage is calculated as the number of reads
+    This function first divides cells into groups based on the `groupby` parameter.
+    It then independently generates the genome-wide coverage track (bigWig or bedGraph) for each group
+    of cells. The coverage is calculated as the number of reads
     per bin, where bins are short consecutive counting windows of a defined size.
     For paired-end data, the reads are extended to the fragment length and the
     coverage is calculated as the number of fragments per bin.
@@ -130,8 +132,14 @@ def export_coverage(
         - RPKM (per bin) = #reads per bin / (#mapped_reads (in millions) * bin length (kb)).
         - CPM (per bin) = #reads per bin / #mapped_reads (in millions).
         - BPM (per bin) = #reads per bin / sum of all reads per bin (in millions).
-    ignore_for_norm
-        A list of chromosomes to ignore for normalization.
+    include_for_norm
+        A list of string or a BED file containing the genomic loci to include for normalization.
+        If specified, only the reads that overlap with these loci will be used for normalization.
+        A typical use case is to include only the promoter regions for the normalization.
+    exclude_for_norm
+        A list of string or a BED file containing the genomic loci to exclude for normalization.
+        If specified, the reads that overlap with these loci will be excluded from normalization.
+        If a read overlaps with both `include_for_norm` and `exclude_for_norm`, it will be excluded.
     min_frag_length
         Minimum fragment length to be included in the computation.
     max_frag_length
@@ -213,6 +221,6 @@ def export_coverage(
     n_jobs = None if n_jobs <= 0 else n_jobs
     return internal.export_coverage(
         adata, list(groupby), bin_size, out_dir, prefix, suffix, output_format, selections,
-        blacklist, normalization, ignore_for_norm, min_frag_length,
+        blacklist, normalization, include_for_norm, exclude_for_norm, min_frag_length,
         max_frag_length, smooth_length, compression, compression_level, tempdir, n_jobs,
     )
diff --git a/snapatac2-python/src/export.rs b/snapatac2-python/src/export.rs
@@ -1,4 +1,4 @@
-use crate::utils::AnnDataLike;
+use crate::utils::{read_genomic_ranges, AnnDataLike};
 use snapatac2_core::{export::{Exporter, Normalization, CoverageOutputFormat}, utils};
 
 use pyo3::{prelude::*, pybacked::PyBackedStr};
@@ -51,7 +51,8 @@ pub fn export_coverage(
     selections: Option<HashSet<PyBackedStr>>,
     blacklist: Option<PathBuf>,
     normalization: Option<&str>,
-    ignore_for_norm: Option<HashSet<PyBackedStr>>,
+    include_for_norm: Option<&Bound<'_, PyAny>>,
+    exclude_for_norm: Option<&Bound<'_, PyAny>>,
     min_frag_length: Option<u64>,
     max_frag_length: Option<u64>,
     smooth_length: Option<u16>,
@@ -63,8 +64,10 @@ pub fn export_coverage(
     let group_by = group_by.iter().map(|x| x.as_ref()).collect();
     let selections = selections.as_ref()
         .map(|s| s.iter().map(|x| x.as_ref()).collect());
-    let ignore_for_norm = ignore_for_norm.as_ref()
-        .map(|s| s.iter().map(|x| x.as_ref()).collect());
+    let include_for_norm = include_for_norm.as_ref()
+        .map(|s| read_genomic_ranges(s).unwrap().into_iter().map(|x| (x, ())).collect());
+    let exclude_for_norm = exclude_for_norm.as_ref()
+        .map(|s| read_genomic_ranges(s).unwrap().into_iter().map(|x| (x, ())).collect());
 
     let black: Option<GIntervalMap<()>> = blacklist.map(|black| {
         Reader::new(utils::open_file_for_read(black), None)
@@ -80,7 +83,8 @@ pub fn export_coverage(
         ($data:expr) => {
             $data.export_coverage(
                 &group_by, selections, resolution, black.as_ref(), normalization,
-                ignore_for_norm.as_ref(), min_frag_length, max_frag_length, smooth_length, dir, prefix,
+                include_for_norm.as_ref(), exclude_for_norm.as_ref(),
+                min_frag_length, max_frag_length, smooth_length, dir, prefix,
                 suffix, output_format, compression.map(|x| utils::Compression::from_str(x).unwrap()),
                 compression_level, temp_dir, num_threads,
             )

diff --git a/snapatac2-python/src/utils.rs b/snapatac2-python/src/utils.rs
@@ -228,6 +228,22 @@ pub(crate) fn jm_regress(
     Ok(lin_reg_imprecise(iter).unwrap())
 }
 
+/// Get a list of genomic ranges from a python object. Acceptable types are:
+/// - file-like object
+/// - list of strings
+pub(crate) fn read_genomic_ranges(input: &Bound<'_, PyAny>) -> Result<Vec<GenomicRange>> {
+    if let Ok(list) = input.downcast::<pyo3::types::PyList>() {
+        list.iter().map(|str| {
+            let str: &str = str.extract()?;
+            Ok(GenomicRange::from_str(str).unwrap())
+        }).collect()
+    } else {
+        let file: PathBuf = input.extract()?;
+        let mut reader = bed::io::Reader::new(utils::open_file_for_read(file), None);
+        Ok(reader.records::<GenomicRange>().map(|x| x.unwrap()).collect())
+    }
+}
+
 /// Read genomic regions from a bed file.
 /// Returns a list of strings
 #[pyfunction]