fix tests

kaizhang · Oct 8, 2023 · 8accb32 · 8accb32
1 parent 9c84520
commit 8accb32
Show file tree

Hide file tree

Showing 24 changed files with 242 additions and 188 deletions.
diff --git a/snapatac2-core/Cargo.toml b/snapatac2-core/Cargo.toml
@@ -15,7 +15,7 @@ anndata = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "7279f0a8a
 anyhow = "1.0"
 bigtools = "0.2"
 bincode = "1.3"
-bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "19832eda2909dbe87289e09ba966af0797adb9f8" }
+bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "27ae1ee81db4ac84434014877b8d17aa22583df2" }
 extsort = "0.4"
 either = "1.8"
 flate2 = "1.0"

diff --git a/snapatac2-core/src/export.rs b/snapatac2-core/src/export.rs
@@ -61,7 +61,7 @@ pub trait Exporter: SnapData {
                     if let Some(barcodes_) = barcodes {
                         let bc = barcodes_[start + i];
                         entry.extend(xs.into_iter().map(|mut x| {
-                            x.name = Some(bc.to_string());
+                            x.barcode = Some(bc.to_string());
                             x
                         }));
                     } else {

diff --git a/snapatac2-core/src/preprocessing/bam.rs b/snapatac2-core/src/preprocessing/bam.rs
@@ -83,7 +83,7 @@ pub fn make_fragment_file<P1: AsRef<Path>, P2: AsRef<Path>>(
     let header: Header = fix_header(reader.read_header().unwrap()).parse().unwrap();
     reader.read_reference_sequences().unwrap();
 
-    let f = File::create(output_file.as_ref().clone()).expect("cannot create the output file");
+    let f = File::create(output_file.as_ref()).expect("cannot create the output file");
     let mut output: Box<dyn Write> =
         if output_file.as_ref().extension().and_then(|x| x.to_str()) == Some("gz") {
             Box::new(GzEncoder::new(BufWriter::new(f), Compression::default()))

diff --git a/snapatac2-core/src/preprocessing/count_data.rs b/snapatac2-core/src/preprocessing/count_data.rs
@@ -84,7 +84,7 @@ pub trait SnapData: AnnDataOp {
 
     /// Compute the fraction of reads in each region.
     fn frip<D>(&self, regions: &Vec<BedTree<D>>) -> Result<Array2<f64>> {
-        let vec = qc::fraction_in_region(self.get_count_iter(2000)?.into_raw(), regions)
+        let vec = qc::fraction_of_reads_in_region(self.get_count_iter(2000)?.into_raw(), regions)
             .map(|x| x.0).flatten().flatten().collect::<Vec<_>>();
         Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into)
     }

diff --git a/snapatac2-core/src/preprocessing/count_data/coverage.rs b/snapatac2-core/src/preprocessing/count_data/coverage.rs
@@ -1,8 +1,8 @@
-use crate::preprocessing::count_data::genome::{FeatureCounter, GenomeBaseIndex, ChromSizes};
+use crate::preprocessing::{count_data::genome::{FeatureCounter, GenomeBaseIndex, ChromSizes}, Fragment};
 
 use std::collections::HashMap;
 use anndata::data::{utils::to_csr_data, CsrNonCanonical};
-use bed_utils::bed::{BEDLike, BED, Strand, GenomicRange};
+use bed_utils::bed::{BEDLike, Strand, GenomicRange};
 use nalgebra_sparse::{CsrMatrix, pattern::SparsityPattern};
 use num::traits::{FromPrimitive, One, Zero, SaturatingAdd};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
@@ -70,7 +70,7 @@ where
         self
     }
 
-    pub fn into_raw(self) -> impl ExactSizeIterator<Item = (Vec<Vec<BED<6>>>, usize, usize)> {
+    pub fn into_raw(self) -> impl ExactSizeIterator<Item = (Vec<Vec<Fragment>>, usize, usize)> {
         let index = self.index;
         self.coverage.map(move |(raw_mat, a, b)| {
             let beds = match raw_mat {
@@ -85,25 +85,23 @@ where
                             let size = values[j];
                             let (chrom, pos) = index.get_position(col_indices[j]);
                             if size > 0 {
-                                BED::new(
-                                    chrom,
-                                    pos,
-                                    pos + size as u64,
-                                    None,
-                                    None,
-                                    Some(Strand::Forward),
-                                    Default::default(),
-                                )
+                                Fragment {
+                                    chrom: chrom.to_string(),
+                                    start: pos,
+                                    end: pos + size as u64,
+                                    barcode: None,
+                                    count: 1,
+                                    strand: Some(Strand::Forward),
+                                }
                             } else {
-                                BED::new(
-                                    chrom,
-                                    pos + 1 - size.abs() as u64,
-                                    pos + 1,
-                                    None,
-                                    None,
-                                    Some(Strand::Reverse),
-                                    Default::default(),
-                                )
+                                Fragment {
+                                    chrom: chrom.to_string(),
+                                    start: pos + 1 - size.abs() as u64,
+                                    end: pos + 1,
+                                    barcode: None,
+                                    count: 1,
+                                    strand: Some(Strand::Reverse),
+                                }
                             }
                         }).collect()
                     }).collect()
@@ -118,12 +116,14 @@ where
                         (row_start..row_end).map(|j| {
                             let size = values[j];
                             let (chrom, start) = index.get_position(col_indices[j]);
-                            BED::new(
-                                chrom,
+                            Fragment {
+                                chrom: chrom.to_string(),
                                 start,
-                                start + size as u64,
-                                None, None, None, Default::default(),
-                            )
+                                end: start + size as u64,
+                                barcode: None,
+                                count: 1,
+                                strand: None,
+                            }
                         }).collect()
                     }).collect()
                 },
@@ -132,7 +132,7 @@ where
         })
     }
 
-    pub fn into_raw_groups<F, K>(self, key: F) -> impl ExactSizeIterator<Item = HashMap<K, Vec<BED<6>>>>
+    pub fn into_raw_groups<F, K>(self, key: F) -> impl ExactSizeIterator<Item = HashMap<K, Vec<Fragment>>>
     where
         F: Fn(usize) -> K,
         K: Eq + PartialEq + std::hash::Hash,
@@ -146,7 +146,6 @@ where
                     .or_insert_with(Vec::new)
                     .extend(xs.into_iter());
             });
-
             ordered
         })
     }

diff --git a/snapatac2-core/src/preprocessing/count_data/import.rs b/snapatac2-core/src/preprocessing/count_data/import.rs
@@ -62,7 +62,7 @@ where
     anndata.obsm().add_iter(
         obsm_key,
         fragments
-            .group_by(|x| x.barcode.clone())
+            .group_by(|x| x.name().unwrap().to_string())
             .into_iter()
             .progress_with(spinner)
             .filter(|(key, _)| white_list.map_or(true, |x| x.contains(key)))

diff --git a/snapatac2-core/src/preprocessing/qc.rs b/snapatac2-core/src/preprocessing/qc.rs
@@ -1,13 +1,11 @@
 use std::{io::{Read, BufRead, BufReader}, ops::Div, collections::{HashMap, HashSet}};
 use anndata::data::CsrNonCanonical;
-use bed_utils::bed::{
-    GenomicRange, BEDLike, tree::BedTree,
-    ParseError, Strand, BED,
-};
+use bed_utils::bed::{GenomicRange, BEDLike, tree::BedTree, ParseError, Strand};
 use anyhow::Result;
 use serde::{Serialize, Deserialize};
 use extsort::sorter::Sortable;
 use bincode;
+use smallvec::{SmallVec, smallvec};
 
 pub type CellBarcode = String;
 
@@ -18,7 +16,7 @@ pub struct Fragment {
     pub chrom: String,
     pub start: u64,
     pub end: u64,
-    pub barcode: CellBarcode,
+    pub barcode: Option<CellBarcode>,
     pub count: u32,
     pub strand: Option<Strand>,
 }
@@ -35,16 +33,16 @@ impl Sortable for Fragment {
 }
 
 impl Fragment {
-    pub fn to_insertions(&self) -> Vec<GenomicRange> {
+    pub fn to_reads(&self) -> SmallVec<[GenomicRange; 2]> {
         match self.strand {
-            None => vec![
+            None => smallvec![
                 GenomicRange::new(self.chrom.clone(), self.start, self.start + 1),
                 GenomicRange::new(self.chrom.clone(), self.end - 1, self.end),
             ],
-            Some(Strand::Forward) => vec![
+            Some(Strand::Forward) => smallvec![
                 GenomicRange::new(self.chrom.clone(), self.start, self.start + 1)
             ],
-            Some(Strand::Reverse) => vec![
+            Some(Strand::Reverse) => smallvec![
                 GenomicRange::new(self.chrom.clone(), self.end - 1, self.end)
             ],
         }
@@ -67,9 +65,33 @@ impl BEDLike for Fragment {
         self.end = end;
         self
     }
-    fn name(&self) -> Option<&str> { None }
-    fn score(&self) -> Option<bed_utils::bed::Score> { None }
-    fn strand(&self) -> Option<Strand> { None }
+    fn name(&self) -> Option<&str> {
+        self.barcode.as_deref()
+    }
+    fn score(&self) -> Option<bed_utils::bed::Score> {
+        Some(self.count.try_into().unwrap())
+    }
+    fn strand(&self) -> Option<Strand> {
+        self.strand
+    }
+}
+
+impl core::fmt::Display for Fragment {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        write!(
+            f,
+            "{}\t{}\t{}\t{}\t{}",
+            self.chrom(),
+            self.start(),
+            self.end(),
+            self.barcode.as_deref().unwrap_or("."),
+            self.count,
+        )?;
+        if let Some(strand) = self.strand() {
+            write!(f, "\t{}", strand)?;
+        }
+        Ok(())
+    }
 }
 
 impl std::str::FromStr for Fragment {
@@ -82,8 +104,13 @@ impl std::str::FromStr for Fragment {
             .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidStartPosition))?;
         let end = fields.next().ok_or(ParseError::MissingEndPosition)
             .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidEndPosition))?;
-        let barcode = fields.next().ok_or(ParseError::MissingName)
-            .map(|s| s.into())?;
+        let barcode = fields
+            .next()
+            .ok_or(ParseError::MissingName)
+            .map(|s| match s {
+                "." => None,
+                _ => Some(s.into()),
+            })?;
         let count = fields.next().map_or(Ok(1), |s| if s == "." {
             Ok(1)
         } else {
@@ -245,15 +272,15 @@ where
 {
     let mut barcodes = HashMap::new();
     fragments.for_each(|frag| {
-        let key = frag.barcode.clone();
+        let key = frag.barcode.unwrap().clone();
         *barcodes.entry(key).or_insert(0) += 1;
     });
     barcodes
 }
 
 pub fn tss_enrichment<I>(fragments: I, promoter: &BedTree<bool>) -> f64
 where
-    I: Iterator<Item = BED<6>>,
+    I: Iterator<Item = Fragment>,
 {
     fn find_pos<'a>(promoter: &'a BedTree<bool>, ins: &'a GenomicRange) -> impl Iterator<Item = usize> + 'a {
         promoter.find(ins).map(|(entry, data)| {
@@ -316,25 +343,28 @@ pub fn fragment_size_distribution<I>(data: I, max_size: usize) -> Vec<usize>
     size_dist
 }
 
-/// Count the fraction of the records in the given regions.
-pub fn fraction_in_region<'a, I, D>(
+/// Count the fraction of the reads in the given regions.
+pub fn fraction_of_reads_in_region<'a, I, D>(
     iter: I, regions: &'a Vec<BedTree<D>>,
 ) -> impl Iterator<Item = (Vec<Vec<f64>>, usize, usize)> + 'a
 where
-    I: Iterator<Item = (Vec<Vec<BED<6>>>, usize, usize)> + 'a,
+    I: Iterator<Item = (Vec<Vec<Fragment>>, usize, usize)> + 'a,
 {
     let k = regions.len();
-    iter.map(move |(beds, start, end)| {
-        let frac = beds.into_iter().map(|xs| {
-            let sum = xs.len() as f64;
+    iter.map(move |(data, start, end)| {
+        let frac = data.into_iter().map(|fragments| {
+            let mut sum = 0.0;
             let mut counts = vec![0.0; k];
-            xs.into_iter().for_each(|x|
-                regions.iter().enumerate().for_each(|(i, r)| {
-                    if r.is_overlapped(&x) {
-                        counts[i] += 1.0;
-                    }
-                })
-            );
+            fragments
+                .into_iter().flat_map(|fragment| fragment.to_reads())
+                .for_each(|read| {
+                    sum += 1.0;
+                    regions.iter().enumerate().for_each(|(i, r)|
+                        if r.is_overlapped(&read) {
+                            counts[i] += 1.0;
+                        }
+                    )
+                });
             counts.iter_mut().for_each(|x| *x /= sum);
             counts
         }).collect::<Vec<_>>();

diff --git a/snapatac2-python/Cargo.toml b/snapatac2-python/Cargo.toml
@@ -18,7 +18,7 @@ anndata-hdf5 = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "7279
 pyanndata = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "7279f0a8a2bf8a40aef5067af6dd2b4dca44a437" }
 extsort = "0.4"
 anyhow = "1.0"
-bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "19832eda2909dbe87289e09ba966af0797adb9f8" }
+bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "27ae1ee81db4ac84434014877b8d17aa22583df2" }
 flate2 = "1.0"
 itertools = "0.11"
 indicatif = "0.17"

diff --git a/snapatac2-python/snapatac2/_utils.py b/snapatac2-python/snapatac2/_utils.py
@@ -1,11 +1,11 @@
 import numpy as np
-import anndata as ad
 import logging
 
-from snapatac2._snapatac2 import AnnData, AnnDataSet, read
+from anndata import AnnData
+import snapatac2._snapatac2 as internal
 
 def is_anndata(data) -> bool:
-    return isinstance(data, ad.AnnData) or isinstance(data, AnnData) or isinstance(data, AnnDataSet)
+    return isinstance(data, AnnData) or isinstance(data, internal.AnnData) or isinstance(data, internal.AnnDataSet)
 
 def anndata_par(adatas, func, n_jobs=4):
     return anndata_ipar(list(enumerate(adatas)), lambda x: func(x[1]), n_jobs=n_jobs)
@@ -15,7 +15,7 @@ def anndata_ipar(inputs, func, n_jobs=4):
 
     exist_in_memory_adata = False
     for _, adata in inputs:
-        if isinstance(adata, ad.AnnData):
+        if isinstance(adata, AnnData):
             exist_in_memory_adata = True
             break
     if exist_in_memory_adata:
@@ -27,7 +27,7 @@ def anndata_ipar(inputs, func, n_jobs=4):
         from multiprocess import get_context
 
         def _func(x):
-            adata = read(x[1])
+            adata = internal.read(x[1])
             result = func((x[0], adata))
             adata.close() 
             return result

diff --git a/snapatac2-python/snapatac2/datasets.py b/snapatac2-python/snapatac2/datasets.py
@@ -127,11 +127,11 @@ def pbmc5k(type: Literal["fragment, h5ad, annotated_h5ad"] = "fragment") -> Path
         path to the file.
     """
     if type == "fragment":
-        return Path(datasets().fetch("atac_pbmc_5k.tsv.gz"))
+        return Path(datasets().fetch("atac_pbmc_5k.tsv.gz", progressbar=True))
     elif type == "h5ad":
-        return Path(datasets().fetch("atac_pbmc_5k.h5ad"))
+        return Path(datasets().fetch("atac_pbmc_5k.h5ad", progressbar=True))
     elif type == "annotated_h5ad":
-        return Path(datasets().fetch("atac_pbmc_5k_annotated.h5ad"))
+        return Path(datasets().fetch("atac_pbmc_5k_annotated.h5ad", progressbar=True))
     else:
         raise NameError("type '{}' is not available.".format(type))
 
@@ -175,7 +175,7 @@ def colon() -> list[tuple[str, Path]]:
     list[tuple[str, Path]]
         A list of tuples, each tuple contains the sample name and the path to the fragment file.
     """
-    files = datasets().fetch("colon_transverse.tar", processor = pooch.Untar())
+    files = datasets().fetch("colon_transverse.tar", progressbar=True, processor = pooch.Untar())
     return [(fl.split("/")[-1].split("_rep1_fragments")[0], Path(fl)) for fl in files]
 
 def cre_HEA() -> Path:

diff --git a/snapatac2-python/snapatac2/genome.py b/snapatac2-python/snapatac2/genome.py
@@ -8,10 +8,13 @@ def __init__(self, chrom_sizes, annotation_filename, fasta = None) -> None:
         self._fasta_filename = fasta
 
     def fetch_annotations(self):
-        return datasets().fetch(self._annotation_filename)
+        return datasets().fetch(self._annotation_filename, progressbar=True)
 
     def fetch_fasta(self):
-        return datasets().fetch(self._fasta_filename, processor = Decompress(method = "gzip"))
+        return datasets().fetch(
+            self._fasta_filename,
+            processor=Decompress(method = "gzip"),
+            progressbar=True)
 
 GRCh37 = Genome(
     {