diff --git a/.gitignore b/.gitignore index 8a43336f..64d1be4e 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,7 @@ _autosummary # h5ad *.h5ad -*.h5ads \ No newline at end of file +*.h5ads +*.bdg +*.bam +*.bai \ No newline at end of file diff --git a/snapatac2-core/src/feature_count/mod.rs b/snapatac2-core/src/feature_count/mod.rs index 50a740a5..c229b3e2 100644 --- a/snapatac2-core/src/feature_count/mod.rs +++ b/snapatac2-core/src/feature_count/mod.rs @@ -16,6 +16,10 @@ use polars::frame::DataFrame; use crate::genome::ChromSizes; +pub const FRAGMENT_SINGLE: &str = "fragment_single"; +pub const FRAGMENT_PAIRED: &str = "fragment_paired"; +pub const BASE_VALUE: &str = "_values"; + /// The `SnapData` trait represents an interface for reading and /// manipulating single-cell assay data. It extends the `AnnDataOp` trait, /// adding methods for reading chromosome sizes and genome-wide base-resolution coverage. @@ -70,20 +74,20 @@ impl SnapData for AnnData { fn get_fragment_iter(&self, chunk_size: usize) -> Result { let obsm = self.obsm(); let matrices: FragmentDataIter = if let Some(insertion) = - obsm.get_item_iter("fragment_single", chunk_size) + obsm.get_item_iter(FRAGMENT_SINGLE, chunk_size) { FragmentDataIter::FragmentSingle(Box::new(insertion)) } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) { FragmentDataIter::FragmentPaired(Box::new(fragment)) } else { - bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'") + bail!("one of the following keys must be present in the '.obsm': '{}', '{}'", FRAGMENT_SINGLE, FRAGMENT_PAIRED) }; Ok(FragmentData::new(self.read_chrom_sizes()?, matrices)) } fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>> { let obsm = self.obsm(); - if let Some(data) = obsm.get_item_iter::>("_values", chunk_size) { + if let Some(data) = obsm.get_item_iter::>(BASE_VALUE, chunk_size) { Ok(BaseData::new(self.read_chrom_sizes()?, data)) } else { bail!("key '_values' is not present in the '.obsm'") @@ -96,13 +100,13 @@ impl SnapData for AnnDataSet { let adatas = self.adatas().inner(); let obsm = adatas.get_obsm(); let matrices: FragmentDataIter = if let Some(insertion) = - obsm.get_item_iter("fragment_single", chunk_size) + obsm.get_item_iter(FRAGMENT_SINGLE, chunk_size) { FragmentDataIter::FragmentSingle(Box::new(insertion)) } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) { FragmentDataIter::FragmentPaired(Box::new(fragment)) } else { - bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'") + bail!("one of the following keys must be present in the '.obsm': '{}', '{}'", FRAGMENT_SINGLE, FRAGMENT_PAIRED) }; Ok(FragmentData::new(self.read_chrom_sizes()?, matrices)) } @@ -110,7 +114,7 @@ impl SnapData for AnnDataSet { fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>> { let obsm = self.obsm(); - if let Some(data) = obsm.get_item_iter::>("_values", chunk_size) { + if let Some(data) = obsm.get_item_iter::>(BASE_VALUE, chunk_size) { Ok(BaseData::new(self.read_chrom_sizes()?, data)) } else { bail!("key '_values' is not present in the '.obsm'") diff --git a/snapatac2-core/src/preprocessing/import.rs b/snapatac2-core/src/preprocessing/import.rs index 814b3e0b..8105804f 100644 --- a/snapatac2-core/src/preprocessing/import.rs +++ b/snapatac2-core/src/preprocessing/import.rs @@ -1,4 +1,4 @@ -use crate::feature_count::ContactData; +use crate::feature_count::{ContactData, BASE_VALUE, FRAGMENT_PAIRED, FRAGMENT_SINGLE}; use crate::genome::{ChromSizes, GenomeBaseIndex}; use crate::preprocessing::qc::{Contact, Fragment, FragmentSummary, FragmentQC}; @@ -53,9 +53,9 @@ where false }; let obsm_key = if is_paired { - "fragment_paired" + FRAGMENT_PAIRED } else { - "fragment_single" + FRAGMENT_SINGLE }; let genome_index = GenomeBaseIndex::new(chrom_sizes); @@ -336,7 +336,6 @@ where ) .unwrap(), ); - let obsm_key = "_values"; let genome_index = GenomeBaseIndex::new(chrom_sizes); let genome_size = genome_index.len(); @@ -381,7 +380,7 @@ where CsrMatrix::try_from_csr_data(r, c, offset, ind, csr_data).unwrap() }); - anndata.obsm().add_iter(obsm_key, arrays)?; + anndata.obsm().add_iter(BASE_VALUE, arrays)?; anndata .uns() .add("reference_sequences", chrom_sizes.to_dataframe())?; diff --git a/snapatac2-core/src/preprocessing/qc.rs b/snapatac2-core/src/preprocessing/qc.rs index c77fd6f1..69c832be 100644 --- a/snapatac2-core/src/preprocessing/qc.rs +++ b/snapatac2-core/src/preprocessing/qc.rs @@ -15,6 +15,65 @@ use crate::feature_count::{FragmentDataIter, SnapData}; pub type CellBarcode = String; +pub trait QualityControl: SnapData { + /// [ATAC QC] Compute TSS enrichment. + fn tss_enrichment<'a>(&self, promoter: &'a TssRegions) -> Result<(Vec, TSSe<'a>)> { + let library_tsse = Arc::new(Mutex::new(TSSe::new(promoter))); + let scores = self + .get_fragment_iter(2000)? + .into_fragments() + .flat_map(|(list_of_fragments, _, _)| { + list_of_fragments + .into_par_iter() + .map(|fragments| { + let mut tsse = TSSe::new(promoter); + fragments.into_iter().for_each(|x| tsse.add(&x)); + library_tsse.lock().unwrap().add_from(&tsse); + tsse.result().0 + }) + .collect::>() + }) + .collect(); + Ok(( + scores, + Arc::into_inner(library_tsse).unwrap().into_inner().unwrap(), + )) + } + + /// [ATAC QC] Compute the fragment size distribution. + fn fragment_size_distribution(&self, max_size: usize) -> Result> { + if let FragmentDataIter::FragmentPaired(fragments) = + self.get_fragment_iter(500)?.into_inner() + { + Ok(fragment_size_distribution(fragments.map(|x| x.0), max_size)) + } else { + bail!("key 'fragment_paired' is not present in the '.obsm'") + } + } + + /// [ATAC QC] Compute the fraction of reads in each region. + fn frip( + &self, + regions: &Vec>, + normalized: bool, + count_as_insertion: bool, + ) -> Result> { + let vec = fraction_of_reads_in_region( + self.get_fragment_iter(2000)?.into_fragments(), + regions, + normalized, + count_as_insertion, + ) + .map(|x| x.0) + .flatten() + .flatten() + .collect::>(); + Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into) + } +} + +impl QualityControl for T {} + /// Fragments from single-cell ATAC-seq experiment. Each fragment is represented /// by a genomic coordinate, cell barcode and a integer value. #[derive(Serialize, Deserialize, Debug, Clone)] @@ -501,63 +560,4 @@ where .collect::>(); (frac, start, end) }) -} - -pub trait QualityControl: SnapData { - /// Compute TSS enrichment. - fn tss_enrichment<'a>(&self, promoter: &'a TssRegions) -> Result<(Vec, TSSe<'a>)> { - let library_tsse = Arc::new(Mutex::new(TSSe::new(promoter))); - let scores = self - .get_fragment_iter(2000)? - .into_fragments() - .flat_map(|(list_of_fragments, _, _)| { - list_of_fragments - .into_par_iter() - .map(|fragments| { - let mut tsse = TSSe::new(promoter); - fragments.into_iter().for_each(|x| tsse.add(&x)); - library_tsse.lock().unwrap().add_from(&tsse); - tsse.result().0 - }) - .collect::>() - }) - .collect(); - Ok(( - scores, - Arc::into_inner(library_tsse).unwrap().into_inner().unwrap(), - )) - } - - /// Compute the fragment size distribution. - fn fragment_size_distribution(&self, max_size: usize) -> Result> { - if let FragmentDataIter::FragmentPaired(fragments) = - self.get_fragment_iter(500)?.into_inner() - { - Ok(fragment_size_distribution(fragments.map(|x| x.0), max_size)) - } else { - bail!("key 'fragment_paired' is not present in the '.obsm'") - } - } - - /// Compute the fraction of reads in each region. - fn frip( - &self, - regions: &Vec>, - normalized: bool, - count_as_insertion: bool, - ) -> Result> { - let vec = fraction_of_reads_in_region( - self.get_fragment_iter(2000)?.into_fragments(), - regions, - normalized, - count_as_insertion, - ) - .map(|x| x.0) - .flatten() - .flatten() - .collect::>(); - Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into) - } -} - -impl QualityControl for T {} +} \ No newline at end of file diff --git a/snapatac2-python/src/utils/anndata.rs b/snapatac2-python/src/utils/anndata.rs index 104f5ab7..e89a8f74 100644 --- a/snapatac2-python/src/utils/anndata.rs +++ b/snapatac2-python/src/utils/anndata.rs @@ -10,7 +10,7 @@ use pyanndata::anndata::memory; use pyanndata::{AnnData, AnnDataSet}; use pyo3::prelude::*; -use snapatac2_core::SnapData; +use snapatac2_core::{feature_count::{BASE_VALUE, FRAGMENT_PAIRED, FRAGMENT_SINGLE}, SnapData}; use snapatac2_core::feature_count::{BaseData, FragmentData, FragmentDataIter}; pub struct PyAnnData<'py>(memory::PyAnnData<'py>); @@ -144,19 +144,19 @@ impl<'py> SnapData for PyAnnData<'py> { fn get_fragment_iter(&self, chunk_size: usize) -> Result { let obsm = self.obsm(); let matrices: FragmentDataIter = - if let Some(insertion) = obsm.get_item_iter("fragment_single", chunk_size) { + if let Some(insertion) = obsm.get_item_iter(FRAGMENT_SINGLE, chunk_size) { FragmentDataIter::FragmentSingle(Box::new(insertion)) - } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) { + } else if let Some(fragment) = obsm.get_item_iter(FRAGMENT_SINGLE, chunk_size) { FragmentDataIter::FragmentPaired(Box::new(fragment)) } else { - bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'") + bail!("one of the following keys must be present in the '.obsm': '{}', '{}'", FRAGMENT_SINGLE, FRAGMENT_PAIRED) }; Ok(FragmentData::new(self.read_chrom_sizes()?, matrices)) } fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>> { let obsm = self.obsm(); - if let Some(data) = obsm.get_item_iter::>("_values", chunk_size) { + if let Some(data) = obsm.get_item_iter::>(BASE_VALUE, chunk_size) { Ok(BaseData::new(self.read_chrom_sizes()?, data)) } else { bail!("key '_values' is not present in the '.obsm'")