From a90f162d01b748d94fc3ebba7674d82e2048ba81 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 4 Dec 2024 10:55:27 +0800 Subject: [PATCH] refactoring --- snapatac2-core/src/export.rs | 29 ++- .../count_data => feature_count}/coverage.rs | 6 +- .../count_data => feature_count}/matrix.rs | 5 +- snapatac2-core/src/feature_count/mod.rs | 5 + .../{preprocessing/count_data => }/genome.rs | 3 +- snapatac2-core/src/lib.rs | 217 ++++++++++++++++- snapatac2-core/src/network.rs | 2 +- .../src/preprocessing/count_data.rs | 218 ------------------ .../preprocessing/{count_data => }/import.rs | 48 ++-- snapatac2-core/src/preprocessing/mod.rs | 17 +- snapatac2-core/src/{ => utils}/knn.rs | 0 snapatac2-core/src/{utils.rs => utils/mod.rs} | 3 +- snapatac2-python/src/call_peaks.rs | 3 +- snapatac2-python/src/knn.rs | 2 +- snapatac2-python/src/network.rs | 2 +- snapatac2-python/src/preprocessing.rs | 25 +- snapatac2-python/src/utils.rs | 5 +- snapatac2-python/src/utils/anndata.rs | 5 +- 18 files changed, 302 insertions(+), 293 deletions(-) rename snapatac2-core/src/{preprocessing/count_data => feature_count}/coverage.rs (99%) rename snapatac2-core/src/{preprocessing/count_data => feature_count}/matrix.rs (98%) create mode 100644 snapatac2-core/src/feature_count/mod.rs rename snapatac2-core/src/{preprocessing/count_data => }/genome.rs (99%) delete mode 100644 snapatac2-core/src/preprocessing/count_data.rs rename snapatac2-core/src/preprocessing/{count_data => }/import.rs (91%) rename snapatac2-core/src/{ => utils}/knn.rs (100%) rename snapatac2-core/src/{utils.rs => utils/mod.rs} (97%) diff --git a/snapatac2-core/src/export.rs b/snapatac2-core/src/export.rs index 58a7ec67e..33bf00e6d 100644 --- a/snapatac2-core/src/export.rs +++ b/snapatac2-core/src/export.rs @@ -1,8 +1,7 @@ +use crate::genome::ChromSizes; +use crate::SnapData; use crate::{ - preprocessing::{ - count_data::{ChromSizes, SnapData}, - Fragment, - }, + preprocessing::Fragment, utils::{self, Compression}, }; @@ -571,7 +570,13 @@ mod tests { None, None, ); - assert_eq!(output, expected, "Left: {:?}\n\n{:?}", &output[..10], &expected[..10]); + assert_eq!( + output, + expected, + "Left: {:?}\n\n{:?}", + &output[..10], + &expected[..10] + ); let output = create_bedgraph_from_sorted_fragments( fragments.into_iter(), @@ -585,11 +590,7 @@ mod tests { None, None, ); - let scale_factor: f32 = expected - .iter() - .map(|x| x.value) - .sum::() - / 1e6; + let scale_factor: f32 = expected.iter().map(|x| x.value).sum::() / 1e6; expected = expected .into_iter() .map(|mut x| { @@ -597,7 +598,13 @@ mod tests { x }) .collect::>(); - assert_eq!(output, expected, "Left: {:?}\n\n{:?}", &output[..10], &expected[..10]); + assert_eq!( + output, + expected, + "Left: {:?}\n\n{:?}", + &output[..10], + &expected[..10] + ); } #[test] diff --git a/snapatac2-core/src/preprocessing/count_data/coverage.rs b/snapatac2-core/src/feature_count/coverage.rs similarity index 99% rename from snapatac2-core/src/preprocessing/count_data/coverage.rs rename to snapatac2-core/src/feature_count/coverage.rs index 6091eba85..63d0353f8 100644 --- a/snapatac2-core/src/preprocessing/count_data/coverage.rs +++ b/snapatac2-core/src/feature_count/coverage.rs @@ -1,5 +1,5 @@ +use crate::genome::{ChromSizes, FeatureCounter, GenomeBaseIndex}; use crate::preprocessing::{ - count_data::genome::{ChromSizes, FeatureCounter, GenomeBaseIndex}, Fragment, }; @@ -404,13 +404,13 @@ where CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap() } -pub struct ContactDataCounter { +pub struct ContactData { index: GenomeBaseIndex, coverage: I, resolution: usize, } -impl ContactDataCounter +impl ContactData where I: Iterator>, { diff --git a/snapatac2-core/src/preprocessing/count_data/matrix.rs b/snapatac2-core/src/feature_count/matrix.rs similarity index 98% rename from snapatac2-core/src/preprocessing/count_data/matrix.rs rename to snapatac2-core/src/feature_count/matrix.rs index ba2a3af2e..6f3b6279e 100644 --- a/snapatac2-core/src/preprocessing/count_data/matrix.rs +++ b/snapatac2-core/src/feature_count/matrix.rs @@ -1,6 +1,7 @@ use super::coverage::CountingStrategy; -use crate::preprocessing::count_data::{ - FeatureCounter, GeneCount, Promoters, SnapData, Transcript, TranscriptCount, +use crate::SnapData; +use crate::genome::{ + FeatureCounter, GeneCount, Promoters, Transcript, TranscriptCount, }; use anndata::{data::DataFrameIndex, AnnDataOp, ArrayData}; diff --git a/snapatac2-core/src/feature_count/mod.rs b/snapatac2-core/src/feature_count/mod.rs new file mode 100644 index 000000000..44347ad2e --- /dev/null +++ b/snapatac2-core/src/feature_count/mod.rs @@ -0,0 +1,5 @@ +mod matrix; +mod coverage; + +pub use coverage::{BaseData, FragmentData, ContactData, CountingStrategy, FragmentDataIter}; +pub use matrix::{create_gene_matrix, create_tile_matrix, create_peak_matrix}; \ No newline at end of file diff --git a/snapatac2-core/src/preprocessing/count_data/genome.rs b/snapatac2-core/src/genome.rs similarity index 99% rename from snapatac2-core/src/preprocessing/count_data/genome.rs rename to snapatac2-core/src/genome.rs index 87ed3a8ca..8535971e4 100644 --- a/snapatac2-core/src/preprocessing/count_data/genome.rs +++ b/snapatac2-core/src/genome.rs @@ -35,7 +35,8 @@ use polars::prelude::{NamedFrom, Series}; use rayon::iter::{IntoParallelIterator, ParallelIterator}; use std::ops::Range; -use super::{qc::Fragment, CountingStrategy}; +use crate::preprocessing::Fragment; +use crate::feature_count::CountingStrategy; /// Position is 1-based. #[derive(Clone, Debug, Eq, PartialEq)] diff --git a/snapatac2-core/src/lib.rs b/snapatac2-core/src/lib.rs index b88526205..426a841e0 100644 --- a/snapatac2-core/src/lib.rs +++ b/snapatac2-core/src/lib.rs @@ -1,7 +1,214 @@ -pub mod utils; +pub mod genome; pub mod preprocessing; -pub mod knn; -pub mod network; -pub mod motif; +pub mod feature_count; pub mod export; -pub mod embedding; \ No newline at end of file +pub mod motif; +pub mod network; +pub mod embedding; +pub mod utils; + +use feature_count::{BaseData, FragmentData, FragmentDataIter}; +use genome::{ChromSizes, ChromValueIter}; + +use anndata::{ + container::{ChunkedArrayElem, StackedChunkedArrayElem}, + ArrayElemOp, +}; +use anndata::{AnnData, AnnDataOp, AnnDataSet, AxisArraysOp, Backend, ElemCollectionOp}; +use anyhow::{bail, Context, Result}; +use bed_utils::bed::{map::GIntervalMap, GenomicRange}; +use nalgebra_sparse::CsrMatrix; +use ndarray::Array2; +use num::integer::div_ceil; +use polars::frame::DataFrame; +use preprocessing::{fraction_of_reads_in_region, fragment_size_distribution, TSSe, TssRegions}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use std::{ + str::FromStr, + sync::{Arc, Mutex}, +}; + +/// The `SnapData` trait represents an interface for reading and +/// manipulating single-cell assay data. It extends the `AnnDataOp` trait, +/// adding methods for reading chromosome sizes and genome-wide base-resolution coverage. +pub trait SnapData: AnnDataOp { + type CountIter: ExactSizeIterator, usize, usize)>; + + /// Return chromosome names and sizes. + fn read_chrom_sizes(&self) -> Result { + let df = self + .uns() + .get_item::("reference_sequences")? + .context("key 'reference_sequences' is not present in the '.uns'")?; + let chrs = df.column("reference_seq_name").unwrap().str()?; + let chr_sizes = df.column("reference_seq_length").unwrap().u64()?; + let res = chrs + .into_iter() + .flatten() + .map(|x| x.to_string()) + .zip(chr_sizes.into_iter().flatten()) + .collect(); + Ok(res) + } + + /// Read fragment data stored in the `.obsm` matrix. + fn get_fragment_iter(&self, chunk_size: usize) -> Result; + + /// Read base values stored in the `.obsm` matrix. + fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>>; + + /// Read counts stored in the `X` matrix. + fn read_chrom_values( + &self, + chunk_size: usize, + ) -> Result::X as ArrayElemOp>::ArrayIter>>> + { + let regions = self + .var_names() + .into_vec() + .into_iter() + .map(|x| GenomicRange::from_str(x.as_str()).unwrap()) + .collect(); + Ok(ChromValueIter { + regions, + iter: self.x().iter(chunk_size), + length: div_ceil(self.n_obs(), chunk_size), + }) + } + + /// QC metrics for the data. + + /// Compute TSS enrichment. + fn tss_enrichment<'a>(&self, promoter: &'a TssRegions) -> Result<(Vec, TSSe<'a>)> { + let library_tsse = Arc::new(Mutex::new(TSSe::new(promoter))); + let scores = self + .get_fragment_iter(2000)? + .into_fragments() + .flat_map(|(list_of_fragments, _, _)| { + list_of_fragments + .into_par_iter() + .map(|fragments| { + let mut tsse = TSSe::new(promoter); + fragments.into_iter().for_each(|x| tsse.add(&x)); + library_tsse.lock().unwrap().add_from(&tsse); + tsse.result().0 + }) + .collect::>() + }) + .collect(); + Ok(( + scores, + Arc::into_inner(library_tsse).unwrap().into_inner().unwrap(), + )) + } + + /// Compute the fragment size distribution. + fn fragment_size_distribution(&self, max_size: usize) -> Result>; + + /// Compute the fraction of reads in each region. + fn frip( + &self, + regions: &Vec>, + normalized: bool, + count_as_insertion: bool, + ) -> Result> { + let vec = fraction_of_reads_in_region( + self.get_fragment_iter(2000)?.into_fragments(), + regions, + normalized, + count_as_insertion, + ) + .map(|x| x.0) + .flatten() + .flatten() + .collect::>(); + Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into) + } + + fn genome_size(&self) -> Result { + Ok(self.read_chrom_sizes()?.total_size()) + } +} + +impl SnapData for AnnData { + type CountIter = ChunkedArrayElem>; + + fn get_fragment_iter(&self, chunk_size: usize) -> Result { + let obsm = self.obsm(); + let matrices: FragmentDataIter = if let Some(insertion) = + obsm.get_item_iter("fragment_single", chunk_size) + { + FragmentDataIter::FragmentSingle(Box::new(insertion)) + } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) { + FragmentDataIter::FragmentPaired(Box::new(fragment)) + } else { + bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'") + }; + Ok(FragmentData::new(self.read_chrom_sizes()?, matrices)) + } + + fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>> { + let obsm = self.obsm(); + if let Some(data) = obsm.get_item_iter::>("_values", chunk_size) { + Ok(BaseData::new(self.read_chrom_sizes()?, data)) + } else { + bail!("key '_values' is not present in the '.obsm'") + } + } + + fn fragment_size_distribution(&self, max_size: usize) -> Result> { + if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) { + Ok(fragment_size_distribution( + fragment.map(|x| x.0), + max_size, + )) + } else { + bail!("key 'fragment_paired' is not present in the '.obsm'") + } + } +} + +impl SnapData for AnnDataSet { + type CountIter = StackedChunkedArrayElem>; + + fn get_fragment_iter(&self, chunk_size: usize) -> Result { + let adatas = self.adatas().inner(); + let obsm = adatas.get_obsm(); + let matrices: FragmentDataIter = if let Some(insertion) = + obsm.get_item_iter("fragment_single", chunk_size) + { + FragmentDataIter::FragmentSingle(Box::new(insertion)) + } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) { + FragmentDataIter::FragmentPaired(Box::new(fragment)) + } else { + bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'") + }; + Ok(FragmentData::new(self.read_chrom_sizes()?, matrices)) + } + + fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>> + { + let obsm = self.obsm(); + if let Some(data) = obsm.get_item_iter::>("_values", chunk_size) { + Ok(BaseData::new(self.read_chrom_sizes()?, data)) + } else { + bail!("key '_values' is not present in the '.obsm'") + } + } + + fn fragment_size_distribution(&self, max_size: usize) -> Result> { + if let Some(fragment) = self + .adatas() + .inner() + .get_obsm() + .get_item_iter("fragment_paired", 500) + { + Ok(fragment_size_distribution( + fragment.map(|x| x.0), + max_size, + )) + } else { + bail!("key 'fragment_paired' is not present in the '.obsm'") + } + } +} \ No newline at end of file diff --git a/snapatac2-core/src/network.rs b/snapatac2-core/src/network.rs index 7f43a8b91..2f041c55b 100644 --- a/snapatac2-core/src/network.rs +++ b/snapatac2-core/src/network.rs @@ -1,4 +1,4 @@ -use crate::preprocessing::Promoters; +use crate::genome::Promoters; use bed_utils::bed::BEDLike; use std::collections::HashMap; diff --git a/snapatac2-core/src/preprocessing/count_data.rs b/snapatac2-core/src/preprocessing/count_data.rs deleted file mode 100644 index eb2de624d..000000000 --- a/snapatac2-core/src/preprocessing/count_data.rs +++ /dev/null @@ -1,218 +0,0 @@ -mod coverage; -mod genome; -mod import; -mod matrix; - -pub use crate::preprocessing::qc; -pub use coverage::{FragmentDataIter, ContactDataCounter, CountingStrategy, FragmentData, BaseData}; -pub use genome::{ - read_transcripts_from_gff, read_transcripts_from_gtf, ChromSizes, ChromValueIter, ChromValues, - FeatureCounter, GeneCount, GenomeBaseIndex, Promoters, Transcript, TranscriptCount, - TranscriptParserOptions, -}; -pub use import::{import_contacts, import_fragments, import_values, ChromValue}; -pub use matrix::{create_gene_matrix, create_peak_matrix, create_tile_matrix}; - -use anndata::{ - container::{ChunkedArrayElem, StackedChunkedArrayElem}, - ArrayElemOp, -}; -use anndata::{AnnData, AnnDataOp, AnnDataSet, AxisArraysOp, Backend, ElemCollectionOp}; -use anyhow::{bail, Context, Result}; -use bed_utils::bed::{map::GIntervalMap, GenomicRange}; -use nalgebra_sparse::CsrMatrix; -use ndarray::Array2; -use num::integer::div_ceil; -use polars::frame::DataFrame; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; -use std::{ - str::FromStr, - sync::{Arc, Mutex}, -}; - -use self::qc::TSSe; - -/// The `SnapData` trait represents an interface for reading and -/// manipulating single-cell assay data. It extends the `AnnDataOp` trait, -/// adding methods for reading chromosome sizes and genome-wide base-resolution coverage. -pub trait SnapData: AnnDataOp { - type CountIter: ExactSizeIterator, usize, usize)>; - - /// Return chromosome names and sizes. - fn read_chrom_sizes(&self) -> Result { - let df = self - .uns() - .get_item::("reference_sequences")? - .context("key 'reference_sequences' is not present in the '.uns'")?; - let chrs = df.column("reference_seq_name").unwrap().str()?; - let chr_sizes = df.column("reference_seq_length").unwrap().u64()?; - let res = chrs - .into_iter() - .flatten() - .map(|x| x.to_string()) - .zip(chr_sizes.into_iter().flatten()) - .collect(); - Ok(res) - } - - /// Read fragment data stored in the `.obsm` matrix. - fn get_fragment_iter(&self, chunk_size: usize) -> Result; - - /// Read base values stored in the `.obsm` matrix. - fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>>; - - /// Read counts stored in the `X` matrix. - fn read_chrom_values( - &self, - chunk_size: usize, - ) -> Result::X as ArrayElemOp>::ArrayIter>>> - { - let regions = self - .var_names() - .into_vec() - .into_iter() - .map(|x| GenomicRange::from_str(x.as_str()).unwrap()) - .collect(); - Ok(ChromValueIter { - regions, - iter: self.x().iter(chunk_size), - length: div_ceil(self.n_obs(), chunk_size), - }) - } - - /// QC metrics for the data. - - /// Compute TSS enrichment. - fn tss_enrichment<'a>(&self, promoter: &'a qc::TssRegions) -> Result<(Vec, TSSe<'a>)> { - let library_tsse = Arc::new(Mutex::new(qc::TSSe::new(promoter))); - let scores = self - .get_fragment_iter(2000)? - .into_fragments() - .flat_map(|(list_of_fragments, _, _)| { - list_of_fragments - .into_par_iter() - .map(|fragments| { - let mut tsse = qc::TSSe::new(promoter); - fragments.into_iter().for_each(|x| tsse.add(&x)); - library_tsse.lock().unwrap().add_from(&tsse); - tsse.result().0 - }) - .collect::>() - }) - .collect(); - Ok(( - scores, - Arc::into_inner(library_tsse).unwrap().into_inner().unwrap(), - )) - } - - /// Compute the fragment size distribution. - fn fragment_size_distribution(&self, max_size: usize) -> Result>; - - /// Compute the fraction of reads in each region. - fn frip( - &self, - regions: &Vec>, - normalized: bool, - count_as_insertion: bool, - ) -> Result> { - let vec = qc::fraction_of_reads_in_region( - self.get_fragment_iter(2000)?.into_fragments(), - regions, - normalized, - count_as_insertion, - ) - .map(|x| x.0) - .flatten() - .flatten() - .collect::>(); - Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into) - } - - fn genome_size(&self) -> Result { - Ok(self.read_chrom_sizes()?.total_size()) - } -} - -impl SnapData for AnnData { - type CountIter = ChunkedArrayElem>; - - fn get_fragment_iter(&self, chunk_size: usize) -> Result { - let obsm = self.obsm(); - let matrices: FragmentDataIter = if let Some(insertion) = - obsm.get_item_iter("fragment_single", chunk_size) - { - FragmentDataIter::FragmentSingle(Box::new(insertion)) - } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) { - FragmentDataIter::FragmentPaired(Box::new(fragment)) - } else { - bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'") - }; - Ok(FragmentData::new(self.read_chrom_sizes()?, matrices)) - } - - fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>> { - let obsm = self.obsm(); - if let Some(data) = obsm.get_item_iter::>("_values", chunk_size) { - Ok(BaseData::new(self.read_chrom_sizes()?, data)) - } else { - bail!("key '_values' is not present in the '.obsm'") - } - } - - fn fragment_size_distribution(&self, max_size: usize) -> Result> { - if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) { - Ok(qc::fragment_size_distribution( - fragment.map(|x| x.0), - max_size, - )) - } else { - bail!("key 'fragment_paired' is not present in the '.obsm'") - } - } -} - -impl SnapData for AnnDataSet { - type CountIter = StackedChunkedArrayElem>; - - fn get_fragment_iter(&self, chunk_size: usize) -> Result { - let adatas = self.adatas().inner(); - let obsm = adatas.get_obsm(); - let matrices: FragmentDataIter = if let Some(insertion) = - obsm.get_item_iter("fragment_single", chunk_size) - { - FragmentDataIter::FragmentSingle(Box::new(insertion)) - } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) { - FragmentDataIter::FragmentPaired(Box::new(fragment)) - } else { - bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'") - }; - Ok(FragmentData::new(self.read_chrom_sizes()?, matrices)) - } - - fn get_base_iter(&self, chunk_size: usize) -> Result, usize, usize)>>> - { - let obsm = self.obsm(); - if let Some(data) = obsm.get_item_iter::>("_values", chunk_size) { - Ok(BaseData::new(self.read_chrom_sizes()?, data)) - } else { - bail!("key '_values' is not present in the '.obsm'") - } - } - - fn fragment_size_distribution(&self, max_size: usize) -> Result> { - if let Some(fragment) = self - .adatas() - .inner() - .get_obsm() - .get_item_iter("fragment_paired", 500) - { - Ok(qc::fragment_size_distribution( - fragment.map(|x| x.0), - max_size, - )) - } else { - bail!("key 'fragment_paired' is not present in the '.obsm'") - } - } -} diff --git a/snapatac2-core/src/preprocessing/count_data/import.rs b/snapatac2-core/src/preprocessing/import.rs similarity index 91% rename from snapatac2-core/src/preprocessing/count_data/import.rs rename to snapatac2-core/src/preprocessing/import.rs index face3dcb3..b704b52bb 100644 --- a/snapatac2-core/src/preprocessing/count_data/import.rs +++ b/snapatac2-core/src/preprocessing/import.rs @@ -1,7 +1,6 @@ -use crate::preprocessing::{ - count_data::{ChromSizes, GenomeBaseIndex}, - qc::{Contact, Fragment, FragmentSummary, QualityControl}, -}; +use crate::feature_count::ContactData; +use crate::genome::{ChromSizes, GenomeBaseIndex}; +use crate::preprocessing::qc::{Contact, Fragment, FragmentSummary, QualityControl}; use anndata::{ data::array::utils::{from_csr_data, to_csr_data}, @@ -290,7 +289,7 @@ where let (r, c, offset, ind, data) = to_csr_data(counts, genome_size * genome_size); CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap() }); - let contact_map = super::ContactDataCounter::new(chrom_sizes, binding3).with_resolution(bin_size); + let contact_map = ContactData::new(chrom_sizes, binding3).with_resolution(bin_size); anndata.set_x_from_iter(contact_map.into_values::())?; anndata.set_var_names(anndata.n_vars().into())?; @@ -350,25 +349,30 @@ where .chunks(chunk_size); let arrays = chunked_values.into_iter().map(|chunk| { // Collect into vector for parallel processing - let chunk: Vec> = chunk.map(|(barcode, x)| { - if !scanned_barcodes.insert(barcode.clone()) { - panic!("Please sort fragment file by barcodes"); - } - x.collect() - }).collect(); + let chunk: Vec> = chunk + .map(|(barcode, x)| { + if !scanned_barcodes.insert(barcode.clone()) { + panic!("Please sort fragment file by barcodes"); + } + x.collect() + }) + .collect(); let counts = chunk .into_par_iter() .map(|data| { - let mut count = data.into_iter().flat_map(|value| { - let chrom = &value.chrom; - if genome_index.contain_chrom(chrom) { - let pos = genome_index.get_position_rev(chrom, value.pos); - Some((pos, value.value)) - } else { - None - } - }).collect::>(); + let mut count = data + .into_iter() + .flat_map(|value| { + let chrom = &value.chrom; + if genome_index.contain_chrom(chrom) { + let pos = genome_index.get_position_rev(chrom, value.pos); + Some((pos, value.value)) + } else { + None + } + }) + .collect::>(); count.sort_by(|x, y| x.0.cmp(&y.0)); count }) @@ -378,7 +382,9 @@ where }); anndata.obsm().add_iter(obsm_key, arrays)?; - anndata.uns().add("reference_sequences", chrom_sizes.to_dataframe())?; + anndata + .uns() + .add("reference_sequences", chrom_sizes.to_dataframe())?; anndata.set_obs_names(scanned_barcodes.into_iter().collect())?; Ok(()) } diff --git a/snapatac2-core/src/preprocessing/mod.rs b/snapatac2-core/src/preprocessing/mod.rs index 1020b55a2..75d6387dc 100644 --- a/snapatac2-core/src/preprocessing/mod.rs +++ b/snapatac2-core/src/preprocessing/mod.rs @@ -1,14 +1,11 @@ -pub mod bam; -pub mod barcode; -pub mod count_data; -pub mod qc; +mod bam; +mod barcode; +mod import; +mod qc; pub use bam::{make_fragment_file, BamQC, FlagStat}; -pub use count_data::{ - create_gene_matrix, create_peak_matrix, create_tile_matrix, import_contacts, import_fragments, - import_values, read_transcripts_from_gff, read_transcripts_from_gtf, ChromValue, ContactDataCounter, - Promoters, SnapData, Transcript, -}; +pub use import::{import_contacts, import_fragments, import_values, ChromValue}; pub use qc::{ - get_barcode_count, make_promoter_map, read_tss, CellBarcode, Contact, Fragment, TssRegions, + get_barcode_count, make_promoter_map, read_tss, CellBarcode, Contact, Fragment, TSSe, + TssRegions, fraction_of_reads_in_region, fragment_size_distribution, }; diff --git a/snapatac2-core/src/knn.rs b/snapatac2-core/src/utils/knn.rs similarity index 100% rename from snapatac2-core/src/knn.rs rename to snapatac2-core/src/utils/knn.rs diff --git a/snapatac2-core/src/utils.rs b/snapatac2-core/src/utils/mod.rs similarity index 97% rename from snapatac2-core/src/utils.rs rename to snapatac2-core/src/utils/mod.rs index 69aaafdc7..9e7df52fd 100644 --- a/snapatac2-core/src/utils.rs +++ b/snapatac2-core/src/utils/mod.rs @@ -1,4 +1,5 @@ pub mod similarity; +pub mod knn; use std::path::Path; use std::fs::File; @@ -40,7 +41,7 @@ where merge_sorted_bed_with(input, iterative_merge) } -pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::preprocessing::count_data::ChromSizes) -> NarrowPeak { +pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::genome::ChromSizes) -> NarrowPeak { let chr = peak.chrom(); let max_len = chrom_sizes.get(chr).expect(&format!("Size missing for chromosome: {}", chr)); let new_start = peak.start().max(0).min(max_len); diff --git a/snapatac2-python/src/call_peaks.rs b/snapatac2-python/src/call_peaks.rs index 7686c2d10..1e7f6b0b0 100644 --- a/snapatac2-python/src/call_peaks.rs +++ b/snapatac2-python/src/call_peaks.rs @@ -6,7 +6,8 @@ use indicatif::{ProgressIterator, ProgressStyle}; use itertools::Itertools; use snapatac2_core::utils::{self, Compression}; use snapatac2_core::{ - preprocessing::{Fragment, SnapData}, + SnapData, + preprocessing::Fragment, utils::{clip_peak, merge_peaks, open_file_for_write}, }; diff --git a/snapatac2-python/src/knn.rs b/snapatac2-python/src/knn.rs index e3b0a4b77..c9f73ccb2 100644 --- a/snapatac2-python/src/knn.rs +++ b/snapatac2-python/src/knn.rs @@ -2,7 +2,7 @@ use anndata::ArrayData; use pyanndata::data::PyArrayData; use pyo3::{prelude::*, PyResult}; use numpy::{PyReadonlyArray, Ix2}; -use snapatac2_core::knn; +use snapatac2_core::utils::knn; #[pyfunction] pub(crate) fn nearest_neighbour_graph( diff --git a/snapatac2-python/src/network.rs b/snapatac2-python/src/network.rs index 0b56f7991..0ec6276a3 100644 --- a/snapatac2-python/src/network.rs +++ b/snapatac2-python/src/network.rs @@ -4,7 +4,7 @@ use pyo3::prelude::*; use snapatac2_core::{ network::link_region_to_promoter, - preprocessing::Promoters, + genome::Promoters, }; use bed_utils::bed::GenomicRange; use std::{ diff --git a/snapatac2-python/src/preprocessing.rs b/snapatac2-python/src/preprocessing.rs index 8b64f4867..21b3e5308 100644 --- a/snapatac2-python/src/preprocessing.rs +++ b/snapatac2-python/src/preprocessing.rs @@ -4,9 +4,6 @@ use itertools::Itertools; use pyo3::{prelude::*, pybacked::PyBackedStr}; use anndata::Backend; use anndata_hdf5::H5; -use snapatac2_core::preprocessing::count_data::TranscriptParserOptions; -use snapatac2_core::preprocessing::count_data::CountingStrategy; -use snapatac2_core::preprocessing::ChromValue; use std::collections::HashMap; use std::io::{BufRead, BufReader}; use std::path::PathBuf; @@ -17,8 +14,12 @@ use pyanndata::PyAnnData; use anyhow::Result; use snapatac2_core::{ - preprocessing::{Fragment, Contact, SnapData}, - preprocessing, utils, + SnapData, + genome::TranscriptParserOptions, + feature_count::{create_gene_matrix, create_tile_matrix, create_peak_matrix, CountingStrategy}, + preprocessing::{Fragment, Contact, ChromValue}, + preprocessing, + utils, }; #[pyfunction] @@ -93,7 +94,7 @@ pub(crate) fn import_fragments( }; let chrom_sizes = chrom_size.into_iter().collect(); let fragments = bed::io::Reader::new(utils::open_file_for_read(&fragment_file), Some("#".to_string())) - .into_records::().map(|f| { + .into_records().map(|f| { let mut f = f.unwrap(); shift_fragment(&mut f, shift_left, shift_right); f @@ -253,7 +254,7 @@ pub(crate) fn mk_tile_matrix( if let Some(out) = out { macro_rules! run2 { ($out_data:expr) => { - preprocessing::create_tile_matrix( + create_tile_matrix( $data, bin_size, chunk_size, @@ -267,7 +268,7 @@ pub(crate) fn mk_tile_matrix( } crate::with_anndata!(&out, run2); } else { - preprocessing::create_tile_matrix( + create_tile_matrix( $data, bin_size, chunk_size, @@ -306,13 +307,13 @@ pub(crate) fn mk_peak_matrix( if let Some(out) = out { macro_rules! run2 { ($out_data:expr) => { - preprocessing::create_peak_matrix($data, peaks, chunk_size, strategy, + create_peak_matrix($data, peaks, chunk_size, strategy, min_fragment_size, max_fragment_size, Some($out_data), use_x)? }; } crate::with_anndata!(&out, run2); } else { - preprocessing::create_peak_matrix($data, peaks, chunk_size, strategy, + create_peak_matrix($data, peaks, chunk_size, strategy, min_fragment_size, max_fragment_size, None::<&PyAnnData>, use_x)?; } } @@ -354,14 +355,14 @@ pub(crate) fn mk_gene_matrix( if let Some(out) = out { macro_rules! run2 { ($out_data:expr) => { - preprocessing::create_gene_matrix($data, transcripts, id_type, + create_gene_matrix($data, transcripts, id_type, upstream, downstream, include_gene_body, chunk_size, strategy, min_fragment_size, max_fragment_size, Some($out_data), use_x)? }; } crate::with_anndata!(&out, run2); } else { - preprocessing::create_gene_matrix($data, transcripts, id_type, + create_gene_matrix($data, transcripts, id_type, upstream, downstream, include_gene_body, chunk_size, strategy, min_fragment_size, max_fragment_size, None::<&PyAnnData>, use_x)?; } diff --git a/snapatac2-python/src/utils.rs b/snapatac2-python/src/utils.rs index ba1156507..b528cdd95 100644 --- a/snapatac2-python/src/utils.rs +++ b/snapatac2-python/src/utils.rs @@ -9,9 +9,8 @@ use numpy::{ Element, IntoPyArray, Ix1, Ix2, PyArray, PyArrayMethods, PyReadonlyArray, PyReadonlyArrayDyn, }; use pyo3::{prelude::*, types::PyIterator, PyResult, Python}; -use snapatac2_core::preprocessing::count_data::TranscriptParserOptions; -use snapatac2_core::preprocessing::{ - read_transcripts_from_gff, read_transcripts_from_gtf, Transcript, +use snapatac2_core::genome::{ + read_transcripts_from_gff, read_transcripts_from_gtf, Transcript, TranscriptParserOptions, }; use snapatac2_core::utils; diff --git a/snapatac2-python/src/utils/anndata.rs b/snapatac2-python/src/utils/anndata.rs index c5f3f85de..959474357 100644 --- a/snapatac2-python/src/utils/anndata.rs +++ b/snapatac2-python/src/utils/anndata.rs @@ -10,7 +10,8 @@ use pyanndata::anndata::memory; use pyanndata::{AnnData, AnnDataSet}; use pyo3::prelude::*; -use snapatac2_core::preprocessing::{count_data::{BaseData, FragmentData, FragmentDataIter}, qc, SnapData}; +use snapatac2_core::SnapData; +use snapatac2_core::feature_count::{BaseData, FragmentData, FragmentDataIter}; pub struct PyAnnData<'py>(memory::PyAnnData<'py>); @@ -166,7 +167,7 @@ impl<'py> SnapData for PyAnnData<'py> { fn fragment_size_distribution(&self, max_size: usize) -> Result> { if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) { - Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size)) + Ok(snapatac2_core::preprocessing::fragment_size_distribution(fragment.map(|x| x.0), max_size)) } else { bail!("key 'fragment_paired' is not present in the '.obsm'") }