From a90f162d01b748d94fc3ebba7674d82e2048ba81 Mon Sep 17 00:00:00 2001
From: Kai Zhang <zhangkai33@westlake.edu.cn>
Date: Wed, 4 Dec 2024 10:55:27 +0800
Subject: [PATCH] refactoring

---
 snapatac2-core/src/export.rs                  |  29 ++-
 .../count_data => feature_count}/coverage.rs  |   6 +-
 .../count_data => feature_count}/matrix.rs    |   5 +-
 snapatac2-core/src/feature_count/mod.rs       |   5 +
 .../{preprocessing/count_data => }/genome.rs  |   3 +-
 snapatac2-core/src/lib.rs                     | 217 ++++++++++++++++-
 snapatac2-core/src/network.rs                 |   2 +-
 .../src/preprocessing/count_data.rs           | 218 ------------------
 .../preprocessing/{count_data => }/import.rs  |  48 ++--
 snapatac2-core/src/preprocessing/mod.rs       |  17 +-
 snapatac2-core/src/{ => utils}/knn.rs         |   0
 snapatac2-core/src/{utils.rs => utils/mod.rs} |   3 +-
 snapatac2-python/src/call_peaks.rs            |   3 +-
 snapatac2-python/src/knn.rs                   |   2 +-
 snapatac2-python/src/network.rs               |   2 +-
 snapatac2-python/src/preprocessing.rs         |  25 +-
 snapatac2-python/src/utils.rs                 |   5 +-
 snapatac2-python/src/utils/anndata.rs         |   5 +-
 18 files changed, 302 insertions(+), 293 deletions(-)
 rename snapatac2-core/src/{preprocessing/count_data => feature_count}/coverage.rs (99%)
 rename snapatac2-core/src/{preprocessing/count_data => feature_count}/matrix.rs (98%)
 create mode 100644 snapatac2-core/src/feature_count/mod.rs
 rename snapatac2-core/src/{preprocessing/count_data => }/genome.rs (99%)
 delete mode 100644 snapatac2-core/src/preprocessing/count_data.rs
 rename snapatac2-core/src/preprocessing/{count_data => }/import.rs (91%)
 rename snapatac2-core/src/{ => utils}/knn.rs (100%)
 rename snapatac2-core/src/{utils.rs => utils/mod.rs} (97%)
diff --git a/snapatac2-core/src/export.rs b/snapatac2-core/src/export.rs
index 58a7ec67e..33bf00e6d 100644
--- a/snapatac2-core/src/export.rs
+++ b/snapatac2-core/src/export.rs
@@ -1,8 +1,7 @@
+use crate::genome::ChromSizes;
+use crate::SnapData;
 use crate::{
-    preprocessing::{
-        count_data::{ChromSizes, SnapData},
-        Fragment,
-    },
+    preprocessing::Fragment,
     utils::{self, Compression},
 };
 
@@ -571,7 +570,13 @@ mod tests {
             None,
             None,
         );
-        assert_eq!(output, expected, "Left: {:?}\n\n{:?}", &output[..10], &expected[..10]);
+        assert_eq!(
+            output,
+            expected,
+            "Left: {:?}\n\n{:?}",
+            &output[..10],
+            &expected[..10]
+        );
 
         let output = create_bedgraph_from_sorted_fragments(
             fragments.into_iter(),
@@ -585,11 +590,7 @@ mod tests {
             None,
             None,
         );
-        let scale_factor: f32 = expected
-            .iter()
-            .map(|x| x.value)
-            .sum::<f32>()
-            / 1e6;
+        let scale_factor: f32 = expected.iter().map(|x| x.value).sum::<f32>() / 1e6;
         expected = expected
             .into_iter()
             .map(|mut x| {
@@ -597,7 +598,13 @@ mod tests {
                 x
             })
             .collect::<Vec<_>>();
-        assert_eq!(output, expected, "Left: {:?}\n\n{:?}", &output[..10], &expected[..10]);
+        assert_eq!(
+            output,
+            expected,
+            "Left: {:?}\n\n{:?}",
+            &output[..10],
+            &expected[..10]
+        );
     }
 
     #[test]
diff --git a/snapatac2-core/src/preprocessing/count_data/coverage.rs b/snapatac2-core/src/feature_count/coverage.rs
similarity index 99%
rename from snapatac2-core/src/preprocessing/count_data/coverage.rs
rename to snapatac2-core/src/feature_count/coverage.rs
index 6091eba85..63d0353f8 100644
--- a/snapatac2-core/src/preprocessing/count_data/coverage.rs
+++ b/snapatac2-core/src/feature_count/coverage.rs
@@ -1,5 +1,5 @@
+use crate::genome::{ChromSizes, FeatureCounter, GenomeBaseIndex};
 use crate::preprocessing::{
-    count_data::genome::{ChromSizes, FeatureCounter, GenomeBaseIndex},
     Fragment,
 };
 
@@ -404,13 +404,13 @@ where
     CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap()
 }
 
-pub struct ContactDataCounter<I> {
+pub struct ContactData<I> {
     index: GenomeBaseIndex,
     coverage: I,
     resolution: usize,
 }
 
-impl<I> ContactDataCounter<I>
+impl<I> ContactData<I>
 where
     I: Iterator<Item = CsrMatrix<u32>>,
 {
diff --git a/snapatac2-core/src/preprocessing/count_data/matrix.rs b/snapatac2-core/src/feature_count/matrix.rs
similarity index 98%
rename from snapatac2-core/src/preprocessing/count_data/matrix.rs
rename to snapatac2-core/src/feature_count/matrix.rs
index ba2a3af2e..6f3b6279e 100644
--- a/snapatac2-core/src/preprocessing/count_data/matrix.rs
+++ b/snapatac2-core/src/feature_count/matrix.rs
@@ -1,6 +1,7 @@
 use super::coverage::CountingStrategy;
-use crate::preprocessing::count_data::{
-    FeatureCounter, GeneCount, Promoters, SnapData, Transcript, TranscriptCount,
+use crate::SnapData;
+use crate::genome::{
+    FeatureCounter, GeneCount, Promoters, Transcript, TranscriptCount,
 };
 
 use anndata::{data::DataFrameIndex, AnnDataOp, ArrayData};
diff --git a/snapatac2-core/src/feature_count/mod.rs b/snapatac2-core/src/feature_count/mod.rs
new file mode 100644
index 000000000..44347ad2e
--- /dev/null
+++ b/snapatac2-core/src/feature_count/mod.rs
@@ -0,0 +1,5 @@
+mod matrix;
+mod coverage;
+
+pub use coverage::{BaseData, FragmentData, ContactData, CountingStrategy, FragmentDataIter};
+pub use matrix::{create_gene_matrix, create_tile_matrix, create_peak_matrix};
\ No newline at end of file
diff --git a/snapatac2-core/src/preprocessing/count_data/genome.rs b/snapatac2-core/src/genome.rs
similarity index 99%
rename from snapatac2-core/src/preprocessing/count_data/genome.rs
rename to snapatac2-core/src/genome.rs
index 87ed3a8ca..8535971e4 100644
--- a/snapatac2-core/src/preprocessing/count_data/genome.rs
+++ b/snapatac2-core/src/genome.rs
@@ -35,7 +35,8 @@ use polars::prelude::{NamedFrom, Series};
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::ops::Range;
 
-use super::{qc::Fragment, CountingStrategy};
+use crate::preprocessing::Fragment;
+use crate::feature_count::CountingStrategy;
 
 /// Position is 1-based.
 #[derive(Clone, Debug, Eq, PartialEq)]
diff --git a/snapatac2-core/src/lib.rs b/snapatac2-core/src/lib.rs
index b88526205..426a841e0 100644
--- a/snapatac2-core/src/lib.rs
+++ b/snapatac2-core/src/lib.rs
@@ -1,7 +1,214 @@
-pub mod utils;
+pub mod genome;
 pub mod preprocessing;
-pub mod knn;
-pub mod network;
-pub mod motif;
+pub mod feature_count;
 pub mod export;
-pub mod embedding;
\ No newline at end of file
+pub mod motif;
+pub mod network;
+pub mod embedding;
+pub mod utils;
+
+use feature_count::{BaseData, FragmentData, FragmentDataIter};
+use genome::{ChromSizes, ChromValueIter};
+
+use anndata::{
+    container::{ChunkedArrayElem, StackedChunkedArrayElem},
+    ArrayElemOp,
+};
+use anndata::{AnnData, AnnDataOp, AnnDataSet, AxisArraysOp, Backend, ElemCollectionOp};
+use anyhow::{bail, Context, Result};
+use bed_utils::bed::{map::GIntervalMap, GenomicRange};
+use nalgebra_sparse::CsrMatrix;
+use ndarray::Array2;
+use num::integer::div_ceil;
+use polars::frame::DataFrame;
+use preprocessing::{fraction_of_reads_in_region, fragment_size_distribution, TSSe, TssRegions};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+use std::{
+    str::FromStr,
+    sync::{Arc, Mutex},
+};
+
+/// The `SnapData` trait represents an interface for reading and
+/// manipulating single-cell assay data. It extends the `AnnDataOp` trait,
+/// adding methods for reading chromosome sizes and genome-wide base-resolution coverage.
+pub trait SnapData: AnnDataOp {
+    type CountIter: ExactSizeIterator<Item = (CsrMatrix<u8>, usize, usize)>;
+
+    /// Return chromosome names and sizes.
+    fn read_chrom_sizes(&self) -> Result<ChromSizes> {
+        let df = self
+            .uns()
+            .get_item::<DataFrame>("reference_sequences")?
+            .context("key 'reference_sequences' is not present in the '.uns'")?;
+        let chrs = df.column("reference_seq_name").unwrap().str()?;
+        let chr_sizes = df.column("reference_seq_length").unwrap().u64()?;
+        let res = chrs
+            .into_iter()
+            .flatten()
+            .map(|x| x.to_string())
+            .zip(chr_sizes.into_iter().flatten())
+            .collect();
+        Ok(res)
+    }
+
+    /// Read fragment data stored in the `.obsm` matrix.
+    fn get_fragment_iter(&self, chunk_size: usize) -> Result<FragmentData>;
+
+    /// Read base values stored in the `.obsm` matrix.
+    fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>>;
+
+    /// Read counts stored in the `X` matrix.
+    fn read_chrom_values(
+        &self,
+        chunk_size: usize,
+    ) -> Result<ChromValueIter<<<Self as AnnDataOp>::X as ArrayElemOp>::ArrayIter<CsrMatrix<u32>>>>
+    {
+        let regions = self
+            .var_names()
+            .into_vec()
+            .into_iter()
+            .map(|x| GenomicRange::from_str(x.as_str()).unwrap())
+            .collect();
+        Ok(ChromValueIter {
+            regions,
+            iter: self.x().iter(chunk_size),
+            length: div_ceil(self.n_obs(), chunk_size),
+        })
+    }
+
+    /// QC metrics for the data.
+
+    /// Compute TSS enrichment.
+    fn tss_enrichment<'a>(&self, promoter: &'a TssRegions) -> Result<(Vec<f64>, TSSe<'a>)> {
+        let library_tsse = Arc::new(Mutex::new(TSSe::new(promoter)));
+        let scores = self
+            .get_fragment_iter(2000)?
+            .into_fragments()
+            .flat_map(|(list_of_fragments, _, _)| {
+                list_of_fragments
+                    .into_par_iter()
+                    .map(|fragments| {
+                        let mut tsse = TSSe::new(promoter);
+                        fragments.into_iter().for_each(|x| tsse.add(&x));
+                        library_tsse.lock().unwrap().add_from(&tsse);
+                        tsse.result().0
+                    })
+                    .collect::<Vec<_>>()
+            })
+            .collect();
+        Ok((
+            scores,
+            Arc::into_inner(library_tsse).unwrap().into_inner().unwrap(),
+        ))
+    }
+
+    /// Compute the fragment size distribution.
+    fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>>;
+
+    /// Compute the fraction of reads in each region.
+    fn frip<D>(
+        &self,
+        regions: &Vec<GIntervalMap<D>>,
+        normalized: bool,
+        count_as_insertion: bool,
+    ) -> Result<Array2<f64>> {
+        let vec = fraction_of_reads_in_region(
+            self.get_fragment_iter(2000)?.into_fragments(),
+            regions,
+            normalized,
+            count_as_insertion,
+        )
+        .map(|x| x.0)
+        .flatten()
+        .flatten()
+        .collect::<Vec<_>>();
+        Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into)
+    }
+
+    fn genome_size(&self) -> Result<u64> {
+        Ok(self.read_chrom_sizes()?.total_size())
+    }
+}
+
+impl<B: Backend> SnapData for AnnData<B> {
+    type CountIter = ChunkedArrayElem<B, CsrMatrix<u8>>;
+
+    fn get_fragment_iter(&self, chunk_size: usize) -> Result<FragmentData> {
+        let obsm = self.obsm();
+        let matrices: FragmentDataIter = if let Some(insertion) =
+            obsm.get_item_iter("fragment_single", chunk_size)
+        {
+            FragmentDataIter::FragmentSingle(Box::new(insertion))
+        } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) {
+            FragmentDataIter::FragmentPaired(Box::new(fragment))
+        } else {
+            bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'")
+        };
+        Ok(FragmentData::new(self.read_chrom_sizes()?, matrices))
+    }
+
+    fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>> {
+        let obsm = self.obsm();
+        if let Some(data) = obsm.get_item_iter::<CsrMatrix<f32>>("_values", chunk_size) {
+            Ok(BaseData::new(self.read_chrom_sizes()?, data))
+        } else {
+            bail!("key '_values' is not present in the '.obsm'")
+        }
+    }
+
+    fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
+        if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) {
+            Ok(fragment_size_distribution(
+                fragment.map(|x| x.0),
+                max_size,
+            ))
+        } else {
+            bail!("key 'fragment_paired' is not present in the '.obsm'")
+        }
+    }
+}
+
+impl<B: Backend> SnapData for AnnDataSet<B> {
+    type CountIter = StackedChunkedArrayElem<B, CsrMatrix<u8>>;
+
+    fn get_fragment_iter(&self, chunk_size: usize) -> Result<FragmentData> {
+        let adatas = self.adatas().inner();
+        let obsm = adatas.get_obsm();
+        let matrices: FragmentDataIter = if let Some(insertion) =
+            obsm.get_item_iter("fragment_single", chunk_size)
+        {
+            FragmentDataIter::FragmentSingle(Box::new(insertion))
+        } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) {
+            FragmentDataIter::FragmentPaired(Box::new(fragment))
+        } else {
+            bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'")
+        };
+        Ok(FragmentData::new(self.read_chrom_sizes()?, matrices))
+    }
+
+    fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>>
+    {
+        let obsm = self.obsm();
+        if let Some(data) = obsm.get_item_iter::<CsrMatrix<f32>>("_values", chunk_size) {
+            Ok(BaseData::new(self.read_chrom_sizes()?, data))
+        } else {
+            bail!("key '_values' is not present in the '.obsm'")
+        }
+    }
+
+    fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
+        if let Some(fragment) = self
+            .adatas()
+            .inner()
+            .get_obsm()
+            .get_item_iter("fragment_paired", 500)
+        {
+            Ok(fragment_size_distribution(
+                fragment.map(|x| x.0),
+                max_size,
+            ))
+        } else {
+            bail!("key 'fragment_paired' is not present in the '.obsm'")
+        }
+    }
+}
\ No newline at end of file
diff --git a/snapatac2-core/src/network.rs b/snapatac2-core/src/network.rs
index 7f43a8b91..2f041c55b 100644
--- a/snapatac2-core/src/network.rs
+++ b/snapatac2-core/src/network.rs
@@ -1,4 +1,4 @@
-use crate::preprocessing::Promoters;
+use crate::genome::Promoters;
 
 use bed_utils::bed::BEDLike;
 use std::collections::HashMap;
diff --git a/snapatac2-core/src/preprocessing/count_data.rs b/snapatac2-core/src/preprocessing/count_data.rs
deleted file mode 100644
index eb2de624d..000000000
--- a/snapatac2-core/src/preprocessing/count_data.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-mod coverage;
-mod genome;
-mod import;
-mod matrix;
-
-pub use crate::preprocessing::qc;
-pub use coverage::{FragmentDataIter, ContactDataCounter, CountingStrategy, FragmentData, BaseData};
-pub use genome::{
-    read_transcripts_from_gff, read_transcripts_from_gtf, ChromSizes, ChromValueIter, ChromValues,
-    FeatureCounter, GeneCount, GenomeBaseIndex, Promoters, Transcript, TranscriptCount,
-    TranscriptParserOptions,
-};
-pub use import::{import_contacts, import_fragments, import_values, ChromValue};
-pub use matrix::{create_gene_matrix, create_peak_matrix, create_tile_matrix};
-
-use anndata::{
-    container::{ChunkedArrayElem, StackedChunkedArrayElem},
-    ArrayElemOp,
-};
-use anndata::{AnnData, AnnDataOp, AnnDataSet, AxisArraysOp, Backend, ElemCollectionOp};
-use anyhow::{bail, Context, Result};
-use bed_utils::bed::{map::GIntervalMap, GenomicRange};
-use nalgebra_sparse::CsrMatrix;
-use ndarray::Array2;
-use num::integer::div_ceil;
-use polars::frame::DataFrame;
-use rayon::iter::{IntoParallelIterator, ParallelIterator};
-use std::{
-    str::FromStr,
-    sync::{Arc, Mutex},
-};
-
-use self::qc::TSSe;
-
-/// The `SnapData` trait represents an interface for reading and
-/// manipulating single-cell assay data. It extends the `AnnDataOp` trait,
-/// adding methods for reading chromosome sizes and genome-wide base-resolution coverage.
-pub trait SnapData: AnnDataOp {
-    type CountIter: ExactSizeIterator<Item = (CsrMatrix<u8>, usize, usize)>;
-
-    /// Return chromosome names and sizes.
-    fn read_chrom_sizes(&self) -> Result<ChromSizes> {
-        let df = self
-            .uns()
-            .get_item::<DataFrame>("reference_sequences")?
-            .context("key 'reference_sequences' is not present in the '.uns'")?;
-        let chrs = df.column("reference_seq_name").unwrap().str()?;
-        let chr_sizes = df.column("reference_seq_length").unwrap().u64()?;
-        let res = chrs
-            .into_iter()
-            .flatten()
-            .map(|x| x.to_string())
-            .zip(chr_sizes.into_iter().flatten())
-            .collect();
-        Ok(res)
-    }
-
-    /// Read fragment data stored in the `.obsm` matrix.
-    fn get_fragment_iter(&self, chunk_size: usize) -> Result<FragmentData>;
-
-    /// Read base values stored in the `.obsm` matrix.
-    fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>>;
-
-    /// Read counts stored in the `X` matrix.
-    fn read_chrom_values(
-        &self,
-        chunk_size: usize,
-    ) -> Result<ChromValueIter<<<Self as AnnDataOp>::X as ArrayElemOp>::ArrayIter<CsrMatrix<u32>>>>
-    {
-        let regions = self
-            .var_names()
-            .into_vec()
-            .into_iter()
-            .map(|x| GenomicRange::from_str(x.as_str()).unwrap())
-            .collect();
-        Ok(ChromValueIter {
-            regions,
-            iter: self.x().iter(chunk_size),
-            length: div_ceil(self.n_obs(), chunk_size),
-        })
-    }
-
-    /// QC metrics for the data.
-
-    /// Compute TSS enrichment.
-    fn tss_enrichment<'a>(&self, promoter: &'a qc::TssRegions) -> Result<(Vec<f64>, TSSe<'a>)> {
-        let library_tsse = Arc::new(Mutex::new(qc::TSSe::new(promoter)));
-        let scores = self
-            .get_fragment_iter(2000)?
-            .into_fragments()
-            .flat_map(|(list_of_fragments, _, _)| {
-                list_of_fragments
-                    .into_par_iter()
-                    .map(|fragments| {
-                        let mut tsse = qc::TSSe::new(promoter);
-                        fragments.into_iter().for_each(|x| tsse.add(&x));
-                        library_tsse.lock().unwrap().add_from(&tsse);
-                        tsse.result().0
-                    })
-                    .collect::<Vec<_>>()
-            })
-            .collect();
-        Ok((
-            scores,
-            Arc::into_inner(library_tsse).unwrap().into_inner().unwrap(),
-        ))
-    }
-
-    /// Compute the fragment size distribution.
-    fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>>;
-
-    /// Compute the fraction of reads in each region.
-    fn frip<D>(
-        &self,
-        regions: &Vec<GIntervalMap<D>>,
-        normalized: bool,
-        count_as_insertion: bool,
-    ) -> Result<Array2<f64>> {
-        let vec = qc::fraction_of_reads_in_region(
-            self.get_fragment_iter(2000)?.into_fragments(),
-            regions,
-            normalized,
-            count_as_insertion,
-        )
-        .map(|x| x.0)
-        .flatten()
-        .flatten()
-        .collect::<Vec<_>>();
-        Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into)
-    }
-
-    fn genome_size(&self) -> Result<u64> {
-        Ok(self.read_chrom_sizes()?.total_size())
-    }
-}
-
-impl<B: Backend> SnapData for AnnData<B> {
-    type CountIter = ChunkedArrayElem<B, CsrMatrix<u8>>;
-
-    fn get_fragment_iter(&self, chunk_size: usize) -> Result<FragmentData> {
-        let obsm = self.obsm();
-        let matrices: FragmentDataIter = if let Some(insertion) =
-            obsm.get_item_iter("fragment_single", chunk_size)
-        {
-            FragmentDataIter::FragmentSingle(Box::new(insertion))
-        } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) {
-            FragmentDataIter::FragmentPaired(Box::new(fragment))
-        } else {
-            bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'")
-        };
-        Ok(FragmentData::new(self.read_chrom_sizes()?, matrices))
-    }
-
-    fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>> {
-        let obsm = self.obsm();
-        if let Some(data) = obsm.get_item_iter::<CsrMatrix<f32>>("_values", chunk_size) {
-            Ok(BaseData::new(self.read_chrom_sizes()?, data))
-        } else {
-            bail!("key '_values' is not present in the '.obsm'")
-        }
-    }
-
-    fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
-        if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) {
-            Ok(qc::fragment_size_distribution(
-                fragment.map(|x| x.0),
-                max_size,
-            ))
-        } else {
-            bail!("key 'fragment_paired' is not present in the '.obsm'")
-        }
-    }
-}
-
-impl<B: Backend> SnapData for AnnDataSet<B> {
-    type CountIter = StackedChunkedArrayElem<B, CsrMatrix<u8>>;
-
-    fn get_fragment_iter(&self, chunk_size: usize) -> Result<FragmentData> {
-        let adatas = self.adatas().inner();
-        let obsm = adatas.get_obsm();
-        let matrices: FragmentDataIter = if let Some(insertion) =
-            obsm.get_item_iter("fragment_single", chunk_size)
-        {
-            FragmentDataIter::FragmentSingle(Box::new(insertion))
-        } else if let Some(fragment) = obsm.get_item_iter("fragment_paired", chunk_size) {
-            FragmentDataIter::FragmentPaired(Box::new(fragment))
-        } else {
-            bail!("one of the following keys must be present in the '.obsm': 'fragment_single', 'fragment_paired'")
-        };
-        Ok(FragmentData::new(self.read_chrom_sizes()?, matrices))
-    }
-
-    fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>>
-    {
-        let obsm = self.obsm();
-        if let Some(data) = obsm.get_item_iter::<CsrMatrix<f32>>("_values", chunk_size) {
-            Ok(BaseData::new(self.read_chrom_sizes()?, data))
-        } else {
-            bail!("key '_values' is not present in the '.obsm'")
-        }
-    }
-
-    fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
-        if let Some(fragment) = self
-            .adatas()
-            .inner()
-            .get_obsm()
-            .get_item_iter("fragment_paired", 500)
-        {
-            Ok(qc::fragment_size_distribution(
-                fragment.map(|x| x.0),
-                max_size,
-            ))
-        } else {
-            bail!("key 'fragment_paired' is not present in the '.obsm'")
-        }
-    }
-}
diff --git a/snapatac2-core/src/preprocessing/count_data/import.rs b/snapatac2-core/src/preprocessing/import.rs
similarity index 91%
rename from snapatac2-core/src/preprocessing/count_data/import.rs
rename to snapatac2-core/src/preprocessing/import.rs
index face3dcb3..b704b52bb 100644
--- a/snapatac2-core/src/preprocessing/count_data/import.rs
+++ b/snapatac2-core/src/preprocessing/import.rs
@@ -1,7 +1,6 @@
-use crate::preprocessing::{
-    count_data::{ChromSizes, GenomeBaseIndex},
-    qc::{Contact, Fragment, FragmentSummary, QualityControl},
-};
+use crate::feature_count::ContactData;
+use crate::genome::{ChromSizes, GenomeBaseIndex};
+use crate::preprocessing::qc::{Contact, Fragment, FragmentSummary, QualityControl};
 
 use anndata::{
     data::array::utils::{from_csr_data, to_csr_data},
@@ -290,7 +289,7 @@ where
         let (r, c, offset, ind, data) = to_csr_data(counts, genome_size * genome_size);
         CsrMatrix::try_from_csr_data(r, c, offset, ind, data).unwrap()
     });
-    let contact_map = super::ContactDataCounter::new(chrom_sizes, binding3).with_resolution(bin_size);
+    let contact_map = ContactData::new(chrom_sizes, binding3).with_resolution(bin_size);
 
     anndata.set_x_from_iter(contact_map.into_values::<u32>())?;
     anndata.set_var_names(anndata.n_vars().into())?;
@@ -350,25 +349,30 @@ where
         .chunks(chunk_size);
     let arrays = chunked_values.into_iter().map(|chunk| {
         // Collect into vector for parallel processing
-        let chunk: Vec<Vec<_>> = chunk.map(|(barcode, x)| {
-            if !scanned_barcodes.insert(barcode.clone()) {
-                panic!("Please sort fragment file by barcodes");
-            }
-            x.collect()
-        }).collect();
+        let chunk: Vec<Vec<_>> = chunk
+            .map(|(barcode, x)| {
+                if !scanned_barcodes.insert(barcode.clone()) {
+                    panic!("Please sort fragment file by barcodes");
+                }
+                x.collect()
+            })
+            .collect();
 
         let counts = chunk
             .into_par_iter()
             .map(|data| {
-                let mut count = data.into_iter().flat_map(|value| {
-                    let chrom = &value.chrom;
-                    if genome_index.contain_chrom(chrom) {
-                        let pos = genome_index.get_position_rev(chrom, value.pos);
-                        Some((pos, value.value))
-                    } else {
-                        None
-                    }
-                }).collect::<Vec<_>>();
+                let mut count = data
+                    .into_iter()
+                    .flat_map(|value| {
+                        let chrom = &value.chrom;
+                        if genome_index.contain_chrom(chrom) {
+                            let pos = genome_index.get_position_rev(chrom, value.pos);
+                            Some((pos, value.value))
+                        } else {
+                            None
+                        }
+                    })
+                    .collect::<Vec<_>>();
                 count.sort_by(|x, y| x.0.cmp(&y.0));
                 count
             })
@@ -378,7 +382,9 @@ where
     });
 
     anndata.obsm().add_iter(obsm_key, arrays)?;
-    anndata.uns().add("reference_sequences", chrom_sizes.to_dataframe())?;
+    anndata
+        .uns()
+        .add("reference_sequences", chrom_sizes.to_dataframe())?;
     anndata.set_obs_names(scanned_barcodes.into_iter().collect())?;
     Ok(())
 }
diff --git a/snapatac2-core/src/preprocessing/mod.rs b/snapatac2-core/src/preprocessing/mod.rs
index 1020b55a2..75d6387dc 100644
--- a/snapatac2-core/src/preprocessing/mod.rs
+++ b/snapatac2-core/src/preprocessing/mod.rs
@@ -1,14 +1,11 @@
-pub mod bam;
-pub mod barcode;
-pub mod count_data;
-pub mod qc;
+mod bam;
+mod barcode;
+mod import;
+mod qc;
 
 pub use bam::{make_fragment_file, BamQC, FlagStat};
-pub use count_data::{
-    create_gene_matrix, create_peak_matrix, create_tile_matrix, import_contacts, import_fragments,
-    import_values, read_transcripts_from_gff, read_transcripts_from_gtf, ChromValue, ContactDataCounter,
-    Promoters, SnapData, Transcript,
-};
+pub use import::{import_contacts, import_fragments, import_values, ChromValue};
 pub use qc::{
-    get_barcode_count, make_promoter_map, read_tss, CellBarcode, Contact, Fragment, TssRegions,
+    get_barcode_count, make_promoter_map, read_tss, CellBarcode, Contact, Fragment, TSSe,
+    TssRegions, fraction_of_reads_in_region, fragment_size_distribution,
 };
diff --git a/snapatac2-core/src/knn.rs b/snapatac2-core/src/utils/knn.rs
similarity index 100%
rename from snapatac2-core/src/knn.rs
rename to snapatac2-core/src/utils/knn.rs
diff --git a/snapatac2-core/src/utils.rs b/snapatac2-core/src/utils/mod.rs
similarity index 97%
rename from snapatac2-core/src/utils.rs
rename to snapatac2-core/src/utils/mod.rs
index 69aaafdc7..9e7df52fd 100644
--- a/snapatac2-core/src/utils.rs
+++ b/snapatac2-core/src/utils/mod.rs
@@ -1,4 +1,5 @@
 pub mod similarity;
+pub mod knn;
 
 use std::path::Path;
 use std::fs::File;
@@ -40,7 +41,7 @@ where
     merge_sorted_bed_with(input, iterative_merge)
 }
 
-pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::preprocessing::count_data::ChromSizes) -> NarrowPeak {
+pub fn clip_peak(mut peak: NarrowPeak, chrom_sizes: &crate::genome::ChromSizes) -> NarrowPeak {
     let chr = peak.chrom();
     let max_len = chrom_sizes.get(chr).expect(&format!("Size missing for chromosome: {}", chr));
     let new_start = peak.start().max(0).min(max_len);
diff --git a/snapatac2-python/src/call_peaks.rs b/snapatac2-python/src/call_peaks.rs
index 7686c2d10..1e7f6b0b0 100644
--- a/snapatac2-python/src/call_peaks.rs
+++ b/snapatac2-python/src/call_peaks.rs
@@ -6,7 +6,8 @@ use indicatif::{ProgressIterator, ProgressStyle};
 use itertools::Itertools;
 use snapatac2_core::utils::{self, Compression};
 use snapatac2_core::{
-    preprocessing::{Fragment, SnapData},
+    SnapData,
+    preprocessing::Fragment,
     utils::{clip_peak, merge_peaks, open_file_for_write},
 };
 
diff --git a/snapatac2-python/src/knn.rs b/snapatac2-python/src/knn.rs
index e3b0a4b77..c9f73ccb2 100644
--- a/snapatac2-python/src/knn.rs
+++ b/snapatac2-python/src/knn.rs
@@ -2,7 +2,7 @@ use anndata::ArrayData;
 use pyanndata::data::PyArrayData;
 use pyo3::{prelude::*, PyResult};
 use numpy::{PyReadonlyArray, Ix2};
-use snapatac2_core::knn;
+use snapatac2_core::utils::knn;
 
 #[pyfunction]
 pub(crate) fn nearest_neighbour_graph(
diff --git a/snapatac2-python/src/network.rs b/snapatac2-python/src/network.rs
index 0b56f7991..0ec6276a3 100644
--- a/snapatac2-python/src/network.rs
+++ b/snapatac2-python/src/network.rs
@@ -4,7 +4,7 @@ use pyo3::prelude::*;
 
 use snapatac2_core::{
     network::link_region_to_promoter,
-    preprocessing::Promoters,
+    genome::Promoters,
 };
 use bed_utils::bed::GenomicRange;
 use std::{
diff --git a/snapatac2-python/src/preprocessing.rs b/snapatac2-python/src/preprocessing.rs
index 8b64f4867..21b3e5308 100644
--- a/snapatac2-python/src/preprocessing.rs
+++ b/snapatac2-python/src/preprocessing.rs
@@ -4,9 +4,6 @@ use itertools::Itertools;
 use pyo3::{prelude::*, pybacked::PyBackedStr};
 use anndata::Backend;
 use anndata_hdf5::H5;
-use snapatac2_core::preprocessing::count_data::TranscriptParserOptions;
-use snapatac2_core::preprocessing::count_data::CountingStrategy;
-use snapatac2_core::preprocessing::ChromValue;
 use std::collections::HashMap;
 use std::io::{BufRead, BufReader};
 use std::path::PathBuf;
@@ -17,8 +14,12 @@ use pyanndata::PyAnnData;
 use anyhow::Result;
 
 use snapatac2_core::{
-    preprocessing::{Fragment, Contact, SnapData},
-    preprocessing, utils,
+    SnapData,
+    genome::TranscriptParserOptions,
+    feature_count::{create_gene_matrix, create_tile_matrix, create_peak_matrix, CountingStrategy},
+    preprocessing::{Fragment, Contact, ChromValue},
+    preprocessing,
+    utils,
 };
 
 #[pyfunction]
@@ -93,7 +94,7 @@ pub(crate) fn import_fragments(
     };
     let chrom_sizes = chrom_size.into_iter().collect();
     let fragments = bed::io::Reader::new(utils::open_file_for_read(&fragment_file), Some("#".to_string()))
-        .into_records::<Fragment>().map(|f| {
+        .into_records().map(|f| {
             let mut f = f.unwrap();
             shift_fragment(&mut f, shift_left, shift_right);
             f
@@ -253,7 +254,7 @@ pub(crate) fn mk_tile_matrix(
             if let Some(out) = out {
                 macro_rules! run2 {
                     ($out_data:expr) => {
-                        preprocessing::create_tile_matrix(
+                        create_tile_matrix(
                             $data,
                             bin_size,
                             chunk_size,
@@ -267,7 +268,7 @@ pub(crate) fn mk_tile_matrix(
                 }
                 crate::with_anndata!(&out, run2);
             } else {
-                preprocessing::create_tile_matrix(
+                create_tile_matrix(
                     $data,
                     bin_size,
                     chunk_size,
@@ -306,13 +307,13 @@ pub(crate) fn mk_peak_matrix(
             if let Some(out) = out {
                 macro_rules! run2 {
                     ($out_data:expr) => {
-                        preprocessing::create_peak_matrix($data, peaks, chunk_size, strategy,
+                        create_peak_matrix($data, peaks, chunk_size, strategy,
                             min_fragment_size, max_fragment_size, Some($out_data), use_x)?
                     };
                 }
                 crate::with_anndata!(&out, run2);
             } else {
-                preprocessing::create_peak_matrix($data, peaks, chunk_size, strategy,
+                create_peak_matrix($data, peaks, chunk_size, strategy,
                     min_fragment_size, max_fragment_size, None::<&PyAnnData>, use_x)?;
             }
         }
@@ -354,14 +355,14 @@ pub(crate) fn mk_gene_matrix(
             if let Some(out) = out {
                 macro_rules! run2 {
                     ($out_data:expr) => {
-                        preprocessing::create_gene_matrix($data, transcripts, id_type,
+                        create_gene_matrix($data, transcripts, id_type,
                             upstream, downstream, include_gene_body, chunk_size, strategy,
                             min_fragment_size, max_fragment_size, Some($out_data), use_x)?
                     };
                 }
                 crate::with_anndata!(&out, run2);
             } else {
-                preprocessing::create_gene_matrix($data, transcripts, id_type,
+                create_gene_matrix($data, transcripts, id_type,
                     upstream, downstream, include_gene_body, chunk_size, strategy,
                     min_fragment_size, max_fragment_size, None::<&PyAnnData>, use_x)?;
             }
diff --git a/snapatac2-python/src/utils.rs b/snapatac2-python/src/utils.rs
index ba1156507..b528cdd95 100644
--- a/snapatac2-python/src/utils.rs
+++ b/snapatac2-python/src/utils.rs
@@ -9,9 +9,8 @@ use numpy::{
     Element, IntoPyArray, Ix1, Ix2, PyArray, PyArrayMethods, PyReadonlyArray, PyReadonlyArrayDyn,
 };
 use pyo3::{prelude::*, types::PyIterator, PyResult, Python};
-use snapatac2_core::preprocessing::count_data::TranscriptParserOptions;
-use snapatac2_core::preprocessing::{
-    read_transcripts_from_gff, read_transcripts_from_gtf, Transcript,
+use snapatac2_core::genome::{
+    read_transcripts_from_gff, read_transcripts_from_gtf, Transcript, TranscriptParserOptions,
 };
 use snapatac2_core::utils;
 
diff --git a/snapatac2-python/src/utils/anndata.rs b/snapatac2-python/src/utils/anndata.rs
index c5f3f85de..959474357 100644
--- a/snapatac2-python/src/utils/anndata.rs
+++ b/snapatac2-python/src/utils/anndata.rs
@@ -10,7 +10,8 @@ use pyanndata::anndata::memory;
 use pyanndata::{AnnData, AnnDataSet};
 use pyo3::prelude::*;
 
-use snapatac2_core::preprocessing::{count_data::{BaseData, FragmentData, FragmentDataIter}, qc, SnapData};
+use snapatac2_core::SnapData;
+use snapatac2_core::feature_count::{BaseData, FragmentData, FragmentDataIter};
 
 pub struct PyAnnData<'py>(memory::PyAnnData<'py>);
 
@@ -166,7 +167,7 @@ impl<'py> SnapData for PyAnnData<'py> {
 
     fn fragment_size_distribution(&self, max_size: usize) -> Result<Vec<usize>> {
         if let Some(fragment) = self.obsm().get_item_iter("fragment_paired", 500) {
-            Ok(qc::fragment_size_distribution(fragment.map(|x| x.0), max_size))
+            Ok(snapatac2_core::preprocessing::fragment_size_distribution(fragment.map(|x| x.0), max_size))
         } else {
             bail!("key 'fragment_paired' is not present in the '.obsm'")
         }