diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 407e5d27..4ef13d3c 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -20,12 +20,12 @@ jobs: CIBW_ARCHS_LINUX: "auto64" CIBW_ENVIRONMENT: 'PATH="$PATH:$HOME/.cargo/bin"' + CIBW_ENVIRONMENT_MACOS: 'MACOSX_DEPLOYMENT_TARGET="10.9"' CIBW_SKIP: "pp* *-win32 *-musllinux*" CIBW_BUILD: ${{ matrix.python_version }} CIBW_TEST_REQUIRES: pytest hypothesis==6.72.4 CIBW_TEST_COMMAND: "pytest {project}/snapatac2-python/tests" - MACOSX_DEPLOYMENT_TARGET: 10.7 steps: - name: Checkout code diff --git a/snapatac2-core/src/feature_count/counter.rs b/snapatac2-core/src/feature_count/counter.rs new file mode 100644 index 00000000..803e4ed1 --- /dev/null +++ b/snapatac2-core/src/feature_count/counter.rs @@ -0,0 +1,275 @@ +use bed_utils::bed::map::GIntervalIndexSet; +use bed_utils::bed::BEDLike; +use indexmap::map::IndexMap; +use itertools::Itertools; +use num::{ + traits::{NumAssignOps, NumCast, ToPrimitive}, + Num, +}; +use std::{collections::BTreeMap, fmt::Debug}; + +use crate::genome::Promoters; +use crate::preprocessing::Fragment; + +/// The `CountingStrategy` enum represents different counting strategies. +/// It is used to count the number of fragments that overlap for a given list of genomic features. +/// Three counting strategies are supported: Insertion, Fragment, and Paired-Insertion Counting (PIC). +#[derive(Clone, Copy, Debug)] +pub enum CountingStrategy { + Insertion, // Insertion based counting + Fragment, // Fragment based counting + PIC, // Paired-Insertion Counting (PIC) +} + +/// `FeatureCounter` is a trait that provides an interface for counting genomic features. +/// Types implementing `FeatureCounter` can store feature counts and provide several +/// methods for manipulating and retrieving those counts. +pub trait FeatureCounter { + type Value; + + /// Returns the total number of distinct features counted. + fn num_features(&self) -> usize { + self.get_feature_ids().len() + } + + /// Resets the counter for all features. + fn reset(&mut self); + + /// Updates the counter according to the given region and count. + fn insert(&mut self, tag: &B, count: N); + + /// Updates the counter according to the given fragment + fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy); + + /// Returns a vector of feature ids. + fn get_feature_ids(&self) -> Vec; + + /// Returns a vector of feature names if available. + fn get_feature_names(&self) -> Option> { + None + } + + /// Returns a vector of tuples, each containing a feature's index and its count. + fn get_values(&self) -> Vec<(usize, Self::Value)>; + + /// Returns a vector of tuples, each containing a feature's index and its average count. + fn get_values_and_counts(&self) -> impl Iterator; +} + +#[derive(Clone)] +pub struct RegionCounter<'a, V> { + regions: &'a GIntervalIndexSet, + values: BTreeMap, +} + +impl<'a, V> RegionCounter<'a, V> { + pub fn new(regions: &'a GIntervalIndexSet) -> Self { + Self { + regions, + values: BTreeMap::new(), + } + } +} + +impl FeatureCounter for RegionCounter<'_, V> { + type Value = V; + + fn reset(&mut self) { + self.values.clear(); + } + + fn insert(&mut self, tag: &B, count: N) { + let val = ::from(count).unwrap(); + self.regions.find_index_of(tag).for_each(|idx| { + self.values + .entry(idx) + .and_modify(|(v, c)| { + *v += val; + *c += 1; + }) + .or_insert((val, 1)); + }); + } + + fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) { + if tag.is_single() { + tag.to_insertions().iter().for_each(|x| { + self.insert(x, V::one()); + }); + } else { + match strategy { + CountingStrategy::Fragment => { + self.insert(tag, V::one()); + } + CountingStrategy::Insertion => { + tag.to_insertions().iter().for_each(|x| { + self.insert(x, V::one()); + }); + } + CountingStrategy::PIC => { + tag.to_insertions() + .into_iter() + .flat_map(|x| self.regions.find_index_of(&x)) + .unique() + .collect::>() + .into_iter() + .for_each(|i| { + self.values + .entry(i) + .and_modify(|(v, c)| { + *v += V::one(); + *c += 1; + }) + .or_insert((V::one(), 1)); + }); + } + } + } + } + + fn get_feature_ids(&self) -> Vec { + self.regions + .iter() + .map(|x| x.to_genomic_range().pretty_show()) + .collect() + } + + fn get_values(&self) -> Vec<(usize, Self::Value)> { + self.values.iter().map(|(k, v)| (*k, v.0)).collect() + } + + fn get_values_and_counts(&self) -> impl Iterator { + self.values + .iter().map(|(k, v)| (*k, (v.0, v.1))) + } +} + +/// `TranscriptCount` is a struct that represents the count of genomic features at the transcript level. +/// It holds a `SparseCoverage` counter and a reference to `Promoters`. +#[derive(Clone)] +pub struct TranscriptCount<'a> { + counter: RegionCounter<'a, u32>, + promoters: &'a Promoters, +} + +impl<'a> TranscriptCount<'a> { + pub fn new(promoters: &'a Promoters) -> Self { + Self { + counter: RegionCounter::new(&promoters.regions), + promoters, + } + } + + pub fn gene_names(&self) -> Vec { + self.promoters + .transcripts + .iter() + .map(|x| x.gene_name.clone()) + .collect() + } +} + +/// `GeneCount` is a struct that represents the count of genomic features at the gene level. +/// It holds a `TranscriptCount` counter and a map from gene names to their indices. +#[derive(Clone)] +pub struct GeneCount<'a> { + counter: TranscriptCount<'a>, + gene_name_to_idx: IndexMap<&'a str, usize>, +} + +/// Implementation of `GeneCount` +impl<'a> GeneCount<'a> { + pub fn new(counter: TranscriptCount<'a>) -> Self { + let gene_name_to_idx: IndexMap<_, _> = counter + .promoters + .transcripts + .iter() + .map(|x| x.gene_name.as_str()) + .unique() + .enumerate() + .map(|(a, b)| (b, a)) + .collect(); + Self { + counter, + gene_name_to_idx, + } + } +} + +/// Implementations of `FeatureCounter` trait for `TranscriptCount` and `GeneCount` structs. +impl FeatureCounter for TranscriptCount<'_> { + type Value = u32; + + fn reset(&mut self) { + self.counter.reset(); + } + + fn insert(&mut self, tag: &B, count: N) { + self.counter + .insert(tag, ::from(count).unwrap()); + } + + fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) { + self.counter.insert_fragment(tag, strategy); + } + + fn get_feature_ids(&self) -> Vec { + self.promoters + .transcripts + .iter() + .map(|x| x.transcript_id.clone()) + .collect() + } + + fn get_values(&self) -> Vec<(usize, Self::Value)> { + self.counter.get_values() + } + + fn get_values_and_counts(&self) -> impl Iterator { + self.counter.get_values_and_counts() + } +} + +impl FeatureCounter for GeneCount<'_> { + type Value = u32; + + fn reset(&mut self) { + self.counter.reset(); + } + + fn insert(&mut self, tag: &B, count: N) { + self.counter + .insert(tag, ::from(count).unwrap()); + } + + fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) { + self.counter.insert_fragment(tag, strategy); + } + + fn get_feature_ids(&self) -> Vec { + self.gene_name_to_idx + .keys() + .map(|x| x.to_string()) + .collect() + } + + fn get_values(&self) -> Vec<(usize, Self::Value)> { + let mut counts = BTreeMap::new(); + self.counter.get_values().into_iter().for_each(|(k, v)| { + let idx = *self + .gene_name_to_idx + .get(self.counter.promoters.transcripts[k].gene_name.as_str()) + .unwrap(); + let current_v = counts.entry(idx).or_insert(v); + if *current_v < v { + *current_v = v + } + }); + counts.into_iter().collect() + } + + fn get_values_and_counts(&self) -> impl Iterator { + todo!(); + self.counter.get_values_and_counts() + } +} diff --git a/snapatac2-core/src/feature_count/coverage.rs b/snapatac2-core/src/feature_count/data_iter.rs similarity index 86% rename from snapatac2-core/src/feature_count/coverage.rs rename to snapatac2-core/src/feature_count/data_iter.rs index 63d0353f..5d523a14 100644 --- a/snapatac2-core/src/feature_count/coverage.rs +++ b/snapatac2-core/src/feature_count/data_iter.rs @@ -1,10 +1,9 @@ -use crate::genome::{ChromSizes, FeatureCounter, GenomeBaseIndex}; -use crate::preprocessing::{ - Fragment, -}; +use crate::genome::{ChromSizes, GenomeBaseIndex}; +use crate::preprocessing::Fragment; +use crate::feature_count::{CountingStrategy, FeatureCounter}; use anndata::{data::{utils::to_csr_data, CsrNonCanonical}, ArrayData}; -use bed_utils::bed::{BEDLike, GenomicRange, Strand}; +use bed_utils::bed::{BEDLike, BedGraph, GenomicRange, Strand}; use nalgebra_sparse::CsrMatrix; use num::traits::{FromPrimitive, One, Zero}; use rayon::iter::{IntoParallelIterator, ParallelIterator}; @@ -113,16 +112,6 @@ fn pair_to_fragments( }) } -/// The `CountingStrategy` enum represents different counting strategies. -/// It is used to count the number of fragments that overlap for a given list of genomic features. -/// Three counting strategies are supported: Insertion, Fragment, and Paired-Insertion Counting (PIC). -#[derive(Clone, Copy, Debug)] -pub enum CountingStrategy { - Insertion, // Insertion based counting - Fragment, // Fragment based counting - PIC, // Paired-Insertion Counting (PIC) -} - /// The `FragmentData` struct is used to count the number of reads that overlap /// for a given list of genomic features (such as genes, exons, ChIP-Seq peaks, or the like). /// It stores the counts as an iterator of tuples, each containing a @@ -286,7 +275,7 @@ impl FragmentData beds.into_iter().for_each(|fragment| { coverage.insert_fragment(&fragment, &strategy); }); - coverage.get_counts() + coverage.get_values() }) .collect::>(); let (r, c, offset, ind, data) = to_csr_data(vec, n_col); @@ -520,6 +509,8 @@ where self } + /// Output the raw coverage matrix. Note the values belong to the same interval + /// will be aggregated by the mean value. pub fn into_array_iter(self) -> impl ExactSizeIterator { let index = self.get_gindex(); @@ -557,6 +548,8 @@ where }) } + /// Aggregate the coverage by a feature counter. Values belong to the same interval + /// will be aggregated by the mean value. pub fn into_aggregated_array_iter(self, counter: C) -> impl ExactSizeIterator where C: FeatureCounter + Clone + Sync, @@ -574,7 +567,9 @@ where coverage.insert(&GenomicRange::new(chrom, pos, pos+1), *val); } }); - coverage.get_counts() + coverage.get_values_and_counts().map(|(idx, (val, count))| { + (idx, val / count as f32) + }).collect::>() }) .collect::>(); let (r, c, offset, ind, data) = to_csr_data(vec, n_col); @@ -585,4 +580,91 @@ where ) }) } -} \ No newline at end of file +} + +/// `ChromValues` is a type alias for a vector of `BedGraph` objects. +/// Each `BedGraph` instance represents a genomic region along with a +/// numerical value (like coverage or score). +pub type ChromValues = Vec>; + +/// `ChromValueIter` represents an iterator over the chromosome values. +/// Each item in the iterator is a tuple of a vector of `ChromValues` objects, +/// a start index, and an end index. +pub struct ChromValueIter { + pub(crate) iter: I, + pub(crate) regions: Vec, + pub(crate) length: usize, +} + +impl<'a, I, T> ChromValueIter +where + I: ExactSizeIterator, usize, usize)> + 'a, + T: Copy, +{ + /// Aggregate the values in the iterator by the given `FeatureCounter`. + pub fn aggregate_by( + self, + mut counter: C, + ) -> impl ExactSizeIterator, usize, usize)> + where + C: FeatureCounter + Clone + Sync, + T: Sync + Send + num::ToPrimitive, + { + let n_col = counter.num_features(); + counter.reset(); + self.iter.map(move |(mat, i, j)| { + let n = j - i; + let vec = (0..n) + .into_par_iter() + .map(|k| { + let row = mat.get_row(k).unwrap(); + let mut coverage = counter.clone(); + row.col_indices() + .into_iter() + .zip(row.values()) + .for_each(|(idx, val)| { + coverage.insert(&self.regions[*idx], *val); + }); + coverage.get_values() + }) + .collect::>(); + let (r, c, offset, ind, data) = to_csr_data(vec, n_col); + (CsrMatrix::try_from_csr_data(r,c,offset,ind, data).unwrap(), i, j) + }) + } +} + +impl Iterator for ChromValueIter +where + I: Iterator, usize, usize)>, + T: Copy, +{ + type Item = (Vec>, usize, usize); + + fn next(&mut self) -> Option { + self.iter.next().map(|(x, start, end)| { + let values = x + .row_iter() + .map(|row| { + row.col_indices() + .iter() + .zip(row.values()) + .map(|(i, v)| BedGraph::from_bed(&self.regions[*i], *v)) + .collect() + }) + .collect(); + (values, start, end) + }) + } +} + +impl ExactSizeIterator for ChromValueIter +where + I: Iterator, usize, usize)>, + T: Copy, +{ + fn len(&self) -> usize { + self.length + } +} + diff --git a/snapatac2-core/src/feature_count/matrix.rs b/snapatac2-core/src/feature_count/matrix.rs index 6f3b6279..069e3e11 100644 --- a/snapatac2-core/src/feature_count/matrix.rs +++ b/snapatac2-core/src/feature_count/matrix.rs @@ -1,15 +1,10 @@ -use super::coverage::CountingStrategy; +use super::counter::{CountingStrategy, FeatureCounter, GeneCount, RegionCounter, TranscriptCount}; +use crate::genome::{Promoters, Transcript}; use crate::SnapData; -use crate::genome::{ - FeatureCounter, GeneCount, Promoters, Transcript, TranscriptCount, -}; use anndata::{data::DataFrameIndex, AnnDataOp, ArrayData}; use anyhow::{bail, Result}; -use bed_utils::{ - bed::{map::GIntervalIndexSet, BEDLike}, - coverage::SparseCoverage, -}; +use bed_utils::bed::{map::GIntervalIndexSet, BEDLike}; use indicatif::{ProgressIterator, ProgressStyle}; use polars::prelude::{DataFrame, NamedFrom, Series}; @@ -115,17 +110,18 @@ where let regions: GIntervalIndexSet = peaks.collect(); let data_iter: Box>; - let feature_names: Vec; + let feature_names: Vec; if use_x { - let counter = SparseCoverage::new(®ions); + let counter = RegionCounter::new(®ions); feature_names = counter.get_feature_ids(); - let data = adata.read_chrom_values(chunk_size)? + let data = adata + .read_chrom_values(chunk_size)? .aggregate_by(counter) .map(|x| x.0.into()); data_iter = Box::new(data); } else if let Ok(mut fragments) = adata.get_fragment_iter(chunk_size) { - let counter = SparseCoverage::new(®ions); + let counter = RegionCounter::new(®ions); feature_names = counter.get_feature_ids(); fragments = fragments.set_counting_strategy(counting_strategy); if let Some(min_fragment_size) = min_fragment_size { @@ -134,11 +130,19 @@ where if let Some(max_fragment_size) = max_fragment_size { fragments = fragments.max_fragment_size(max_fragment_size); } - data_iter = Box::new(fragments.into_aggregated_array_iter(counter).map(|x| x.0.into())); + data_iter = Box::new( + fragments + .into_aggregated_array_iter(counter) + .map(|x| x.0.into()), + ); } else if let Ok(values) = adata.get_base_iter(chunk_size) { - let counter = SparseCoverage::new(®ions); + let counter = RegionCounter::new(®ions); feature_names = counter.get_feature_ids(); - data_iter = Box::new(values.into_aggregated_array_iter(counter).map(|x| x.0.into())); + data_iter = Box::new( + values + .into_aggregated_array_iter(counter) + .map(|x| x.0.into()), + ); } else { bail!("No fragment data found in the anndata object"); } diff --git a/snapatac2-core/src/feature_count/mod.rs b/snapatac2-core/src/feature_count/mod.rs index 44347ad2..9870ccc9 100644 --- a/snapatac2-core/src/feature_count/mod.rs +++ b/snapatac2-core/src/feature_count/mod.rs @@ -1,5 +1,7 @@ mod matrix; -mod coverage; +mod counter; +mod data_iter; -pub use coverage::{BaseData, FragmentData, ContactData, CountingStrategy, FragmentDataIter}; +pub use data_iter::{ChromValueIter, BaseData, FragmentData, ContactData, FragmentDataIter}; +pub use counter::{FeatureCounter, CountingStrategy}; pub use matrix::{create_gene_matrix, create_tile_matrix, create_peak_matrix}; \ No newline at end of file diff --git a/snapatac2-core/src/genome.rs b/snapatac2-core/src/genome.rs index 8535971e..2496d85b 100644 --- a/snapatac2-core/src/genome.rs +++ b/snapatac2-core/src/genome.rs @@ -1,42 +1,33 @@ //! # Genomic Feature Counter Module //! -//! This module provides the functionality to count genomic features (such as genes or transcripts) -//! in genomic data. The primary structures in this module are `TranscriptCount` and `GeneCount`, -//! both of which implement the `FeatureCounter` trait. The `FeatureCounter` trait provides a -//! common interface for handling feature counts, including methods for resetting counts, +//! This module provides the functionality to count genomic features (such as genes or transcripts) +//! in genomic data. The primary structures in this module are `TranscriptCount` and `GeneCount`, +//! both of which implement the `FeatureCounter` trait. The `FeatureCounter` trait provides a +//! common interface for handling feature counts, including methods for resetting counts, //! updating counts, and retrieving feature IDs, names, and counts. //! -//! `SparseCoverage`, from the bed_utils crate, is used for maintaining counts of genomic features, +//! `SparseCoverage`, from the bed_utils crate, is used for maintaining counts of genomic features, //! and this structure also implements the `FeatureCounter` trait in this module. //! -//! `TranscriptCount` and `GeneCount` structures also hold a reference to `Promoters`, which +//! `TranscriptCount` and `GeneCount` structures also hold a reference to `Promoters`, which //! provides additional information about the genomic features being counted. //! -//! To handle the mapping of gene names to indices, an `IndexMap` is used in the `GeneCount` structure. -//! This allows for efficient look-up of gene indices by name, which is useful when summarizing counts +//! To handle the mapping of gene names to indices, an `IndexMap` is used in the `GeneCount` structure. +//! This allows for efficient look-up of gene indices by name, which is useful when summarizing counts //! at the gene level. //! -//! The module aims to provide a comprehensive, efficient, and flexible way to handle and manipulate +//! The module aims to provide a comprehensive, efficient, and flexible way to handle and manipulate //! genomic feature counts in Rust. -use noodles::{core::Position, gff, gff::record::Strand, gtf}; use anyhow::{bail, Context, Result}; -use std::{collections::{BTreeMap, HashMap}, fmt::Debug, io::BufRead, str::FromStr}; -use indexmap::map::IndexMap; use bed_utils::bed::map::GIntervalIndexSet; -use bed_utils::{bed::{GenomicRange, BEDLike}, coverage::SparseCoverage}; -use itertools::Itertools; -use num::{traits::{NumAssignOps, NumCast, ToPrimitive}, Num}; -use anndata::data::utils::to_csr_data; -use bed_utils::bed::BedGraph; +use bed_utils::bed::GenomicRange; +use indexmap::map::IndexMap; use indexmap::IndexSet; +use noodles::{core::Position, gff, gff::record::Strand, gtf}; use polars::frame::DataFrame; -use nalgebra_sparse::CsrMatrix; use polars::prelude::{NamedFrom, Series}; -use rayon::iter::{IntoParallelIterator, ParallelIterator}; use std::ops::Range; - -use crate::preprocessing::Fragment; -use crate::feature_count::CountingStrategy; +use std::{collections::HashMap, fmt::Debug, io::BufRead, str::FromStr}; /// Position is 1-based. #[derive(Clone, Debug, Eq, PartialEq)] @@ -82,12 +73,17 @@ fn from_gtf(record: >f::Record, options: &TranscriptParserOptions) -> Result String { - attributes.get(key).expect(&format!("failed to find '{}' in record: {}", key, record)) .to_string() + let get_attr = |key: &str| -> String { + attributes + .get(key) + .expect(&format!("failed to find '{}' in record: {}", key, record)) + .to_string() }; Ok(Transcript { - transcript_name: attributes.get(options.transcript_name_key.as_str()).map(|x| x.to_string()), + transcript_name: attributes + .get(options.transcript_name_key.as_str()) + .map(|x| x.to_string()), transcript_id: get_attr(options.transcript_id_key.as_str()), gene_name: get_attr(options.gene_name_key.as_str()), gene_id: get_attr(options.gene_id_key.as_str()), @@ -113,12 +109,17 @@ fn from_gff(record: &gff::Record, options: &TranscriptParserOptions) -> Result String { - attributes.get(key).expect(&format!("failed to find '{}' in record: {}", key, record)) .to_string() + let get_attr = |key: &str| -> String { + attributes + .get(key) + .expect(&format!("failed to find '{}' in record: {}", key, record)) + .to_string() }; Ok(Transcript { - transcript_name: attributes.get(options.transcript_name_key.as_str()).map(|x| x.to_string()), + transcript_name: attributes + .get(options.transcript_name_key.as_str()) + .map(|x| x.to_string()), transcript_id: get_attr(options.transcript_id_key.as_str()), gene_name: get_attr(options.gene_name_key.as_str()), gene_id: get_attr(options.gene_id_key.as_str()), @@ -144,14 +145,18 @@ impl Transcript { } } -pub fn read_transcripts_from_gtf(input: R, options: &TranscriptParserOptions) -> Result> +pub fn read_transcripts_from_gtf( + input: R, + options: &TranscriptParserOptions, +) -> Result> where R: BufRead, { let mut results = Vec::new(); input.lines().try_for_each(|line| { let line = line?; - let line = gtf::Line::from_str(&line).with_context(|| format!("failed to parse GTF line: {}", line))?; + let line = gtf::Line::from_str(&line) + .with_context(|| format!("failed to parse GTF line: {}", line))?; if let gtf::line::Line::Record(rec) = line { if rec.ty() == "transcript" { results.push(from_gtf(&rec, options)?); @@ -162,14 +167,18 @@ where Ok(results) } -pub fn read_transcripts_from_gff(input: R, options: &TranscriptParserOptions) -> Result> +pub fn read_transcripts_from_gff( + input: R, + options: &TranscriptParserOptions, +) -> Result> where R: BufRead, { let mut results = Vec::new(); input.lines().try_for_each(|line| { let line = line?; - let line = gff::Line::from_str(&line).with_context(|| format!("failed to parse GFF line: {}", line))?; + let line = gff::Line::from_str(&line) + .with_context(|| format!("failed to parse GFF line: {}", line))?; if let gff::line::Line::Record(rec) = line { if rec.ty() == "transcript" { results.push(from_gff(&rec, options)?); @@ -220,186 +229,6 @@ impl Promoters { } } - -/// `FeatureCounter` is a trait that provides an interface for counting genomic features. -/// Types implementing `FeatureCounter` can store feature counts and provide several -/// methods for manipulating and retrieving those counts. -pub trait FeatureCounter { - type Value; - - /// Returns the total number of distinct features counted. - fn num_features(&self) -> usize { self.get_feature_ids().len() } - - /// Resets the counter for all features. - fn reset(&mut self); - - /// Updates the counter according to the given region and count. - fn insert(&mut self, tag: &B, count: N); - - /// Updates the counter according to the given fragment - fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy); - - /// Returns a vector of feature ids. - fn get_feature_ids(&self) -> Vec; - - /// Returns a vector of feature names if available. - fn get_feature_names(&self) -> Option> { None } - - /// Returns a vector of tuples, each containing a feature's index and its count. - fn get_counts(&self) -> Vec<(usize, Self::Value)>; -} - -/// Implementation of `FeatureCounter` trait for `SparseCoverage` struct. -/// `SparseCoverage` represents a sparse coverage map for genomic data. -impl FeatureCounter for SparseCoverage<'_, T> { - type Value = T; - - fn reset(&mut self) { self.reset(); } - - fn insert(&mut self, tag: &B, count: N) { - self.insert(tag, ::from(count).unwrap()); - } - - fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) { - if tag.is_single() { - tag.to_insertions().iter().for_each(|x| { - self.insert(x, T::one()); - }); - } else { - match strategy { - CountingStrategy::Fragment => { - self.insert(tag, T::one()); - }, - CountingStrategy::Insertion => { - tag.to_insertions().iter().for_each(|x| { - self.insert(x, T::one()); - }); - }, - CountingStrategy::PIC => { - tag.to_insertions().into_iter() - .flat_map(|x| self.get_index(&x)) - .unique().collect::>().into_iter().for_each(|i| { - self.insert_at_index::(i, T::one()); - }); - } - } - } - } - - fn get_feature_ids(&self) -> Vec { - self.regions().map(|x| x.to_genomic_range().pretty_show()).collect() - } - - fn get_counts(&self) -> Vec<(usize, Self::Value)> { - self.get_coverage().iter().map(|(k, v)| (*k, *v)).collect() - } -} - -/// `TranscriptCount` is a struct that represents the count of genomic features at the transcript level. -/// It holds a `SparseCoverage` counter and a reference to `Promoters`. -#[derive(Clone)] -pub struct TranscriptCount<'a> { - counter: SparseCoverage<'a, u32>, - promoters: &'a Promoters, -} - -impl<'a> TranscriptCount<'a> { - pub fn new(promoters: &'a Promoters) -> Self { - Self { - counter: SparseCoverage::new(&promoters.regions), - promoters, - } - } - - pub fn gene_names(&self) -> Vec { - self.promoters - .transcripts - .iter() - .map(|x| x.gene_name.clone()) - .collect() - } -} - -/// `GeneCount` is a struct that represents the count of genomic features at the gene level. -/// It holds a `TranscriptCount` counter and a map from gene names to their indices. -#[derive(Clone)] -pub struct GeneCount<'a> { - counter: TranscriptCount<'a>, - gene_name_to_idx: IndexMap<&'a str, usize>, -} - -/// Implementation of `GeneCount` -impl<'a> GeneCount<'a> { - pub fn new(counter: TranscriptCount<'a>) -> Self { - let gene_name_to_idx: IndexMap<_, _> = counter - .promoters - .transcripts - .iter() - .map(|x| x.gene_name.as_str()) - .unique() - .enumerate() - .map(|(a, b)| (b, a)) - .collect(); - Self { - counter, - gene_name_to_idx, - } - } -} - -/// Implementations of `FeatureCounter` trait for `TranscriptCount` and `GeneCount` structs. -impl FeatureCounter for TranscriptCount<'_> { - type Value = u32; - - fn reset(&mut self) { self.counter.reset(); } - - fn insert(&mut self, tag: &B, count: N) { - self.counter.insert(tag, ::from(count).unwrap()); - } - - fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) { - self.counter.insert_fragment(tag, strategy); - } - - fn get_feature_ids(&self) -> Vec { - self.promoters.transcripts.iter().map(|x| x.transcript_id.clone()).collect() - } - - fn get_counts(&self) -> Vec<(usize, Self::Value)> { - self.counter.get_counts() - } -} - -impl FeatureCounter for GeneCount<'_> { - type Value = u32; - - fn reset(&mut self) { self.counter.reset(); } - - fn insert(&mut self, tag: &B, count: N) { - self.counter.insert(tag, ::from(count).unwrap()); - } - - fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) { - self.counter.insert_fragment(tag, strategy); - } - - fn get_feature_ids(&self) -> Vec { - self.gene_name_to_idx.keys().map(|x| x.to_string()).collect() - } - - fn get_counts(&self) -> Vec<(usize, Self::Value)> { - let mut counts = BTreeMap::new(); - self.counter.get_counts().into_iter().for_each(|(k, v)| { - let idx = *self.gene_name_to_idx.get( - self.counter.promoters.transcripts[k].gene_name.as_str() - ).unwrap(); - let current_v = counts.entry(idx).or_insert(v); - if *current_v < v { *current_v = v } - }); - counts.into_iter().collect() - } -} - #[derive(Debug, Clone, Eq, PartialEq)] pub struct ChromSizes(IndexMap); @@ -536,12 +365,16 @@ impl GenomeBaseIndex { pub fn with_step(&self, s: usize) -> Self { let mut prev = 0; let mut acc_low_res = 0; - let binned_accum_len = self.base_accum_len.iter().map(|acc| { - let length = acc - prev; - prev = *acc; - acc_low_res += num::Integer::div_ceil(&length, &(s as u64)); - acc_low_res - }).collect(); + let binned_accum_len = self + .base_accum_len + .iter() + .map(|acc| { + let length = acc - prev; + prev = *acc; + acc_low_res += num::Integer::div_ceil(&length, &(s as u64)); + acc_low_res + }) + .collect(); Self { chroms: self.chroms.clone(), base_accum_len: self.base_accum_len.clone(), @@ -552,7 +385,10 @@ impl GenomeBaseIndex { /// Given a genomic position, return the corresponding index. pub fn get_position_rev(&self, chrom: &str, pos: u64) -> usize { - let i = self.chroms.get_index_of(chrom).expect(format!("Chromosome {} not found", chrom).as_str()); + let i = self + .chroms + .get_index_of(chrom) + .expect(format!("Chromosome {} not found", chrom).as_str()); let size = if i == 0 { self.base_accum_len[i] } else { @@ -650,92 +486,6 @@ impl GenomeBaseIndex { } } -/// `ChromValues` is a type alias for a vector of `BedGraph` objects. -/// Each `BedGraph` instance represents a genomic region along with a -/// numerical value (like coverage or score). -pub type ChromValues = Vec>; - -/// `ChromValueIter` represents an iterator over the chromosome values. -/// Each item in the iterator is a tuple of a vector of `ChromValues` objects, -/// a start index, and an end index. -pub struct ChromValueIter { - pub(crate) iter: I, - pub(crate) regions: Vec, - pub(crate) length: usize, -} - -impl<'a, I, T> ChromValueIter -where - I: ExactSizeIterator, usize, usize)> + 'a, - T: Copy, -{ - /// Aggregate the values in the iterator by the given `FeatureCounter`. - pub fn aggregate_by( - self, - mut counter: C, - ) -> impl ExactSizeIterator, usize, usize)> - where - C: FeatureCounter + Clone + Sync, - T: Sync + Send + num::ToPrimitive, - { - let n_col = counter.num_features(); - counter.reset(); - self.iter.map(move |(mat, i, j)| { - let n = j - i; - let vec = (0..n) - .into_par_iter() - .map(|k| { - let row = mat.get_row(k).unwrap(); - let mut coverage = counter.clone(); - row.col_indices() - .into_iter() - .zip(row.values()) - .for_each(|(idx, val)| { - coverage.insert(&self.regions[*idx], *val); - }); - coverage.get_counts() - }) - .collect::>(); - let (r, c, offset, ind, data) = to_csr_data(vec, n_col); - (CsrMatrix::try_from_csr_data(r,c,offset,ind, data).unwrap(), i, j) - }) - } -} - -impl Iterator for ChromValueIter -where - I: Iterator, usize, usize)>, - T: Copy, -{ - type Item = (Vec>, usize, usize); - - fn next(&mut self) -> Option { - self.iter.next().map(|(x, start, end)| { - let values = x - .row_iter() - .map(|row| { - row.col_indices() - .iter() - .zip(row.values()) - .map(|(i, v)| BedGraph::from_bed(&self.regions[*i], *v)) - .collect() - }) - .collect(); - (values, start, end) - }) - } -} - -impl ExactSizeIterator for ChromValueIter -where - I: Iterator, usize, usize)>, - T: Copy, -{ - fn len(&self) -> usize { - self.length - } -} - #[cfg(test)] mod tests { use super::*; @@ -748,7 +498,9 @@ mod tests { ("1".to_owned(), 13), ("2".to_owned(), 71), ("3".to_owned(), 100), - ].into_iter().collect(); + ] + .into_iter() + .collect(); let mut index = GenomeBaseIndex::new(&chrom_sizes); assert_eq!(index.get_range("1").unwrap(), 0..13); @@ -808,7 +560,9 @@ mod tests { ("1".to_owned(), 13), ("2".to_owned(), 71), ("3".to_owned(), 100), - ].into_iter().collect(); + ] + .into_iter() + .collect(); let index = GenomeBaseIndex::new(&chrom_sizes); [(0, 0), (12, 12), (13, 13), (100, 100)] @@ -876,9 +630,8 @@ mod tests { expected ); - //let failed_line = "NC_051341.1\tGnomon\ttranscript\t26923605\t26924789\t.\t+\t.\tgene_id \"RGD1565766\"; transcript_id \"XM_039113095.1\"; db_xref \"GeneID:500622\"; gbkey \"mRNA\"; gene_name \"RGD1565766\"; model_evidence \"Supporting evidence includes similarity to: 1 EST, 4 Proteins, and 100% coverage of the annotated genomic feature by RNAseq alignments\"; product \"hypothetical gene supported by BC088468; NM_001009712\"; transcript_biotype \"mRNA\";"; let worked = "chr1\tG\ttranscript\t26\t92\t.\t+\t.\tgene_id \"RGD\"; transcript_id \"XM_5\"; gene_name \"RGD\"; note \"note1;note2\";"; assert!(read_transcripts_from_gtf(worked.as_bytes(), &Default::default()).is_ok()); } -} \ No newline at end of file +} diff --git a/snapatac2-core/src/lib.rs b/snapatac2-core/src/lib.rs index 426a841e..e96330e1 100644 --- a/snapatac2-core/src/lib.rs +++ b/snapatac2-core/src/lib.rs @@ -7,8 +7,8 @@ pub mod network; pub mod embedding; pub mod utils; -use feature_count::{BaseData, FragmentData, FragmentDataIter}; -use genome::{ChromSizes, ChromValueIter}; +use feature_count::{ChromValueIter, BaseData, FragmentData, FragmentDataIter}; +use genome::ChromSizes; use anndata::{ container::{ChunkedArrayElem, StackedChunkedArrayElem}, diff --git a/snapatac2-python/python/snapatac2/preprocessing/_basic.py b/snapatac2-python/python/snapatac2/preprocessing/_basic.py index 9a9ab548..b6e80a32 100644 --- a/snapatac2-python/python/snapatac2/preprocessing/_basic.py +++ b/snapatac2-python/python/snapatac2/preprocessing/_basic.py @@ -410,7 +410,8 @@ def import_values( chunk_size: int = 200, backend: Literal['hdf5'] = 'hdf5', ) -> internal.AnnData: - """Import chromatin contacts. + """Import values associated with base pairs, typically from experiments like + whole-genome bisulfite sequencing (WGBS). Parameters ----------