Skip to content

Commit

Permalink
allow computing avearge in FeatureCounter [wheel]
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Dec 4, 2024
1 parent a90f162 commit b2b1b5b
Show file tree
Hide file tree
Showing 8 changed files with 465 additions and 348 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ jobs:
CIBW_ARCHS_LINUX: "auto64"
CIBW_ENVIRONMENT: 'PATH="$PATH:$HOME/.cargo/bin"'
CIBW_ENVIRONMENT_MACOS: 'MACOSX_DEPLOYMENT_TARGET="10.9"'
CIBW_SKIP: "pp* *-win32 *-musllinux*"
CIBW_BUILD: ${{ matrix.python_version }}
CIBW_TEST_REQUIRES: pytest hypothesis==6.72.4
CIBW_TEST_COMMAND: "pytest {project}/snapatac2-python/tests"

MACOSX_DEPLOYMENT_TARGET: 10.7

steps:
- name: Checkout code
Expand Down
275 changes: 275 additions & 0 deletions snapatac2-core/src/feature_count/counter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
use bed_utils::bed::map::GIntervalIndexSet;
use bed_utils::bed::BEDLike;
use indexmap::map::IndexMap;
use itertools::Itertools;
use num::{
traits::{NumAssignOps, NumCast, ToPrimitive},
Num,
};
use std::{collections::BTreeMap, fmt::Debug};

use crate::genome::Promoters;
use crate::preprocessing::Fragment;

/// The `CountingStrategy` enum represents different counting strategies.
/// It is used to count the number of fragments that overlap for a given list of genomic features.
/// Three counting strategies are supported: Insertion, Fragment, and Paired-Insertion Counting (PIC).
#[derive(Clone, Copy, Debug)]
pub enum CountingStrategy {
Insertion, // Insertion based counting
Fragment, // Fragment based counting
PIC, // Paired-Insertion Counting (PIC)
}

/// `FeatureCounter` is a trait that provides an interface for counting genomic features.
/// Types implementing `FeatureCounter` can store feature counts and provide several
/// methods for manipulating and retrieving those counts.
pub trait FeatureCounter {
type Value;

/// Returns the total number of distinct features counted.
fn num_features(&self) -> usize {
self.get_feature_ids().len()
}

/// Resets the counter for all features.
fn reset(&mut self);

/// Updates the counter according to the given region and count.
fn insert<B: BEDLike, N: ToPrimitive + Copy>(&mut self, tag: &B, count: N);

/// Updates the counter according to the given fragment
fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy);

/// Returns a vector of feature ids.
fn get_feature_ids(&self) -> Vec<String>;

/// Returns a vector of feature names if available.
fn get_feature_names(&self) -> Option<Vec<String>> {
None
}

/// Returns a vector of tuples, each containing a feature's index and its count.
fn get_values(&self) -> Vec<(usize, Self::Value)>;

/// Returns a vector of tuples, each containing a feature's index and its average count.
fn get_values_and_counts(&self) -> impl Iterator<Item = (usize, (Self::Value, usize))>;
}

#[derive(Clone)]
pub struct RegionCounter<'a, V> {
regions: &'a GIntervalIndexSet,
values: BTreeMap<usize, (V, usize)>,
}

impl<'a, V> RegionCounter<'a, V> {
pub fn new(regions: &'a GIntervalIndexSet) -> Self {
Self {
regions,
values: BTreeMap::new(),
}
}
}

impl<V: Num + NumCast + NumAssignOps + Copy> FeatureCounter for RegionCounter<'_, V> {
type Value = V;

fn reset(&mut self) {
self.values.clear();
}

fn insert<B: BEDLike, N: ToPrimitive + Copy>(&mut self, tag: &B, count: N) {
let val = <V as NumCast>::from(count).unwrap();
self.regions.find_index_of(tag).for_each(|idx| {
self.values
.entry(idx)
.and_modify(|(v, c)| {
*v += val;
*c += 1;
})
.or_insert((val, 1));
});
}

fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) {
if tag.is_single() {
tag.to_insertions().iter().for_each(|x| {
self.insert(x, V::one());
});
} else {
match strategy {
CountingStrategy::Fragment => {
self.insert(tag, V::one());
}
CountingStrategy::Insertion => {
tag.to_insertions().iter().for_each(|x| {
self.insert(x, V::one());
});
}
CountingStrategy::PIC => {
tag.to_insertions()
.into_iter()
.flat_map(|x| self.regions.find_index_of(&x))
.unique()
.collect::<Vec<_>>()
.into_iter()
.for_each(|i| {
self.values
.entry(i)
.and_modify(|(v, c)| {
*v += V::one();
*c += 1;
})
.or_insert((V::one(), 1));
});
}
}
}
}

fn get_feature_ids(&self) -> Vec<String> {
self.regions
.iter()
.map(|x| x.to_genomic_range().pretty_show())
.collect()
}

fn get_values(&self) -> Vec<(usize, Self::Value)> {
self.values.iter().map(|(k, v)| (*k, v.0)).collect()
}

fn get_values_and_counts(&self) -> impl Iterator<Item = (usize, (Self::Value, usize))> {
self.values
.iter().map(|(k, v)| (*k, (v.0, v.1)))
}
}

/// `TranscriptCount` is a struct that represents the count of genomic features at the transcript level.
/// It holds a `SparseCoverage` counter and a reference to `Promoters`.
#[derive(Clone)]
pub struct TranscriptCount<'a> {
counter: RegionCounter<'a, u32>,
promoters: &'a Promoters,
}

impl<'a> TranscriptCount<'a> {
pub fn new(promoters: &'a Promoters) -> Self {
Self {
counter: RegionCounter::new(&promoters.regions),
promoters,
}
}

pub fn gene_names(&self) -> Vec<String> {
self.promoters
.transcripts
.iter()
.map(|x| x.gene_name.clone())
.collect()
}
}

/// `GeneCount` is a struct that represents the count of genomic features at the gene level.
/// It holds a `TranscriptCount` counter and a map from gene names to their indices.
#[derive(Clone)]
pub struct GeneCount<'a> {
counter: TranscriptCount<'a>,
gene_name_to_idx: IndexMap<&'a str, usize>,
}

/// Implementation of `GeneCount`
impl<'a> GeneCount<'a> {
pub fn new(counter: TranscriptCount<'a>) -> Self {
let gene_name_to_idx: IndexMap<_, _> = counter
.promoters
.transcripts
.iter()
.map(|x| x.gene_name.as_str())
.unique()
.enumerate()
.map(|(a, b)| (b, a))
.collect();
Self {
counter,
gene_name_to_idx,
}
}
}

/// Implementations of `FeatureCounter` trait for `TranscriptCount` and `GeneCount` structs.
impl FeatureCounter for TranscriptCount<'_> {
type Value = u32;

fn reset(&mut self) {
self.counter.reset();
}

fn insert<B: BEDLike, N: ToPrimitive + Copy>(&mut self, tag: &B, count: N) {
self.counter
.insert(tag, <u32 as NumCast>::from(count).unwrap());
}

fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) {
self.counter.insert_fragment(tag, strategy);
}

fn get_feature_ids(&self) -> Vec<String> {
self.promoters
.transcripts
.iter()
.map(|x| x.transcript_id.clone())
.collect()
}

fn get_values(&self) -> Vec<(usize, Self::Value)> {
self.counter.get_values()
}

fn get_values_and_counts(&self) -> impl Iterator<Item = (usize, (Self::Value, usize))> {
self.counter.get_values_and_counts()
}
}

impl FeatureCounter for GeneCount<'_> {
type Value = u32;

fn reset(&mut self) {
self.counter.reset();
}

fn insert<B: BEDLike, N: ToPrimitive + Copy>(&mut self, tag: &B, count: N) {
self.counter
.insert(tag, <u32 as NumCast>::from(count).unwrap());
}

fn insert_fragment(&mut self, tag: &Fragment, strategy: &CountingStrategy) {
self.counter.insert_fragment(tag, strategy);
}

fn get_feature_ids(&self) -> Vec<String> {
self.gene_name_to_idx
.keys()
.map(|x| x.to_string())
.collect()
}

fn get_values(&self) -> Vec<(usize, Self::Value)> {
let mut counts = BTreeMap::new();
self.counter.get_values().into_iter().for_each(|(k, v)| {
let idx = *self
.gene_name_to_idx
.get(self.counter.promoters.transcripts[k].gene_name.as_str())
.unwrap();
let current_v = counts.entry(idx).or_insert(v);
if *current_v < v {
*current_v = v
}
});
counts.into_iter().collect()
}

fn get_values_and_counts(&self) -> impl Iterator<Item = (usize, (Self::Value, usize))> {
todo!();
self.counter.get_values_and_counts()
}
}
Loading

0 comments on commit b2b1b5b

Please sign in to comment.