Skip to content

Commit

Permalink
store ratio instead of f32
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Dec 5, 2024
1 parent 8692b58 commit 99bfcc5
Show file tree
Hide file tree
Showing 8 changed files with 316 additions and 100 deletions.
351 changes: 290 additions & 61 deletions snapatac2-core/src/feature_count/data_iter.rs

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions snapatac2-core/src/feature_count/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ mod data_iter;
use std::str::FromStr;

use anyhow::{bail, Context, Result};
use anndata::{AnnData, AnnDataOp, AnnDataSet, ArrayElemOp, AxisArraysOp, Backend, ElemCollectionOp};
use anndata::{data::DynCsrMatrix, AnnData, AnnDataOp, AnnDataSet, ArrayElemOp, AxisArraysOp, Backend, ElemCollectionOp};
use bed_utils::bed::GenomicRange;
pub use data_iter::{ChromValueIter, BaseData, FragmentData, ContactData, FragmentDataIter};
pub use data_iter::{BaseValue, ChromValueIter, BaseData, FragmentData, ContactData, FragmentDataIter};
pub use counter::{FeatureCounter, CountingStrategy};
pub use matrix::{create_gene_matrix, create_tile_matrix, create_peak_matrix};
use nalgebra_sparse::CsrMatrix;
Expand All @@ -28,7 +28,7 @@ pub trait SnapData: AnnDataOp {
fn get_fragment_iter(&self, chunk_size: usize) -> Result<FragmentData>;

/// Read base values stored in the `.obsm` matrix.
fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>>;
fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (DynCsrMatrix, usize, usize)>>>;

/// Read counts stored in the `X` matrix.
fn read_chrom_values(
Expand Down Expand Up @@ -85,9 +85,9 @@ impl<B: Backend> SnapData for AnnData<B> {
Ok(FragmentData::new(self.read_chrom_sizes()?, matrices))
}

fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>> {
fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (DynCsrMatrix, usize, usize)>>> {
let obsm = self.obsm();
if let Some(data) = obsm.get_item_iter::<CsrMatrix<f32>>(BASE_VALUE, chunk_size) {
if let Some(data) = obsm.get_item_iter(BASE_VALUE, chunk_size) {
Ok(BaseData::new(self.read_chrom_sizes()?, data))
} else {
bail!("key '_values' is not present in the '.obsm'")
Expand All @@ -111,10 +111,10 @@ impl<B: Backend> SnapData for AnnDataSet<B> {
Ok(FragmentData::new(self.read_chrom_sizes()?, matrices))
}

fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>>
fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (DynCsrMatrix, usize, usize)>>>
{
let obsm = self.obsm();
if let Some(data) = obsm.get_item_iter::<CsrMatrix<f32>>(BASE_VALUE, chunk_size) {
if let Some(data) = obsm.get_item_iter(BASE_VALUE, chunk_size) {
Ok(BaseData::new(self.read_chrom_sizes()?, data))
} else {
bail!("key '_values' is not present in the '.obsm'")
Expand Down
17 changes: 5 additions & 12 deletions snapatac2-core/src/preprocessing/import.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::feature_count::{ContactData, BASE_VALUE, FRAGMENT_PAIRED, FRAGMENT_SINGLE};
use crate::feature_count::{BaseValue, ContactData, BASE_VALUE, FRAGMENT_PAIRED, FRAGMENT_SINGLE};
use crate::genome::{ChromSizes, GenomeBaseIndex};
use crate::preprocessing::qc::{Contact, Fragment, FragmentQC, FragmentQCBuilder};

Expand Down Expand Up @@ -312,13 +312,6 @@ where
Ok(())
}

pub struct ChromValue {
pub chrom: String,
pub pos: u64,
pub value: f32,
pub barcode: String,
}

/// Import values
pub fn import_values<A, I>(
anndata: &A,
Expand All @@ -328,7 +321,7 @@ pub fn import_values<A, I>(
) -> Result<()>
where
A: AnnDataOp,
I: Iterator<Item = ChromValue>,
I: Iterator<Item = (String, BaseValue)>,
{
let spinner = ProgressBar::with_draw_target(None, ProgressDrawTarget::stderr_with_hz(1))
.with_style(
Expand All @@ -343,7 +336,7 @@ where
let mut scanned_barcodes = IndexSet::new();

let mut qc_metrics = Vec::new();
let chunked_values = values.chunk_by(|x| x.barcode.clone());
let chunked_values = values.chunk_by(|x| x.0.clone());
let chunked_values = chunked_values
.into_iter()
.progress_with(spinner)
Expand All @@ -365,12 +358,12 @@ where
let mut qc = BaseValueQC::new();
let mut count = cell_data
.into_iter()
.flat_map(|value| {
.flat_map(|(_, value)| {
let chrom = &value.chrom;
if genome_index.contain_chrom(chrom) {
qc.add();
let pos = genome_index.get_position_rev(chrom, value.pos);
Some((pos, value.value))
Some((pos, value.value()))
} else {
None
}
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/src/preprocessing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ mod import;
mod qc;

pub use bam::{make_fragment_file, BamQC, FlagStat};
pub use import::{import_contacts, import_fragments, import_values, ChromValue};
pub use import::{import_contacts, import_fragments, import_values};
pub use qc::{
SummaryType,
get_barcode_count, make_promoter_map,
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/src/preprocessing/qc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ pub trait QualityControl: SnapData {
} else if let Ok(values) = self.get_base_iter(2000) {
values.into_values().for_each(|(data, s, _)| {
data.into_iter().enumerate().for_each(|(i, values)| {
let values = values.into_iter().map(|x| (x.chrom.to_string(), x.value));
let values = values.into_iter().map(|x| (x.chrom.to_string(), x.value()));
let stat = match mode {
SummaryType::Sum => sum(values),
SummaryType::Count => count(values),
Expand Down
1 change: 1 addition & 0 deletions snapatac2-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ linfa = "0.6"
linfa-clustering = "0.6"
noodles = { version = "0.84", features = ["bam", "sam"] }
numpy = "0.21.0"
num = "0.4"
nalgebra-sparse = "0.9"
nalgebra = "0.32"
ndarray = "0.15"
Expand Down
20 changes: 8 additions & 12 deletions snapatac2-python/src/preprocessing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use itertools::Itertools;
use pyo3::{prelude::*, pybacked::PyBackedStr};
use anndata::Backend;
use anndata_hdf5::H5;
use num::rational::Ratio;
use std::collections::HashMap;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
Expand All @@ -16,8 +17,8 @@ use anyhow::Result;
use snapatac2_core::{
QualityControl,
genome::TranscriptParserOptions,
feature_count::{create_gene_matrix, create_tile_matrix, create_peak_matrix, CountingStrategy},
preprocessing::{Fragment, Contact, ChromValue},
feature_count::{BaseValue, create_gene_matrix, create_tile_matrix, create_peak_matrix, CountingStrategy},
preprocessing::{Fragment, Contact},
preprocessing,
utils,
};
Expand Down Expand Up @@ -191,23 +192,18 @@ pub(crate) fn import_values(
chunk_size: usize,
) -> Result<()>
{
fn read_chrom_values(path: PathBuf) -> impl Iterator<Item = ChromValue> {
fn read_chrom_values(path: PathBuf) -> impl Iterator<Item = (String, BaseValue)> {
let barcode = path.file_stem().unwrap().to_str().unwrap().to_string();
let reader = BufReader::new(utils::open_file_for_read(&path));
reader.lines().skip(1).map(move |line| {
let line = line.unwrap();
let mut parts = line.split_whitespace();
let chrom = parts.next().unwrap();
let pos = parts.next().unwrap().parse().unwrap();
parts.next();
parts.next();
let value = parts.next().unwrap().parse().unwrap();
ChromValue {
chrom: chrom.to_string(),
pos,
value,
barcode: barcode.clone(),
}
let methyl = parts.next().unwrap().parse().unwrap();
let unmethyl: u16 = parts.next().unwrap().parse().unwrap();
let value = BaseValue::from_ratio(chrom, pos, Ratio::new_raw(methyl, unmethyl + methyl));
(barcode.clone(), value)
})
}

Expand Down
9 changes: 3 additions & 6 deletions snapatac2-python/src/utils/anndata.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
use anndata::{
data::{ArrayChunk, DataFrameIndex},
AnnDataOp, ArrayData, HasShape,
WriteArrayData, AxisArraysOp,
data::{ArrayChunk, DataFrameIndex, DynCsrMatrix}, AnnDataOp, ArrayData, AxisArraysOp, HasShape, WriteArrayData
};
use nalgebra_sparse::CsrMatrix;
use anyhow::{Result, bail};
use polars::prelude::DataFrame;
use pyanndata::anndata::memory;
Expand Down Expand Up @@ -154,9 +151,9 @@ impl<'py> SnapData for PyAnnData<'py> {
Ok(FragmentData::new(self.read_chrom_sizes()?, matrices))
}

fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (CsrMatrix<f32>, usize, usize)>>> {
fn get_base_iter(&self, chunk_size: usize) -> Result<BaseData<impl ExactSizeIterator<Item = (DynCsrMatrix, usize, usize)>>> {
let obsm = self.obsm();
if let Some(data) = obsm.get_item_iter::<CsrMatrix<f32>>(BASE_VALUE, chunk_size) {
if let Some(data) = obsm.get_item_iter(BASE_VALUE, chunk_size) {
Ok(BaseData::new(self.read_chrom_sizes()?, data))
} else {
bail!("key '_values' is not present in the '.obsm'")
Expand Down

0 comments on commit 99bfcc5

Please sign in to comment.