Skip to content

Commit

Permalink
add metrics.summary_by_chrom.
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Dec 5, 2024
1 parent bd88f41 commit 6c30260
Show file tree
Hide file tree
Showing 9 changed files with 287 additions and 103 deletions.
3 changes: 2 additions & 1 deletion docs/api/metrics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ Quality Control

metrics.frag_size_distr
metrics.tsse
metrics.frip
metrics.frip
metrics.summary_by_chrom
1 change: 1 addition & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- BedGraph generation in `ex.export_coverage` is 10x faster.
- Implement broad peak calling in `tl.macs3`.
- Add `pp.import_values` for importing single base pair values.
- Add `metrics.summary_by_chrom`.

### Bugs fixed:

Expand Down
41 changes: 41 additions & 0 deletions snapatac2-core/src/feature_count/data_iter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,13 @@ where
}
}

#[derive(Debug, Clone)]
pub struct BaseValue {
pub chrom: String,
pub pos: u64,
pub value: f32,
}

pub struct BaseData<I> {
index: GenomeBaseIndex,
data_iter: I,
Expand Down Expand Up @@ -517,6 +524,40 @@ where
self
}

/// Return an iterator of raw values.
pub fn into_values(
self,
) -> impl ExactSizeIterator<Item = (Vec<Vec<BaseValue>>, usize, usize)> {
self.data_iter.map(move |(mat, a, b)| {
let row_offsets = mat.row_offsets();
let col_indices = mat.col_indices();
let values = mat.values();
let values = (0..(row_offsets.len() - 1))
.into_par_iter()
.map(|i| {
let row_start = row_offsets[i];
let row_end = row_offsets[i + 1];
(row_start..row_end)
.flat_map(|j| {
let (chrom, start) = self.index.get_position(col_indices[j]);
if self.exclude_chroms.contains(chrom) {
None
} else {
let v = values[j];
Some(BaseValue {
chrom: chrom.to_string(),
pos: start,
value: v,
})
}
})
.collect()
})
.collect();
(values, a, b)
})
}

/// Output the raw coverage matrix. Note the values belong to the same interval
/// will be aggregated by the mean value.
pub fn into_array_iter(self) -> impl ExactSizeIterator<Item = (ArrayData, usize, usize)>
Expand Down
25 changes: 17 additions & 8 deletions snapatac2-core/src/preprocessing/import.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use crate::feature_count::{ContactData, BASE_VALUE, FRAGMENT_PAIRED, FRAGMENT_SINGLE};
use crate::genome::{ChromSizes, GenomeBaseIndex};
use crate::preprocessing::qc::{Contact, Fragment, FragmentSummary, FragmentQC};
use crate::preprocessing::qc::{Contact, Fragment, FragmentQC, FragmentQCBuilder};

use super::qc::BaseValueQC;
use anndata::{
data::array::utils::{from_csr_data, to_csr_data},
AnnDataOp, ArrayData, AxisArraysOp, ElemCollectionOp,
Expand Down Expand Up @@ -165,7 +166,7 @@ where
V: TryFrom<i64> + Ord,
<V as TryFrom<i64>>::Error: std::fmt::Debug,
{
let mut qc = FragmentSummary::new(mitochrondrial_dna);
let mut qc = FragmentQCBuilder::new(mitochrondrial_dna);
let mut values = Vec::new();
fragments.into_iter().for_each(|f| {
let chrom = &f.chrom;
Expand Down Expand Up @@ -204,7 +205,7 @@ where
}
});
values.sort();
(qc.get_qc(), values)
(qc.finish(), values)
}

fn qc_to_df(qc: Vec<FragmentQC>) -> DataFrame {
Expand Down Expand Up @@ -341,6 +342,7 @@ where
let genome_size = genome_index.len();
let mut scanned_barcodes = IndexSet::new();

let mut qc_metrics = Vec::new();
let chunked_values = values.chunk_by(|x| x.barcode.clone());
let chunked_values = chunked_values
.into_iter()
Expand All @@ -357,14 +359,16 @@ where
})
.collect();

let counts = chunk
let (qc, counts): (Vec<_>, Vec<_>) = chunk
.into_par_iter()
.map(|data| {
let mut count = data
.map(|cell_data| {
let mut qc = BaseValueQC::new();
let mut count = cell_data
.into_iter()
.flat_map(|value| {
let chrom = &value.chrom;
if genome_index.contain_chrom(chrom) {
qc.add();
let pos = genome_index.get_position_rev(chrom, value.pos);
Some((pos, value.value))
} else {
Expand All @@ -373,9 +377,10 @@ where
})
.collect::<Vec<_>>();
count.sort_by(|x, y| x.0.cmp(&y.0));
count
(qc, count)
})
.collect::<Vec<_>>();
.unzip();
qc_metrics.extend(qc);
let (r, c, offset, ind, csr_data) = to_csr_data(counts, genome_size);
CsrMatrix::try_from_csr_data(r, c, offset, ind, csr_data).unwrap()
});
Expand All @@ -385,5 +390,9 @@ where
.uns()
.add("reference_sequences", chrom_sizes.to_dataframe())?;
anndata.set_obs_names(scanned_barcodes.into_iter().collect())?;
anndata.set_obs(DataFrame::new(vec![Series::new(
"num_values",
qc_metrics.iter().map(|x| x.num_values).collect::<Series>(),
)])?)?;
Ok(())
}
3 changes: 2 additions & 1 deletion snapatac2-core/src/preprocessing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod qc;
pub use bam::{make_fragment_file, BamQC, FlagStat};
pub use import::{import_contacts, import_fragments, import_values, ChromValue};
pub use qc::{
fraction_of_reads_in_region, fragment_size_distribution, get_barcode_count, make_promoter_map,
SummaryType,
get_barcode_count, make_promoter_map,
read_tss, CellBarcode, Contact, Fragment, QualityControl, TSSe, TssRegions,
};
Loading

0 comments on commit 6c30260

Please sign in to comment.