Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Oct 8, 2023
1 parent 9c84520 commit 8accb32
Show file tree
Hide file tree
Showing 24 changed files with 242 additions and 188 deletions.
2 changes: 1 addition & 1 deletion snapatac2-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ anndata = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "7279f0a8a
anyhow = "1.0"
bigtools = "0.2"
bincode = "1.3"
bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "19832eda2909dbe87289e09ba966af0797adb9f8" }
bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "27ae1ee81db4ac84434014877b8d17aa22583df2" }
extsort = "0.4"
either = "1.8"
flate2 = "1.0"
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/src/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ pub trait Exporter: SnapData {
if let Some(barcodes_) = barcodes {
let bc = barcodes_[start + i];
entry.extend(xs.into_iter().map(|mut x| {
x.name = Some(bc.to_string());
x.barcode = Some(bc.to_string());
x
}));
} else {
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/src/preprocessing/bam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ pub fn make_fragment_file<P1: AsRef<Path>, P2: AsRef<Path>>(
let header: Header = fix_header(reader.read_header().unwrap()).parse().unwrap();
reader.read_reference_sequences().unwrap();

let f = File::create(output_file.as_ref().clone()).expect("cannot create the output file");
let f = File::create(output_file.as_ref()).expect("cannot create the output file");
let mut output: Box<dyn Write> =
if output_file.as_ref().extension().and_then(|x| x.to_str()) == Some("gz") {
Box::new(GzEncoder::new(BufWriter::new(f), Compression::default()))
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/src/preprocessing/count_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ pub trait SnapData: AnnDataOp {

/// Compute the fraction of reads in each region.
fn frip<D>(&self, regions: &Vec<BedTree<D>>) -> Result<Array2<f64>> {
let vec = qc::fraction_in_region(self.get_count_iter(2000)?.into_raw(), regions)
let vec = qc::fraction_of_reads_in_region(self.get_count_iter(2000)?.into_raw(), regions)
.map(|x| x.0).flatten().flatten().collect::<Vec<_>>();
Array2::from_shape_vec((self.n_obs(), regions.len()), vec).map_err(Into::into)
}
Expand Down
55 changes: 27 additions & 28 deletions snapatac2-core/src/preprocessing/count_data/coverage.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use crate::preprocessing::count_data::genome::{FeatureCounter, GenomeBaseIndex, ChromSizes};
use crate::preprocessing::{count_data::genome::{FeatureCounter, GenomeBaseIndex, ChromSizes}, Fragment};

use std::collections::HashMap;
use anndata::data::{utils::to_csr_data, CsrNonCanonical};
use bed_utils::bed::{BEDLike, BED, Strand, GenomicRange};
use bed_utils::bed::{BEDLike, Strand, GenomicRange};
use nalgebra_sparse::{CsrMatrix, pattern::SparsityPattern};
use num::traits::{FromPrimitive, One, Zero, SaturatingAdd};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
Expand Down Expand Up @@ -70,7 +70,7 @@ where
self
}

pub fn into_raw(self) -> impl ExactSizeIterator<Item = (Vec<Vec<BED<6>>>, usize, usize)> {
pub fn into_raw(self) -> impl ExactSizeIterator<Item = (Vec<Vec<Fragment>>, usize, usize)> {
let index = self.index;
self.coverage.map(move |(raw_mat, a, b)| {
let beds = match raw_mat {
Expand All @@ -85,25 +85,23 @@ where
let size = values[j];
let (chrom, pos) = index.get_position(col_indices[j]);
if size > 0 {
BED::new(
chrom,
pos,
pos + size as u64,
None,
None,
Some(Strand::Forward),
Default::default(),
)
Fragment {
chrom: chrom.to_string(),
start: pos,
end: pos + size as u64,
barcode: None,
count: 1,
strand: Some(Strand::Forward),
}
} else {
BED::new(
chrom,
pos + 1 - size.abs() as u64,
pos + 1,
None,
None,
Some(Strand::Reverse),
Default::default(),
)
Fragment {
chrom: chrom.to_string(),
start: pos + 1 - size.abs() as u64,
end: pos + 1,
barcode: None,
count: 1,
strand: Some(Strand::Reverse),
}
}
}).collect()
}).collect()
Expand All @@ -118,12 +116,14 @@ where
(row_start..row_end).map(|j| {
let size = values[j];
let (chrom, start) = index.get_position(col_indices[j]);
BED::new(
chrom,
Fragment {
chrom: chrom.to_string(),
start,
start + size as u64,
None, None, None, Default::default(),
)
end: start + size as u64,
barcode: None,
count: 1,
strand: None,
}
}).collect()
}).collect()
},
Expand All @@ -132,7 +132,7 @@ where
})
}

pub fn into_raw_groups<F, K>(self, key: F) -> impl ExactSizeIterator<Item = HashMap<K, Vec<BED<6>>>>
pub fn into_raw_groups<F, K>(self, key: F) -> impl ExactSizeIterator<Item = HashMap<K, Vec<Fragment>>>
where
F: Fn(usize) -> K,
K: Eq + PartialEq + std::hash::Hash,
Expand All @@ -146,7 +146,6 @@ where
.or_insert_with(Vec::new)
.extend(xs.into_iter());
});

ordered
})
}
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-core/src/preprocessing/count_data/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ where
anndata.obsm().add_iter(
obsm_key,
fragments
.group_by(|x| x.barcode.clone())
.group_by(|x| x.name().unwrap().to_string())
.into_iter()
.progress_with(spinner)
.filter(|(key, _)| white_list.map_or(true, |x| x.contains(key)))
Expand Down
88 changes: 59 additions & 29 deletions snapatac2-core/src/preprocessing/qc.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
use std::{io::{Read, BufRead, BufReader}, ops::Div, collections::{HashMap, HashSet}};
use anndata::data::CsrNonCanonical;
use bed_utils::bed::{
GenomicRange, BEDLike, tree::BedTree,
ParseError, Strand, BED,
};
use bed_utils::bed::{GenomicRange, BEDLike, tree::BedTree, ParseError, Strand};
use anyhow::Result;
use serde::{Serialize, Deserialize};
use extsort::sorter::Sortable;
use bincode;
use smallvec::{SmallVec, smallvec};

pub type CellBarcode = String;

Expand All @@ -18,7 +16,7 @@ pub struct Fragment {
pub chrom: String,
pub start: u64,
pub end: u64,
pub barcode: CellBarcode,
pub barcode: Option<CellBarcode>,
pub count: u32,
pub strand: Option<Strand>,
}
Expand All @@ -35,16 +33,16 @@ impl Sortable for Fragment {
}

impl Fragment {
pub fn to_insertions(&self) -> Vec<GenomicRange> {
pub fn to_reads(&self) -> SmallVec<[GenomicRange; 2]> {
match self.strand {
None => vec![
None => smallvec![
GenomicRange::new(self.chrom.clone(), self.start, self.start + 1),
GenomicRange::new(self.chrom.clone(), self.end - 1, self.end),
],
Some(Strand::Forward) => vec![
Some(Strand::Forward) => smallvec![
GenomicRange::new(self.chrom.clone(), self.start, self.start + 1)
],
Some(Strand::Reverse) => vec![
Some(Strand::Reverse) => smallvec![
GenomicRange::new(self.chrom.clone(), self.end - 1, self.end)
],
}
Expand All @@ -67,9 +65,33 @@ impl BEDLike for Fragment {
self.end = end;
self
}
fn name(&self) -> Option<&str> { None }
fn score(&self) -> Option<bed_utils::bed::Score> { None }
fn strand(&self) -> Option<Strand> { None }
fn name(&self) -> Option<&str> {
self.barcode.as_deref()
}
fn score(&self) -> Option<bed_utils::bed::Score> {
Some(self.count.try_into().unwrap())
}
fn strand(&self) -> Option<Strand> {
self.strand
}
}

impl core::fmt::Display for Fragment {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
write!(
f,
"{}\t{}\t{}\t{}\t{}",
self.chrom(),
self.start(),
self.end(),
self.barcode.as_deref().unwrap_or("."),
self.count,
)?;
if let Some(strand) = self.strand() {
write!(f, "\t{}", strand)?;
}
Ok(())
}
}

impl std::str::FromStr for Fragment {
Expand All @@ -82,8 +104,13 @@ impl std::str::FromStr for Fragment {
.and_then(|s| lexical::parse(s).map_err(ParseError::InvalidStartPosition))?;
let end = fields.next().ok_or(ParseError::MissingEndPosition)
.and_then(|s| lexical::parse(s).map_err(ParseError::InvalidEndPosition))?;
let barcode = fields.next().ok_or(ParseError::MissingName)
.map(|s| s.into())?;
let barcode = fields
.next()
.ok_or(ParseError::MissingName)
.map(|s| match s {
"." => None,
_ => Some(s.into()),
})?;
let count = fields.next().map_or(Ok(1), |s| if s == "." {
Ok(1)
} else {
Expand Down Expand Up @@ -245,15 +272,15 @@ where
{
let mut barcodes = HashMap::new();
fragments.for_each(|frag| {
let key = frag.barcode.clone();
let key = frag.barcode.unwrap().clone();
*barcodes.entry(key).or_insert(0) += 1;
});
barcodes
}

pub fn tss_enrichment<I>(fragments: I, promoter: &BedTree<bool>) -> f64
where
I: Iterator<Item = BED<6>>,
I: Iterator<Item = Fragment>,
{
fn find_pos<'a>(promoter: &'a BedTree<bool>, ins: &'a GenomicRange) -> impl Iterator<Item = usize> + 'a {
promoter.find(ins).map(|(entry, data)| {
Expand Down Expand Up @@ -316,25 +343,28 @@ pub fn fragment_size_distribution<I>(data: I, max_size: usize) -> Vec<usize>
size_dist
}

/// Count the fraction of the records in the given regions.
pub fn fraction_in_region<'a, I, D>(
/// Count the fraction of the reads in the given regions.
pub fn fraction_of_reads_in_region<'a, I, D>(
iter: I, regions: &'a Vec<BedTree<D>>,
) -> impl Iterator<Item = (Vec<Vec<f64>>, usize, usize)> + 'a
where
I: Iterator<Item = (Vec<Vec<BED<6>>>, usize, usize)> + 'a,
I: Iterator<Item = (Vec<Vec<Fragment>>, usize, usize)> + 'a,
{
let k = regions.len();
iter.map(move |(beds, start, end)| {
let frac = beds.into_iter().map(|xs| {
let sum = xs.len() as f64;
iter.map(move |(data, start, end)| {
let frac = data.into_iter().map(|fragments| {
let mut sum = 0.0;
let mut counts = vec![0.0; k];
xs.into_iter().for_each(|x|
regions.iter().enumerate().for_each(|(i, r)| {
if r.is_overlapped(&x) {
counts[i] += 1.0;
}
})
);
fragments
.into_iter().flat_map(|fragment| fragment.to_reads())
.for_each(|read| {
sum += 1.0;
regions.iter().enumerate().for_each(|(i, r)|
if r.is_overlapped(&read) {
counts[i] += 1.0;
}
)
});
counts.iter_mut().for_each(|x| *x /= sum);
counts
}).collect::<Vec<_>>();
Expand Down
2 changes: 1 addition & 1 deletion snapatac2-python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ anndata-hdf5 = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "7279
pyanndata = { git = "https://github.com/kaizhang/anndata-rs.git", rev = "7279f0a8a2bf8a40aef5067af6dd2b4dca44a437" }
extsort = "0.4"
anyhow = "1.0"
bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "19832eda2909dbe87289e09ba966af0797adb9f8" }
bed-utils = { git = "https://github.com/kaizhang/bed-utils.git", rev = "27ae1ee81db4ac84434014877b8d17aa22583df2" }
flate2 = "1.0"
itertools = "0.11"
indicatif = "0.17"
Expand Down
10 changes: 5 additions & 5 deletions snapatac2-python/snapatac2/_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import numpy as np
import anndata as ad
import logging

from snapatac2._snapatac2 import AnnData, AnnDataSet, read
from anndata import AnnData
import snapatac2._snapatac2 as internal

def is_anndata(data) -> bool:
return isinstance(data, ad.AnnData) or isinstance(data, AnnData) or isinstance(data, AnnDataSet)
return isinstance(data, AnnData) or isinstance(data, internal.AnnData) or isinstance(data, internal.AnnDataSet)

def anndata_par(adatas, func, n_jobs=4):
return anndata_ipar(list(enumerate(adatas)), lambda x: func(x[1]), n_jobs=n_jobs)
Expand All @@ -15,7 +15,7 @@ def anndata_ipar(inputs, func, n_jobs=4):

exist_in_memory_adata = False
for _, adata in inputs:
if isinstance(adata, ad.AnnData):
if isinstance(adata, AnnData):
exist_in_memory_adata = True
break
if exist_in_memory_adata:
Expand All @@ -27,7 +27,7 @@ def anndata_ipar(inputs, func, n_jobs=4):
from multiprocess import get_context

def _func(x):
adata = read(x[1])
adata = internal.read(x[1])
result = func((x[0], adata))
adata.close()
return result
Expand Down
8 changes: 4 additions & 4 deletions snapatac2-python/snapatac2/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,11 @@ def pbmc5k(type: Literal["fragment, h5ad, annotated_h5ad"] = "fragment") -> Path
path to the file.
"""
if type == "fragment":
return Path(datasets().fetch("atac_pbmc_5k.tsv.gz"))
return Path(datasets().fetch("atac_pbmc_5k.tsv.gz", progressbar=True))
elif type == "h5ad":
return Path(datasets().fetch("atac_pbmc_5k.h5ad"))
return Path(datasets().fetch("atac_pbmc_5k.h5ad", progressbar=True))
elif type == "annotated_h5ad":
return Path(datasets().fetch("atac_pbmc_5k_annotated.h5ad"))
return Path(datasets().fetch("atac_pbmc_5k_annotated.h5ad", progressbar=True))
else:
raise NameError("type '{}' is not available.".format(type))

Expand Down Expand Up @@ -175,7 +175,7 @@ def colon() -> list[tuple[str, Path]]:
list[tuple[str, Path]]
A list of tuples, each tuple contains the sample name and the path to the fragment file.
"""
files = datasets().fetch("colon_transverse.tar", processor = pooch.Untar())
files = datasets().fetch("colon_transverse.tar", progressbar=True, processor = pooch.Untar())
return [(fl.split("/")[-1].split("_rep1_fragments")[0], Path(fl)) for fl in files]

def cre_HEA() -> Path:
Expand Down
7 changes: 5 additions & 2 deletions snapatac2-python/snapatac2/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@ def __init__(self, chrom_sizes, annotation_filename, fasta = None) -> None:
self._fasta_filename = fasta

def fetch_annotations(self):
return datasets().fetch(self._annotation_filename)
return datasets().fetch(self._annotation_filename, progressbar=True)

def fetch_fasta(self):
return datasets().fetch(self._fasta_filename, processor = Decompress(method = "gzip"))
return datasets().fetch(
self._fasta_filename,
processor=Decompress(method = "gzip"),
progressbar=True)

GRCh37 = Genome(
{
Expand Down
Loading

0 comments on commit 8accb32

Please sign in to comment.