Skip to content

Commit

Permalink
fix align stats
Browse files Browse the repository at this point in the history
  • Loading branch information
kaizhang committed Dec 29, 2024
1 parent 58d1f1c commit 796d6b7
Show file tree
Hide file tree
Showing 7 changed files with 148 additions and 60 deletions.
37 changes: 18 additions & 19 deletions precellar/src/align/aligners.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,12 +121,12 @@ pub trait Aligner {
/// * `records` - Vector of annotated FASTQ records to align.
///
/// # Returns
/// A vector of tuples where each tuple contains the primary alignment and optional secondary alignments for a read.
/// A vector of tuples where each tuple contains the alignments.
fn align_reads(
&mut self,
num_threads: u16,
records: Vec<AnnotatedFastq>,
) -> Vec<(MultiMapR, Option<MultiMapR>)>;
) -> Vec<(Option<MultiMapR>, Option<MultiMapR>)>;
}

impl Aligner for BurrowsWheelerAligner {
Expand All @@ -146,7 +146,7 @@ impl Aligner for BurrowsWheelerAligner {
&mut self,
num_threads: u16,
records: Vec<AnnotatedFastq>,
) -> Vec<(MultiMapR, Option<MultiMapR>)> {
) -> Vec<(Option<MultiMapR>, Option<MultiMapR>)> {
if records[0].read2.is_some() {
let (info, mut reads): (Vec<_>, Vec<_>) = records
.into_iter()
Expand Down Expand Up @@ -177,7 +177,7 @@ impl Aligner for BurrowsWheelerAligner {
add_umi(&mut ali1, umi.sequence(), umi.quality_scores());
add_umi(&mut ali2, umi.sequence(), umi.quality_scores());
}
(ali1.into(), Some(ali2.into()))
(Some(ali1.into()), Some(ali2.into()))
})
.collect()
} else {
Expand All @@ -199,7 +199,7 @@ impl Aligner for BurrowsWheelerAligner {
if let Some(umi) = umi {
add_umi(&mut alignment, umi.sequence(), umi.quality_scores());
}
(alignment.into(), None)
(Some(alignment.into()), None)
})
.collect()
}
Expand All @@ -220,7 +220,7 @@ impl Aligner for StarAligner {
&mut self,
num_threads: u16,
records: Vec<AnnotatedFastq>,
) -> Vec<(MultiMapR, Option<MultiMapR>)> {
) -> Vec<(Option<MultiMapR>, Option<MultiMapR>)> {
let chunk_size = get_chunk_size(records.len(), num_threads as usize);

records
Expand All @@ -229,18 +229,12 @@ impl Aligner for StarAligner {
let mut aligner = self.clone();
chunk.iter().map(move |rec| {
let bc = rec.barcode.as_ref().unwrap();
let read1;
let mut read2 = None;
if rec.read1.is_some() {
read1 = rec.read1.as_ref().unwrap();
read2 = rec.read2.as_ref();
} else {
read1 = rec.read2.as_ref().unwrap();
}
let read1 = rec.read1.as_ref();
let read2 = rec.read2.as_ref();

if read2.is_some() {
if read1.is_some() && read2.is_some() {
let (mut ali1, mut ali2) =
aligner.align_read_pair(read1, &read2.unwrap()).unwrap();
aligner.align_read_pair(&read1.unwrap(), &read2.unwrap()).unwrap();
ali1.iter_mut()
.chain(ali2.iter_mut())
.for_each(|alignment| {
Expand All @@ -254,9 +248,10 @@ impl Aligner for StarAligner {
add_umi(alignment, umi.sequence(), umi.quality_scores());
};
});
(ali1.try_into().unwrap(), Some(ali2.try_into().unwrap()))
(Some(ali1.try_into().unwrap()), Some(ali2.try_into().unwrap()))
} else {
let mut ali = aligner.align_read(read1).unwrap();
let read = read1.or(read2).unwrap();
let mut ali = aligner.align_read(read).unwrap();
ali.iter_mut().for_each(|alignment| {
add_cell_barcode(
alignment,
Expand All @@ -268,7 +263,11 @@ impl Aligner for StarAligner {
add_umi(alignment, umi.sequence(), umi.quality_scores());
};
});
(ali.try_into().unwrap(), None)
if read1.is_some() {
(Some(ali.try_into().unwrap()), None)
} else {
(None, Some(ali.try_into().unwrap()))
}
}
})
})
Expand Down
17 changes: 15 additions & 2 deletions precellar/src/align/fastq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ impl FastqProcessor {
aligner: &'a mut A,
num_threads: u16,
chunk_size: usize,
) -> impl Iterator<Item = Vec<(MultiMapR, Option<MultiMapR>)>> + 'a {
) -> impl Iterator<Item = Vec<(Option<MultiMapR>, Option<MultiMapR>)>> + 'a {
let fq_reader = self.gen_barcoded_fastq(true).with_chunk_size(chunk_size);
let total_reads = fq_reader.total_reads.unwrap_or(0);

Expand All @@ -111,7 +111,20 @@ impl FastqProcessor {
let results: Vec<_> = aligner.align_reads(num_threads, data);
results
.iter()
.for_each(|(ali1, ali2)| align_qc.add(&header, ali1, ali2.as_ref()).unwrap());
.for_each(|ali| {
match ali {
(Some(ali1), Some(ali2)) => {
align_qc.add_pair(&header, ali1, ali2).unwrap();
},
(Some(ali1), None) => {
align_qc.add_read1(&header, ali1).unwrap();
},
(None, Some(ali2)) => {
align_qc.add_read2(&header, ali2).unwrap();
},
_ => {}
}
});
progress_bar.update(results.len()).unwrap();
results
})
Expand Down
10 changes: 6 additions & 4 deletions precellar/src/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,20 +174,22 @@ impl FragmentGenerator {
impl FnMut(&AlignmentInfo) -> String + 'a,
>
where
I: Iterator<Item = Vec<(MultiMap<R>, Option<MultiMap<R>>)>> + 'a,
I: Iterator<Item = Vec<(Option<MultiMap<R>>, Option<MultiMap<R>>)>> + 'a,
R: Record + 'a,
{
let data = records.flat_map(|chunk|
chunk.into_iter().flat_map(|(r1, r2)| if r2.is_some() {
chunk.into_iter().flat_map(|(r1, r2)| if r1.is_some() && r2.is_some() {
let r1 = r1.unwrap();
let r2 = r2.unwrap();
if filter_read_pair((&r1.primary, &r2.primary), self.mapq) {
AlignmentInfo::from_read_pair((&r1.primary, &r2.primary), header).unwrap()
} else {
None
}
} else {
if filter_read(&r1.primary, self.mapq) {
AlignmentInfo::from_read(&r1.primary, header).unwrap()
let r = r1.or(r2).unwrap();
if filter_read(&r.primary, self.mapq) {
AlignmentInfo::from_read(&r.primary, header).unwrap()
} else {
None
}
Expand Down
107 changes: 86 additions & 21 deletions precellar/src/qc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,14 @@ impl PairAlignStat {
self.read1.duplicate + self.read2.duplicate
}

fn add<R: Record>(&mut self, record: &MultiMap<R>) -> Result<()> {
fn add_read1<R: Record>(&mut self, record: &MultiMap<R>) -> Result<()> {
self.read1.add(record)
}

fn add_read2<R: Record>(&mut self, record: &MultiMap<R>) -> Result<()> {
self.read2.add(record)
}

fn add_pair<R: Record>(&mut self, record1: &MultiMap<R>, record2: &MultiMap<R>) -> Result<()> {
self.read1.add(record1)?;
self.read2.add(record2)?;
Expand Down Expand Up @@ -151,27 +155,24 @@ impl AlignQC {
self.mito_dna.insert(mito_dna);
}

pub fn add<R: Record>(&mut self, header: &sam::Header, record1: &MultiMap<R>, record2: Option<&MultiMap<R>>) -> Result<()> {
pub fn add_pair<R: Record>(&mut self, header: &sam::Header, record1: &MultiMap<R>, record2: &MultiMap<R>) -> Result<()> {
let mut stat= PairAlignStat::default();

self.num_read1_bases += record1.primary.sequence().len() as u64;
self.num_read2_bases += record2.primary.sequence().len() as u64;

self.num_read1_q30_bases += record1.primary
.quality_scores()
.iter()
.filter(|s| s.as_ref().map(|x| *x >= 30).unwrap_or(false))
.count() as u64;
self.num_read2_q30_bases += record2.primary
.quality_scores()
.iter()
.filter(|s| s.as_ref().map(|x| *x >= 30).unwrap_or(false))
.count() as u64;

if let Some(record2) = record2 {
self.num_read2_bases += record2.primary.sequence().len() as u64;
self.num_read2_q30_bases += record2.primary
.quality_scores()
.iter()
.filter(|s| s.as_ref().map(|x| *x >= 30).unwrap_or(false))
.count() as u64;
stat.add_pair(record1, record2)?;
} else {
stat.add(record1)?;
}
stat.add_pair(record1, record2)?;

self.stat_all.combine(&stat);

Expand All @@ -192,6 +193,66 @@ impl AlignQC {
Ok(())
}

pub fn add_read1<R: Record>(&mut self, header: &sam::Header, record: &MultiMap<R>) -> Result<()> {
let mut stat= PairAlignStat::default();

self.num_read1_bases += record.primary.sequence().len() as u64;
self.num_read1_q30_bases += record.primary
.quality_scores()
.iter()
.filter(|s| s.as_ref().map(|x| *x >= 30).unwrap_or(false))
.count() as u64;
stat.add_read1(record)?;

self.stat_all.combine(&stat);

if record.primary
.data()
.get(&Tag::CELL_BARCODE_ID)
.transpose()
.unwrap()
.is_some()
{
self.stat_barcoded.combine(&stat);
if let Some(rid) = record.primary.reference_sequence_id(header) {
if self.mito_dna.contains(&rid.unwrap()) {
self.stat_mito.combine(&stat);
}
}
}
Ok(())
}

pub fn add_read2<R: Record>(&mut self, header: &sam::Header, record: &MultiMap<R>) -> Result<()> {
let mut stat= PairAlignStat::default();

self.num_read2_bases += record.primary.sequence().len() as u64;
self.num_read2_q30_bases += record.primary
.quality_scores()
.iter()
.filter(|s| s.as_ref().map(|x| *x >= 30).unwrap_or(false))
.count() as u64;
stat.add_read2(record)?;

self.stat_all.combine(&stat);

if record.primary
.data()
.get(&Tag::CELL_BARCODE_ID)
.transpose()
.unwrap()
.is_some()
{
self.stat_barcoded.combine(&stat);
if let Some(rid) = record.primary.reference_sequence_id(header) {
if self.mito_dna.contains(&rid.unwrap()) {
self.stat_mito.combine(&stat);
}
}
}
Ok(())
}

pub fn report(&self, metric: &mut Metrics) {
let stat_all = &self.stat_all;
let stat_barcoded = &self.stat_barcoded;
Expand All @@ -203,14 +264,18 @@ impl AlignQC {

metric.insert("sequenced_reads".to_string(), stat_all.total_reads() as f64);
metric.insert("sequenced_read_pairs".to_string(), stat_all.total_pairs() as f64);
metric.insert(
"frac_q30_bases_read1".to_string(),
self.num_read1_q30_bases as f64 / self.num_read1_bases as f64,
);
metric.insert(
"frac_q30_bases_read2".to_string(),
self.num_read2_q30_bases as f64 / self.num_read2_bases as f64,
);
if self.num_read1_bases > 0 {
metric.insert(
"frac_q30_bases_read1".to_string(),
self.num_read1_q30_bases as f64 / self.num_read1_bases as f64,
);
}
if self.num_read2_bases > 0 {
metric.insert(
"frac_q30_bases_read2".to_string(),
self.num_read2_q30_bases as f64 / self.num_read2_bases as f64,
);
}
metric.insert(
"frac_confidently_mapped".to_string(),
fraction_confidently_mapped,
Expand Down
19 changes: 13 additions & 6 deletions precellar/src/transcript/quantification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ impl Quantifier {

pub fn quantify<'a, I, P>(&'a self, header: &'a Header, records: I, output: P) -> Result<()>
where
I: Iterator<Item = Vec<(MultiMapR, Option<MultiMapR>)>> + 'a,
I: Iterator<Item = Vec<(Option<MultiMapR>, Option<MultiMapR>)>> + 'a,
P: AsRef<std::path::Path>,
{
let adata: AnnData<H5> = AnnData::new(output)?;
Expand Down Expand Up @@ -142,15 +142,22 @@ impl Quantifier {
fn make_gene_alignment(
&self,
header: &Header,
rec1: MultiMapR,
rec1: Option<MultiMapR>,
rec2: Option<MultiMapR>,
) -> Option<(String, GeneAlignment)> {
let barcode = rec1.barcode().unwrap()?;
let umi = rec1.umi().unwrap();
let anno = if let Some(rec2) = rec2 {
let barcode;
let umi;
let anno = if rec1.is_some() && rec2.is_some() {
let rec1 = rec1.unwrap();
let rec2 = rec2.unwrap();
barcode = rec1.barcode().unwrap()?;
umi = rec1.umi().unwrap();
self.annotator.annotate_alignments_pe(header, rec1, rec2)
} else {
self.annotator.annotate_alignments_se(header, rec1)
let rec = rec1.or(rec2).unwrap();
barcode = rec.barcode().unwrap()?;
umi = rec.umi().unwrap();
self.annotator.annotate_alignments_se(header, rec)
}?;

let gene_id;
Expand Down
10 changes: 6 additions & 4 deletions python/src/align.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,16 +244,18 @@ fn write_alignments<'a>(
py: Python<'a>,
output: PathBuf,
header: &'a sam::Header,
alignments: impl Iterator<Item = Vec<(MultiMapR, Option<MultiMapR>)>> + 'a,
alignments: impl Iterator<Item = Vec<(Option<MultiMapR>, Option<MultiMapR>)>> + 'a,
) -> Result<()> {
let mut writer = noodles::bam::io::writer::Builder::default().build_from_path(output)?;
writer.write_header(&header)?;

alignments.for_each(move |data| {
py.check_signals().unwrap();
data.iter().for_each(|(a, b)| {
a.iter()
.for_each(|x| writer.write_alignment_record(&header, x).unwrap());
a.as_ref().map(|x| {
x.iter()
.for_each(|x| writer.write_alignment_record(&header, x).unwrap())
});
b.as_ref().map(|x| {
x.iter()
.for_each(|x| writer.write_alignment_record(&header, x).unwrap())
Expand All @@ -271,7 +273,7 @@ fn write_fragments<'a>(
header: &'a sam::Header,
mito_dna: &Vec<String>,
fragment_generator: FragmentGenerator,
alignments: impl Iterator<Item = Vec<(MultiMapR, Option<MultiMapR>)>> + 'a,
alignments: impl Iterator<Item = Vec<(Option<MultiMapR>, Option<MultiMapR>)>> + 'a,
) -> Result<FragmentQC> {
let mut fragment_qc = FragmentQC::default();
mito_dna.iter().for_each(|x| {
Expand Down
8 changes: 4 additions & 4 deletions python/src/pyseqspec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ impl Assay {
#[new]
#[pyo3(signature = (path))]
pub fn new(path: Bound<'_, PyAny>) -> Result<Self> {
let assay = if let Ok(path) = path.extract::<PathBuf>() {
seqspec::Assay::from_path(path)?
} else {
let path = path.extract::<&str>()?;
let assay = if let Ok(path) = path.extract::<&str>() {
if url::Url::parse(path).is_ok() {
seqspec::Assay::from_url(path)?
} else {
seqspec::Assay::from_path(path)?
}
} else {
let path = path.extract::<PathBuf>()?;
seqspec::Assay::from_path(path)?
};
Ok(Assay(assay))
}
Expand Down

0 comments on commit 796d6b7

Please sign in to comment.