From 783f265aac9e338f449d193085f5fc79a1d192cd Mon Sep 17 00:00:00 2001
From: Rob Patro
Date: Sat, 30 Nov 2024 21:45:39 -0500
Subject: [PATCH 01/16] parallelize gpl

---
 Cargo.lock        |   6 +-
 Cargo.toml        |   1 +
 src/cellfilter.rs | 158 +++++++++++++++++++++++++++++++++-------------
 src/utils.rs      |  26 ++++++++
 4 files changed, 146 insertions(+), 45 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ef465e5..4d6dd56 100644
--- a/Cargo.lock
+++ b/Cargo.lock
 dependencies = [
  "crossbeam-channel",
  "crossbeam-queue",
  "csv",
+ "dashmap",
  "flate2",
  "indicatif",
  "itertools",

 [[package]]
 name = "dashmap"
-version = "6.0.1"
+version = "6.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28"
+checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
 dependencies = [
  "cfg-if",
  "crossbeam-utils",
  "hashbrown",
  "lock_api",
  "once_cell",
  "parking_lot_core",
+ "serde",
 ]

diff --git a/Cargo.toml b/Cargo.toml
index ae8642d..1145e34 100644
--- a/Cargo.toml
+++ b/Cargo.toml
 clap = { version = "4.5.9", features = ["derive", "wrap_help", "cargo", "help",
 noodles = { version = "0.77.0", features = ["bam", "bgzf", "sam"] }
 noodles-util = { version = "0.48.0", features = ["alignment"] }
+dashmap = { version = "6.1.0", features = ["serde", "inline"] }

 [profile.release]
 #debug = true

diff --git a/src/cellfilter.rs b/src/cellfilter.rs
index a9131fd..775fb51 100644
--- a/src/cellfilter.rs
+++ b/src/cellfilter.rs
  */

 use anyhow::{anyhow, Context};
+use dashmap::DashMap;
 use slog::crit;
 use slog::info;

 use std::collections::HashMap;
 use std::fs::File;
 use std::io::{BufRead, BufReader, Read};
 use std::io::{BufWriter, Write};
+use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
+use std::sync::atomic::Ordering;
 use std::time::Instant;

 #[derive(Clone, Debug, Serialize)]
 fn get_knee(freq: &[u64], max_iterations: usize, log: &slog::Logger) -> usize {
 fn populate_unfiltered_barcode_map<T: Read>(
     br: BufReader<T>,
     first_bclen: &mut usize,
-) -> HashMap<u64, u64, ahash::RandomState> {
+) -> DashMap<u64, u64, ahash::RandomState> {
     let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
-    let mut hm = HashMap::with_hasher(s);
+    let hm = DashMap::with_hasher(s);

     // read through the external unfiltered barcode list
     // and generate a map of encoded barcodes

 #[allow(clippy::unnecessary_unwrap, clippy::too_many_arguments)]
 fn process_unfiltered(
-    mut hm: HashMap<u64, u64, ahash::RandomState>,
+    hm: DashMap<u64, u64, ahash::RandomState>,
     mut unmatched_bc: Vec<u64>,
     file_tag_map: &rad_types::TagMap,
     filter_meth: &CellFilterMethod,

     let mut kept_bc = Vec::<u64>::new();

     // iterate over the count map
-    for (k, v) in hm.iter_mut() {
+    for mut kvp in hm.iter_mut() {
         // if this satisfies our requirement for the minimum count
         // then keep this barcode
-        if *v >= min_freq {
-            kept_bc.push(*k);
+        if *kvp.value() >= min_freq {
+            kept_bc.push(*kvp.key());
         } else {
             // otherwise, we have to add this barcode's
             // counts to our unmatched list
-            for _ in 0..*v {
-                unmatched_bc.push(*k);
+            for _ in 0..*kvp.value() {
+                unmatched_bc.push(*kvp.key());
             }
             // and then reset the counter for this barcode to 0
-            *v = 0u64;
+            *kvp.value_mut() = 0u64;
         }
     }

                 if cbc != *ubc && n == 1 {
                     // then increment the count of this
                     // barcode by 1 (because we'll correct to it)
-                    if let Some(c) = hm.get_mut(&cbc) {
+                    if let Some(mut c) = hm.get_mut(&cbc) {
                         *c += count as u64;
                         corrected_list.push((*ubc, cbc));
                     }

         })?;

     let o_path = parent.join("permit_freq.bin");
+    // convert the DashMap to a HashMap
+    let mut hm: HashMap<u64, u64, ahash::RandomState> = hm.into_iter().collect();
     match afutils::write_permit_list_freq(&o_path, barcode_len, &hm) {
         Ok(_) => {}
         Err(error) => {

     for (k, v) in hm.iter_mut() {
         // each present barcode corrects to itself
         *v = *k;
+        //*kvp.value_mut() = *kvp.key();
     }
     for (uncorrected, corrected) in corrected_list.iter() {
         hm.insert(*uncorrected, *corrected);

 pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<usize> {
         );
     }

+    const NWORKERS: usize = 4;
+    let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?;
+    let ifile = BufReader::new(i_file);
+    let mut rad_reader = libradicl::readers::ParallelRadReader::<
+        AlevinFryReadRecord,
+        BufReader<File>,
+    >::new(ifile, NonZeroUsize::new(NWORKERS).unwrap());
+
     let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?;
     let mut br = BufReader::new(i_file);
-    let prelude = RadPrelude::from_bytes(&mut br)?;
-    let hdr = &prelude.hdr;
+    //let prelude = &rad_reader.prelude; //RadPrelude::from_bytes(&mut br)?;
+    let hdr = &rad_reader.prelude.hdr;
     info!(
         log,
         "paired : {:?}, ref_count : {}, num_chunks : {}",
         hdr.ref_count.to_formatted_string(&Locale::en),
         hdr.num_chunks.to_formatted_string(&Locale::en)
     );
+    let num_chunks = hdr.num_chunks();

     // file-level
-    let fl_tags = &prelude.file_tags;
+    let fl_tags = &rad_reader.prelude.file_tags;
     info!(log, "read {:?} file-level tags", fl_tags.tags.len());

     // read-level
-    let rl_tags = &prelude.read_tags;
+    let rl_tags = &rad_reader.prelude.read_tags;
     info!(log, "read {:?} read-level tags", rl_tags.tags.len());

     // right now, we only handle BC and UMI types of U8-U64, so validate that

     assert!(umit.is_some(), "umi type tag must be present.");

     // alignment-level
-    let al_tags = &prelude.aln_tags;
+    let al_tags = &rad_reader.prelude.aln_tags;
     info!(log, "read {:?} alignment-level tags", al_tags.tags.len());

-    let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br)?;
-    info!(log, "File-level tag values {:?}", file_tag_map);
+    {
+        let file_tag_map = &rad_reader.file_tag_map; //file_tags.parse_tags_from_bytes(&mut br)?;
+        info!(log, "File-level tag values {:?}", file_tag_map);
+    }

-    let record_context = prelude.get_record_context::<AlevinFryRecordContext>()?;
+    let record_context = &rad_reader
+        .prelude
+        .get_record_context::<AlevinFryRecordContext>()?;
     let mut num_reads: usize = 0;

     // if dealing with filtered type

         CellFilterMethod::UnfilteredExternalList(_, _min_reads) => {
             unmatched_bc = Vec::with_capacity(10000000);
             // the unfiltered_bc_count map must be valid in this branch
-            if let Some(mut hmu) = unfiltered_bc_counts {
+            if unfiltered_bc_counts.is_some() {
+                let hmu = std::thread::scope(|s| {
+                    let hmu = std::sync::Arc::new(unfiltered_bc_counts.unwrap());
+                    let mut handles = Vec::<
+                        std::thread::ScopedJoinHandle<(usize, usize, Vec<u64>, usize)>,
+                    >::new();
+                    for _ in 0..NWORKERS {
+                        let rd = rad_reader.is_done();
+                        let q = rad_reader.get_queue();
+                        let hmu = hmu.clone();
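+                        // spawn a worker that repeatedly drains parsed meta-chunks
+                        // from the reader's shared queue until parsing is done,
+                        // tallying barcode counts into the shared DashMap and
+                        // returning its per-thread statistics for merging below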
+                        let handle = s.spawn(move || {
+                            let mut unmatched_bc = Vec::<u64>::new();
+                            let mut max_ambiguity_read = 0usize;
+                            let mut num_reads = 0;
+                            let mut num_orientation_compat_reads = 0;
+                            while !rd.load(Ordering::SeqCst) {
+                                while let Some(meta_chunk) = q.pop() {
+                                    for c in meta_chunk.iter() {
+                                        num_orientation_compat_reads +=
+                                            update_barcode_hist_unfiltered(
+                                                &hmu,
+                                                &mut unmatched_bc,
+                                                &mut max_ambiguity_read,
+                                                &c,
+                                                &expected_ori,
+                                            );
+                                        num_reads += c.reads.len();
+                                    }
+                                }
+                            }
+                            (
+                                num_reads,
+                                num_orientation_compat_reads,
+                                unmatched_bc,
+                                max_ambiguity_read,
+                            )
+                        });
+                        handles.push(handle);
+                    }
+                    let _ = rad_reader
+                        .start_chunk_parsing(libradicl::readers::EMPTY_METACHUNK_CALLBACK);
+                    for handle in handles {
+                        let (nr, nocr, ubc, mar) =
+                            handle.join().expect("The parsing thread panicked");
+                        num_reads += nr;
+                        num_orientation_compat_reads += nocr;
+                        unmatched_bc.extend_from_slice(&ubc);
+                        max_ambiguity_read = max_ambiguity_read.max(mar);
+                    }
+                    // reclaim the hash map; we no longer need the Arc wrapper
+                    std::sync::Arc::<DashMap<u64, u64, ahash::RandomState>>::into_inner(hmu)
+                });
+                /*
                 for _ in 0..(hdr.num_chunks as usize) {
-                    let c =
-                        chunk::Chunk::<AlevinFryReadRecord>::from_bytes(&mut br, &record_context);
-                    num_orientation_compat_reads += update_barcode_hist_unfiltered(
-                        &mut hmu,
-                        &mut unmatched_bc,
-                        &mut max_ambiguity_read,
-                        &c,
-                        &expected_ori,
-                    );
-                    num_reads += c.reads.len();
+                    let c =
+                        chunk::Chunk::<AlevinFryReadRecord>::from_bytes(&mut br, &record_context);
+                    num_orientation_compat_reads += update_barcode_hist_unfiltered(
+                        &mut hmu,
+                        &mut unmatched_bc,
+                        &mut max_ambiguity_read,
+                        &c,
+                        &expected_ori,
+                    );
+                    num_reads += c.reads.len();
                 }
+                */
                 info!(
-                    log,
-                    "observed {} reads ({} orientation consistent) in {} chunks --- max ambiguity read occurs in {} refs",
-                    num_reads.to_formatted_string(&Locale::en),
-                    num_orientation_compat_reads.to_formatted_string(&Locale::en),
-                    hdr.num_chunks.to_formatted_string(&Locale::en),
-                    max_ambiguity_read.to_formatted_string(&Locale::en)
-                );
+                    log,
+                    "observed {} reads ({} orientation consistent) in {} chunks --- max ambiguity read occurs in {} refs",
+                    num_reads.to_formatted_string(&Locale::en),
+                    num_orientation_compat_reads.to_formatted_string(&Locale::en),
+                    num_chunks.expect("nonzero").to_formatted_string(&Locale::en),
+                    max_ambiguity_read.to_formatted_string(&Locale::en)
+                );
                 process_unfiltered(
-                    hmu,
+                    hmu.unwrap(),
                     unmatched_bc,
-                    &file_tag_map,
+                    &rad_reader.file_tag_map,
                     &filter_meth,
                     expected_ori,
                     output_dir,

             }
         }
         _ => {
+            let mut max_ambiguity_read = 0usize;
             for _ in 0..(hdr.num_chunks as usize) {
                 let c = chunk::Chunk::<AlevinFryReadRecord>::from_bytes(&mut br, &record_context);
                 update_barcode_hist(&mut hm, &mut max_ambiguity_read, &c, &expected_ori);
                 num_reads += c.reads.len();
             }
             info!(
                 log,
                 "observed {} reads in {} chunks --- max ambiguity read occurs in {} refs",
                 num_reads.to_formatted_string(&Locale::en),
                 hdr.num_chunks.to_formatted_string(&Locale::en),
                 max_ambiguity_read.to_formatted_string(&Locale::en)
             );
             process_filtered(
                 &hm,
-                &file_tag_map,
+                &rad_reader.file_tag_map,
                 &filter_meth,
                 expected_ori,
                 output_dir,

 pub fn update_barcode_hist_unfiltered(
-    hist: &mut HashMap<u64, u64, ahash::RandomState>,
+    hist: &DashMap<u64, u64, ahash::RandomState>,
     unmatched_bc: &mut Vec<u64>,
     max_ambiguity_read: &mut usize,
     chunk: &chunk::Chunk<AlevinFryReadRecord>,
     expected_ori: &Strand,

             // barcodes
             match hist.get_mut(&r.bc) {
                 // if we find a match, increment the count
-                Some(c) => *c += 1,
+                Some(mut c) => *c += 1,
                 // otherwise, push this into the unmatched list
                 None => {
                     unmatched_bc.push(r.bc);

             // barcodes
             match hist.get_mut(&r.bc) {
                 // if we find a match, increment the count
-                Some(c) => *c += 1,
+                Some(mut c) => *c += 1,
                 // otherwise, push this into the unmatched list
                 None => {
                     unmatched_bc.push(r.bc);

             // barcodes
             match hist.get_mut(&r.bc) {
                 // if we find a match, increment the count
-                Some(c) => *c += 1,
+                Some(mut c) => *c += 1,
                 // otherwise, push this into the unmatched list
                 None => {
                     unmatched_bc.push(r.bc);

diff --git a/src/utils.rs b/src/utils.rs
index 0e54c74..ebe6733 100644
--- a/src/utils.rs
+++ b/src/utils.rs
 use crate::eq_class::IndexedEqList;
 use anyhow::{anyhow, Context};
 use bstr::io::BufReadExt;
 use core::fmt;
+use dashmap::DashMap;
 use libradicl::utils::SPLICE_MASK_U32;
 use needletail::bitkmer::*;
+use serde::Serialize;
 use std::collections::{HashMap, HashSet};
 use std::error::Error;
 use std::fs::File;
 pub fn write_permit_list_freq(
     Ok(())
 }

+/// Write a permit-list frequency file (such as permit_freq.bin) from a
+/// DashMap of barcode counts
+pub fn write_permit_list_freq_dashmap(
+    o_path: &std::path::Path,
+    bclen: u16,
+    permit_freq_map: &DashMap<u64, u64, ahash::RandomState>,
+) -> Result<(), Box<dyn Error>> {
+    let output = std::fs::File::create(o_path)?;
+    let mut writer = BufWriter::new(&output);
+
+    {
+        // the first u64 represents file format version.
+        writer
+            .write_all(&afconst::PERMIT_FILE_VER.to_le_bytes())
+            .unwrap();
+
+        // the second u64 represents barcode length
+        writer.write_all(&(u64::from(bclen)).to_le_bytes()).unwrap();
+
+        // the rest records the permitted barcode:freq hashmap
+        bincode::serialize_into(&mut writer, &permit_freq_map)?;
+    }
+    Ok(())
+}
+
 /// Parse a 3 column tsv of the format
 /// transcript_name gene_name status
 /// where status is one of S or U each gene will be allocated both a spliced and

From c6108c79746552c0d54dba93823d22bdbc15cddd Mon Sep 17 00:00:00 2001
From: Rob Patro
Date: Sun, 1 Dec 2024 21:11:51 -0500
Subject: [PATCH 02/16] multithreading with progress bar

---
 src/cellfilter.rs | 132 +++++++++++++++++++++++++++++++++-------------
 src/lib.rs        |   1 +
 src/main.rs       |   6 +++
 src/prog_opts.rs  |   1 +
 src/utils.rs      |   1 -
 5 files changed, 102 insertions(+), 39 deletions(-)

diff --git a/src/cellfilter.rs b/src/cellfilter.rs
index 775fb51..c85db15 100644
--- a/src/cellfilter.rs
+++ b/src/cellfilter.rs
  * License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
  */

-use anyhow::{anyhow, Context};
+use anyhow::Context;
 use dashmap::DashMap;
 use slog::crit;
-use slog::info;
+use slog::{info, warn};

+use crate::diagnostics;
 use crate::prog_opts::GenPermitListOpts;
 use crate::utils as afutils;
 #[allow(unused_imports)]
 use ahash::{AHasher, RandomState};
 use bio_types::strand::Strand;
 use bstr::io::BufReadExt;
+use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
 use itertools::Itertools;
 use libradicl::exit_codes;
 use libradicl::rad_types::{self, RadType};
 use libradicl::BarcodeLookupMap;
-use libradicl::{
-    chunk,
-    header::RadPrelude,
-    record::{AlevinFryReadRecord, AlevinFryRecordContext},
-};
+use libradicl::{chunk, record::AlevinFryReadRecord};
 use needletail::bitkmer::*;
 use num_format::{Locale, ToFormattedString};
 use serde::Serialize;

 use std::io::{BufRead, BufReader, Read};
 use std::io::{BufWriter, Write};
 use std::num::NonZeroUsize;
 use std::path::{Path, PathBuf};
-use std::sync::atomic::Ordering;
+use std::sync::{atomic::Ordering, Arc};
 use std::time::Instant;

 #[derive(Clone, Debug, Serialize)]

     info!(
         log,
         "There were {} distinct unmatched barcodes, and {} that can be recovered",
-        distinct_unmatched_bc,
-        distinct_recoverable_bc
+        distinct_unmatched_bc.to_formatted_string(&Locale::en),
+        distinct_recoverable_bc.to_formatted_string(&Locale::en)
     );
     info!(
         log,

 #[allow(clippy::unnecessary_unwrap, clippy::too_many_arguments)]
 fn process_filtered(
-    hm: &HashMap<u64, u64, ahash::RandomState>,
+    hm: DashMap<u64, u64, ahash::RandomState>,
     file_tag_map: &rad_types::TagMap,
     filter_meth: &CellFilterMethod,
     expected_ori: Strand,

     gpl_opts: &GenPermitListOpts,
 ) -> anyhow::Result<usize> {
     let valid_bc: Vec<u64>;
+    let hm: HashMap<u64, u64, ahash::RandomState> = hm.into_iter().collect();
     let mut freq: Vec<u64> = hm.values().cloned().collect();
     freq.sort_unstable();
     freq.reverse();

         // collect all of the barcodes that have a frequency
         // >= to min_thresh.
-        valid_bc = permit_list_from_threshold(hm, min_freq);
+        valid_bc = permit_list_from_threshold(&hm, min_freq);
         info!(
             log,
             "knee distance method resulted in the selection of {} permitted barcodes.",

             // collect all of the barcodes that have a frequency
             // >= to min_thresh.
-            valid_bc = permit_list_from_threshold(hm, min_freq);
+            valid_bc = permit_list_from_threshold(&hm, min_freq);
         }
         CellFilterMethod::ExplicitList(valid_bc_file) => {
             valid_bc = permit_list_from_file(valid_bc_file, barcode_len);
         }

             let ind = cmp::min(freq.len() - 1, robust_ind as usize);
             let robust_freq = freq[ind];
             let min_freq = std::cmp::max(1u64, (robust_freq as f64 / robust_div).round() as u64);
-            valid_bc = permit_list_from_threshold(hm, min_freq);
+            valid_bc = permit_list_from_threshold(&hm, min_freq);
         }
         CellFilterMethod::UnfilteredExternalList(_, _min_reads) => {
             unimplemented!();
         }

     let o_path = parent.join("all_freq.bin");

-    match afutils::write_permit_list_freq(&o_path, barcode_len, hm) {
+    match afutils::write_permit_list_freq(&o_path, barcode_len, &hm) {
         Ok(_) => {}
         Err(error) => {
             panic!("Error: {}", error);
         }
     }

 pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result<usize> {
             rad_dir.display()
         );
         // std::process::exit(1);
-        return Err(anyhow!("execution terminated unexpectedly"));
+        anyhow::bail!("execution terminated because input RAD path does not exist.");
     }

     let mut first_bclen = 0usize;

         );
     }

-    const NWORKERS: usize = 4;
+    let nworkers: usize = gpl_opts.threads;
     let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?;
     let ifile = BufReader::new(i_file);
     let mut rad_reader = libradicl::readers::ParallelRadReader::<
         AlevinFryReadRecord,
         BufReader<File>,
-    >::new(ifile, NonZeroUsize::new(NWORKERS).unwrap());
+    >::new(ifile, NonZeroUsize::new(nworkers).unwrap());

-    let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?;
-    let mut br = BufReader::new(i_file);
-    //let prelude = &rad_reader.prelude; //RadPrelude::from_bytes(&mut br)?;
     let hdr = &rad_reader.prelude.hdr;
     info!(
         log,
         "paired : {:?}, ref_count : {}, num_chunks : {}",

     assert!(umit.is_some(), "umi type tag must be present.");

     // alignment-level
     let al_tags = &rad_reader.prelude.aln_tags;
     info!(log, "read {:?} alignment-level tags", al_tags.tags.len());

     {
-        let file_tag_map = &rad_reader.file_tag_map; //file_tags.parse_tags_from_bytes(&mut br)?;
+        let file_tag_map = &rad_reader.file_tag_map;
         info!(log, "File-level tag values {:?}", file_tag_map);
     }

-    let record_context = &rad_reader
-        .prelude
-        .get_record_context::<AlevinFryRecordContext>()?;
     let mut num_reads: usize = 0;

     // if dealing with filtered type
     let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
-    let mut hm = HashMap::with_hasher(s);
+    let hm = std::sync::Arc::new(DashMap::with_hasher(s));

     // if dealing with the unfiltered type
     // the set of barcodes that are not an exact match for any known barcodes
     let mut num_orientation_compat_reads = 0usize;
     let mut max_ambiguity_read = 0usize;

+    let nc = num_chunks.expect("unknown number of chunks").get() as u64;
+    let pbar = ProgressBar::new(nc);
+    pbar.set_style(
+        ProgressStyle::with_template(
+            "[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
+        )
+        .unwrap()
+        .progress_chars("##-"),
+    );
+    pbar.set_draw_target(ProgressDrawTarget::stderr_with_hz(5));
+    let cb = |_new_bytes: u64, new_chunks: u64| {
+        pbar.inc(new_chunks);
+    };
+
     match filter_meth {
         CellFilterMethod::UnfilteredExternalList(_, _min_reads) => {
             unmatched_bc = Vec::with_capacity(10000000);

                     let mut handles = Vec::<
                         std::thread::ScopedJoinHandle<(usize, usize, Vec<u64>, usize)>,
                     >::new();
-                    for _ in 0..NWORKERS {
+                    for _ in 0..nworkers {
                         let rd = rad_reader.is_done();
                         let q = rad_reader.get_queue();
                         let hmu = hmu.clone();

                         handles.push(handle);
                     }
-                    let _ = rad_reader
-                        .start_chunk_parsing(libradicl::readers::EMPTY_METACHUNK_CALLBACK);
+                    let _ = rad_reader.start_chunk_parsing(Some(cb)); //libradicl::readers::EMPTY_METACHUNK_CALLBACK);
                     for handle in handles {
                         let (nr, nocr, ubc, mar) =
                             handle.join().expect("The parsing thread panicked");
                         num_reads += nr;
                         num_orientation_compat_reads += nocr;
                         unmatched_bc.extend_from_slice(&ubc);
                         max_ambiguity_read = max_ambiguity_read.max(mar);
                     }
+                    pbar.finish_with_message("finished parsing RAD file");
                     // reclaim the hash map; we no longer need the Arc wrapper
                     std::sync::Arc::<DashMap<u64, u64, ahash::RandomState>>::into_inner(hmu)
                 });

                     num_chunks.expect("nonzero").to_formatted_string(&Locale::en),
                     max_ambiguity_read.to_formatted_string(&Locale::en)
                 );
+                let valid_thresh = 0.3f64;
+                match diagnostics::likely_valid_permit_list(
+                    unmatched_bc.len(),
+                    num_reads,
+                    valid_thresh,
+                ) {
+                    Ok(f) => {
+                        info!(log,
+                            "The percentage of mapped reads not matching a known barcode exactly is {:.3}%, which is < the warning threshold {:.3}%",
+                            f * 100f64, valid_thresh * 100f64);
+                    }
+                    Err(e) => {
+                        warn!(log, "{:?}", e);
+                    }
+                }
+
                 process_unfiltered(
                     hmu.unwrap(),
                     unmatched_bc,
                     &rad_reader.file_tag_map,
                     &filter_meth,
                     expected_ori,
                     output_dir,

         }
         _ => {
-            let mut max_ambiguity_read = 0usize;
-            for _ in 0..(hdr.num_chunks as usize) {
-                let c = chunk::Chunk::<AlevinFryReadRecord>::from_bytes(&mut br, &record_context);
-                update_barcode_hist(&mut hm, &mut max_ambiguity_read, &c, &expected_ori);
-                num_reads += c.reads.len();
-            }
+            let hm = std::thread::scope(|s| {
+                let mut handles =
+                    Vec::<std::thread::ScopedJoinHandle<(usize, usize, usize)>>::new();
+                for _ in 0..nworkers {
+                    let rd = rad_reader.is_done();
+                    let q = rad_reader.get_queue();
+                    let hm = hm.clone();
+                    let handle = s.spawn(move || {
+                        let mut max_ambiguity_read = 0usize;
+                        let mut num_reads = 0;
+                        while !rd.load(Ordering::SeqCst) {
+                            while let Some(meta_chunk) = q.pop() {
+                                for c in meta_chunk.iter() {
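+                                    // bump this record's barcode count in the
+                                    // shared DashMap-backed histogram; per-worker
+                                    // read counts and ambiguity maxima are merged
+                                    // after the joins below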
+                                    update_barcode_hist(
+                                        &hm,
+                                        &mut max_ambiguity_read,
+                                        &c,
+                                        &expected_ori,
+                                    );
+                                    num_reads += c.reads.len();
+                                }
+                            }
+                        }
+                        (num_reads, num_orientation_compat_reads, max_ambiguity_read)
+                    });
+                    handles.push(handle);
+                }
+                let _ = rad_reader.start_chunk_parsing(Some(cb));
+                for handle in handles {
+                    let (nr, nocr, mar) = handle.join().expect("The parsing thread panicked");
+                    num_reads += nr;
+                    num_orientation_compat_reads += nocr;
+                    max_ambiguity_read = max_ambiguity_read.max(mar);
+                }
+                // reclaim the hash map; we no longer need the Arc wrapper
+                Arc::<DashMap<u64, u64, ahash::RandomState>>::into_inner(hm)
+                    .expect("unique reference to DashMap")
+            });
+            pbar.finish_with_message("finished parsing RAD file");
             info!(
                 log,
                 "observed {} reads in {} chunks --- max ambiguity read occurs in {} refs",
                 num_reads.to_formatted_string(&Locale::en),
-                hdr.num_chunks.to_formatted_string(&Locale::en),
+                num_chunks.unwrap().to_formatted_string(&Locale::en),
                 max_ambiguity_read.to_formatted_string(&Locale::en)
             );
             process_filtered(
-                &hm,
+                hm,
                 &rad_reader.file_tag_map,
                 &filter_meth,
                 expected_ori,
                 output_dir,

 pub fn update_barcode_hist(
-    hist: &mut HashMap<u64, u64, ahash::RandomState>,
+    hist: &DashMap<u64, u64, ahash::RandomState>,
     max_ambiguity_read: &mut usize,
     chunk: &chunk::Chunk<AlevinFryReadRecord>,
     expected_ori: &Strand,

diff --git a/src/lib.rs b/src/lib.rs
index aab2b10..0e75560 100644
--- a/src/lib.rs
+++ b/src/lib.rs
 pub mod cmd_parse_utils;
 pub mod collate;
 pub mod constants;
 pub mod convert;
+pub mod diagnostics;
 pub mod em;
 pub mod eq_class;
 pub mod infer;

diff --git a/src/main.rs b/src/main.rs
index 12fd613..15ddfb4 100644
--- a/src/main.rs
+++ b/src/main.rs
 fn main() -> anyhow::Result<()> {
     let num_hardware_threads = num_cpus::get() as u32;
     let max_num_threads: String = (num_cpus::get() as u32).to_string();
     let max_num_collate_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string();
+    let max_num_gpl_threads: String = (8_u32.min(num_hardware_threads).max(2_u32)).to_string();

     let crate_authors = crate_authors!("\n");
     let version = crate_version!();

                 -k --"knee-distance" "attempt to determine the number of barcodes to keep using the knee distance method."
             )
         )
+        .arg(arg!(-t --threads <THREADS> "number of threads to use for the first phase of permit-list generation").value_parser(value_parser!(u32)).default_value(max_num_gpl_threads))
         .arg(arg!(-e --"expect-cells" <EXPECTCELLS> "defines the expected number of cells to use in determining the (read, not UMI) based cutoff")
             .value_parser(value_parser!(usize)))
         .arg(arg!(-f --"force-cells" <FORCECELLS> "select the top-k most-frequent barcodes, based on read count, as valid (true)")

     // velo_mode --- currently, on this branch, it is always false
     let velo_mode = false; //t.get_flag("velocity-mode");
+    let gpl_threads: usize = *t
+        .get_one::<u32>("threads")
+        .expect("valid integer number of threads") as usize;

     let gpl_opts = GenPermitListOpts::builder()
         .input_dir(input_dir)
         .output_dir(output_dir)
         .fmeth(fmeth)
         .expected_ori(expected_ori)
         .version(VERSION)
+        .threads(gpl_threads)
         .velo_mode(velo_mode)
         .cmdline(&cmdline)
         .log(&log)

diff --git a/src/prog_opts.rs b/src/prog_opts.rs
index a57a857..0f909d0 100644
--- a/src/prog_opts.rs
+++ b/src/prog_opts.rs
 pub struct GenPermitListOpts<'a, 'b, 'c, 'd, 'e> {
     pub fmeth: CellFilterMethod,
     pub expected_ori: Strand,
     pub velo_mode: bool,
+    pub threads: usize,
     pub cmdline: &'c str,
     pub version: &'d str,
     #[serde(skip_serializing)]

diff --git a/src/utils.rs b/src/utils.rs
index ebe6733..6777be6 100644
--- a/src/utils.rs
+++ b/src/utils.rs
 use core::fmt;
 use dashmap::DashMap;
 use libradicl::utils::SPLICE_MASK_U32;
 use needletail::bitkmer::*;
-use serde::Serialize;
 use std::collections::{HashMap, HashSet};
 use std::error::Error;
 use std::fs::File;

From bbbc0377719863e3e2d078776117cd400d52f15c Mon Sep 17 00:00:00 2001
From: rob-p
Date: Tue, 3 Dec 2024 16:10:10 -0500
Subject: [PATCH 03/16] add atac module and update deps

---
 Cargo.lock              | 739 ++++++++++++++++------------------
 Cargo.toml              |  14 +-
 src/atac/cellfilter.rs  | 542 ++++++++++++++++++++++++++
 src/atac/collate.rs     | 811 ++++++++++++++++++++++++++++++++++++++
 src/atac/deduplicate.rs | 271 +++++++++++++
 src/atac/mod.rs         |   7 +
 src/atac/prog_opts.rs   |  28 ++
 src/atac/run.rs         | 127 ++++++
 src/atac/sort.rs        | 837 ++++++++++++++++++++++++++++++++++++++++
 src/atac/utils.rs       |  18 +
 src/cellfilter.rs       |   5 +-
 src/convert.rs          |   4 +-
 src/em.rs               |   9 +-
 src/lib.rs              |   3 +-
 src/main.rs             | 107 ++++-
 15 files changed, 3094 insertions(+), 428 deletions(-)
 create mode 100644 src/atac/cellfilter.rs
 create mode 100644 src/atac/collate.rs
 create mode 100644 src/atac/deduplicate.rs
 create mode 100644 src/atac/mod.rs
 create mode 100644 src/atac/prog_opts.rs
 create mode 100644 src/atac/run.rs
 create mode 100644 src/atac/sort.rs
 create mode 100644 src/atac/utils.rs

diff --git a/Cargo.lock b/Cargo.lock
index 4d6dd56..84cf4c0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
-version = 3 +version = 4 [[package]] -name = "adler" -version = "1.0.2" +name = "adler2" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "ahash" @@ -51,9 +51,9 @@ dependencies = [ "itertools", "libradicl", "mimalloc", + "nalgebra", "needletail", "noodles", - "noodles-util", "num-format", "num_cpus", "petgraph", @@ -69,7 +69,7 @@ dependencies = [ "snap", "sprs", "statrs", - "thiserror", + "thiserror 2.0.4", "typed-builder", ] @@ -101,9 +101,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.14" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", @@ -116,43 +116,43 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.7" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.0" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.3" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.86" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" [[package]] name = "approx" @@ -174,15 +174,15 @@ dependencies = [ [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "autocfg" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bincode" @@ -195,23 +195,23 @@ dependencies = [ [[package]] name = "bio-types" -version = "1.0.2" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "3dd588ae658a299e435a6e3bdc289b9ab3b39711c32653acebe62e729eb3c89f" +checksum = "f4dcf54f8b7f51450207d54780bab09c05f30b8b0caa991545082842e466ad7e" dependencies = [ "derive-new", "lazy_static", "regex", "serde", "strum_macros", - "thiserror", + "thiserror 1.0.69", ] [[package]] name = "bit-vec" -version = "0.6.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitflags" @@ -219,20 +219,11 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - [[package]] name = "bstr" -version = "1.9.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" +checksum = "1a68f1f47cdf0ec8ee4b941b2eee2a80cb796db73118c0dd09ac63fbe405be22" dependencies = [ "memchr", "regex-automata", @@ -241,9 +232,9 @@ dependencies = [ [[package]] name = "buffer-redux" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c9f8ddd22e0a12391d1e7ada69ec3b0da1914f1cec39c5cf977143c5b2854f5" +checksum = "4e8acf87c5b9f5897cd3ebb9a327f420e0cae9dd4e5c1d2e36f2c84c571a58f1" dependencies = [ "memchr", ] @@ -262,9 +253,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" -version = "1.16.1" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" +checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a" [[package]] name = "byteorder" @@ -274,9 +265,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "bzip2" @@ -301,9 +292,14 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "907d8581360765417f8f2e0e7d602733bbed60156b4465b7617243689ef9b83d" +checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" +dependencies = [ + "jobserver", + "libc", + "shlex", +] [[package]] name = "cfg-if" @@ -322,14 +318,14 @@ dependencies = [ "js-sys", "num-traits", "wasm-bindgen", - "windows-targets 0.52.6", + "windows-targets", ] [[package]] name = "clap" -version = "4.5.9" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64acc1846d54c1fe936a78dc189c34e28d3f5afc348403f28ecf53660b9b8462" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" dependencies = [ "clap_builder", "clap_derive", @@ -337,9 +333,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.9" +version = "4.5.21" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8393d67ba2e7bfaf28a23458e4e2b543cc73a99595511eb207fdb8aede942" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" dependencies = [ "anstream", "anstyle", @@ -350,27 +346,27 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.8" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", ] [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "colorchoice" -version = "1.0.1" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "console" @@ -381,15 +377,15 @@ dependencies = [ "encode_unicode", "lazy_static", "libc", - "unicode-width", + "unicode-width 0.1.14", "windows-sys 0.52.0", ] [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "crc32fast" @@ -443,21 +439,11 @@ version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -482,7 +468,7 @@ checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", - "hashbrown", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core", @@ -511,23 +497,13 @@ dependencies = [ [[package]] name = "derive-new" -version = "0.5.9" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535" +checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", + "syn 2.0.90", ] [[package]] @@ -571,12 +547,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = 
"errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -587,24 +563,14 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flate2" -version = "1.0.30" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", "miniz_oxide", ] -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "getrandom" version = "0.2.15" @@ -622,6 +588,12 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "heck" version = "0.5.0" @@ -634,11 +606,17 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "iana-time-zone" -version = "0.1.60" +version = "0.1.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -659,52 +637,43 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.2", ] [[package]] name = "indicatif" -version = "0.17.8" +version = "0.17.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" +checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" dependencies = [ "console", - "instant", "number_prefix", "portable-atomic", - "unicode-width", -] - -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", + "unicode-width 0.2.0", + "web-time", ] [[package]] name = "is-terminal" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" dependencies = [ - "hermit-abi", + "hermit-abi 0.4.0", "libc", "windows-sys 0.52.0", ] [[package]] name = "is_terminal_polyfill" -version = "1.70.0" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" @@ -717,16 +686,26 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" dependencies = [ + "once_cell", "wasm-bindgen", ] @@ -738,9 +717,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -751,9 +730,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -762,9 +741,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.6" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" dependencies = [ "lexical-util", "static_assertions", @@ -772,18 +751,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.5" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" dependencies = [ "lexical-util", "lexical-write-integer", @@ -792,9 +771,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "0.8.5" 
+version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" dependencies = [ "lexical-util", "static_assertions", @@ -802,15 +781,35 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" + +[[package]] +name = "liblzma" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "603222e049bf0da71529325ada5d02dc3871cbd3679cf905429f7f0de93da87b" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6630cb23edeb2e563cd6c30d4117554c69646871455843c33ddcb1d9aef82ecf" +dependencies = [ + "cc", + "libc", + "pkg-config", +] [[package]] name = "libm" -version = "0.2.8" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "libmimalloc-sys" @@ -824,8 +823,8 @@ dependencies = [ [[package]] name = "libradicl" -version = "0.9.0" -source = "git+https://github.com/COMBINE-lab/libradicl?branch=develop#5dd74da16309409ed75287f611d54f2398b85ae5" +version = "0.9.1" +source = "git+https://github.com/COMBINE-lab/libradicl?branch=develop#5dea2f8f2fc15bf79d3f873066c9bf29595c7820" dependencies = [ "ahash", "anyhow", @@ -835,8 +834,7 @@ dependencies = [ "dashmap", "derivative", "itertools", - "noodles-bam", - "noodles-sam", + "noodles", "num", "scroll", "serde", @@ -876,37 +874,16 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "matrixmultiply" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7574c1cf36da4798ab73da5b215bbf444f50718207754cb522201d78d1cd0ff2" +checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" dependencies = [ "autocfg", "rawpointer", ] -[[package]] -name = "md-5" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" -dependencies = [ - "cfg-if", - "digest", -] - [[package]] name = "memchr" version = "2.7.4" @@ -924,18 +901,18 @@ dependencies = [ [[package]] name = "miniz_oxide" -version = "0.7.4" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" dependencies = [ - "adler", + "adler2", ] [[package]] name = "nalgebra" -version = "0.32.6" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7b5c17de023a86f59ed79891b2e5d5a94c705dbe904a5b5c9c952ea6221b03e4" +checksum = "26aecdf64b707efd1310e3544d709c5c0ac61c13756046aaaba41be5c4f66a3b" dependencies = [ "approx 0.5.1", "matrixmultiply", @@ -957,41 +934,43 @@ checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", ] [[package]] name = "ndarray" -version = "0.15.6" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" dependencies = [ "matrixmultiply", "num-complex 0.4.6", "num-integer", "num-traits", + "portable-atomic", + "portable-atomic-util", "rawpointer", ] [[package]] name = "needletail" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db05a5ab397f64070d8c998fa0fbb84e484b81f95752af317dac183a82d9295d" +checksum = "f29a3c5015d6985f33318d154fa0c41315eb2e7df29432c844c74a83434bfe21" dependencies = [ "buffer-redux", "bytecount", "bzip2", "flate2", + "liblzma", "memchr", - "xz2", ] [[package]] name = "noodles" -version = "0.77.0" +version = "0.85.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f8629ce12eeb2d6483be18af64beb9488840299ecd617be4ec2f4ebf66688e6" +checksum = "05ecd3b4ddb5792b0358e035fbd41543613e4234dc2d0f13c814e3d5e705d8c8" dependencies = [ "noodles-bam", "noodles-bgzf", @@ -1000,15 +979,15 @@ dependencies = [ [[package]] name = "noodles-bam" -version = "0.64.0" +version = "0.70.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fd4e9825cce4ed3c7fa958efce37268d899bdb7fdeb22d73493a095de1564be" +checksum = "57dbbc6b91efed384ceac16c1f4f764bc07d4941bacf7d5d7fdccfef48f52031" dependencies = [ - "bit-vec", "bstr", "byteorder", "bytes", "indexmap", + "memchr", "noodles-bgzf", "noodles-core", "noodles-csi", @@ -1017,9 +996,9 @@ dependencies = [ [[package]] name = "noodles-bgzf" -version = "0.31.0" +version = "0.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "754f4d28655c5d486e2a62814c5dcc46ad2cee5a2b6cbe91fd765a5ec997f91a" +checksum = "3b50aaa8f0a3c8a0b738b641a6d1a78d9fd30a899ab2d398779ee3c4eb80f1c1" dependencies = [ "byteorder", "bytes", @@ -1036,58 +1015,25 @@ dependencies = [ "bstr", ] -[[package]] -name = "noodles-cram" -version = "0.65.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cf95f7e286f62550121f6135a1cb372a0233eba7bffb7f7bebe55eb4f12b35c" -dependencies = [ - "bitflags", - "bstr", - "byteorder", - "bytes", - "bzip2", - "flate2", - "indexmap", - "md-5", - "noodles-bam", - "noodles-core", - "noodles-fasta", - "noodles-sam", - "xz2", -] - [[package]] name = "noodles-csi" -version = "0.36.0" +version = "0.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ac24bac459ae99c046086d771f5a79f632ab27a9280406f2fbb7a3238d79cc" +checksum = "99fdf34a300b37bc15160afcbec76487750bf6e3d9a6ff69c8e4fd635bc66745" dependencies = [ "bit-vec", + "bstr", "byteorder", "indexmap", "noodles-bgzf", "noodles-core", ] -[[package]] -name = "noodles-fasta" -version = "0.40.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "195e5a37b273b03499d85a83e9101532c02a21c7b9b2d9c11960b6fbfdcd39ad" -dependencies = [ - "bstr", - "bytes", - "memchr", - "noodles-bgzf", - "noodles-core", -] - [[package]] name = 
"noodles-sam" -version = "0.61.0" +version = "0.66.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48e2636562b1b13e21369e82b0426cd532a3fb2dc676eccec89215a1a8dad243" +checksum = "c26c31cfbb05b20fc8fb3cf1ac556efca0ff4b3455112705dd2ea81ae01e7d81" dependencies = [ "bitflags", "bstr", @@ -1099,22 +1045,6 @@ dependencies = [ "noodles-csi", ] -[[package]] -name = "noodles-util" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2aa80a65a665b668318834f00b5e84286ef3ca59a702b11de4cb3e8f2564c91" -dependencies = [ - "flate2", - "noodles-bam", - "noodles-bgzf", - "noodles-core", - "noodles-cram", - "noodles-csi", - "noodles-fasta", - "noodles-sam", -] - [[package]] name = "num" version = "0.4.3" @@ -1221,7 +1151,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", ] @@ -1233,9 +1163,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "parking_lot_core" @@ -1247,7 +1177,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets 0.52.6", + "windows-targets", ] [[package]] @@ -1268,15 +1198,24 @@ dependencies = [ [[package]] name = "pkg-config" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "portable-atomic" -version = "1.6.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] [[package]] name = "powerfmt" @@ -1286,24 +1225,27 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = 
"b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -1376,29 +1318,29 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.2" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ "bitflags", ] [[package]] name = "redox_users" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] name = "regex" -version = "1.10.5" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -1408,9 +1350,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -1419,15 +1361,15 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags", "errno", @@ -1438,9 +1380,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "ryu" @@ -1485,40 +1427,47 @@ checksum = "6ab8598aa408498679922eff7fa985c25d58a90771bd6be794434c5277eab1a6" [[package]] name = "serde" -version = "1.0.204" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.204" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", ] [[package]] name = "serde_json" -version = "1.0.120" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "simba" -version = "0.8.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061507c94fc6ab4ba1c9a0305018408e312e17c041eb63bef8aa726fa33aceae" +checksum = "b3a386a501cd104797982c15ae17aafe8b9261315b5d07e3ec803f2ea26be0fa" dependencies = [ "approx 0.5.1", "num-complex 0.4.6", @@ -1572,9 +1521,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "sprs" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88bab60b0a18fb9b3e0c26e92796b3c3a278bf5fa4880f5ad5cc3bdfb843d0b1" +checksum = "704ef26d974e8a452313ed629828cd9d4e4fa34667ca1ad9d6b1fffa43c6e166" dependencies = [ "alga", "ndarray", @@ -1593,9 +1542,9 @@ checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" [[package]] name = "statrs" -version = "0.17.1" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f697a07e4606a0a25c044de247e583a330dbb1731d11bc7350b81f48ad567255" +checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e" dependencies = [ "approx 0.5.1", "nalgebra", @@ -1619,7 +1568,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.71", + "syn 2.0.90", ] [[package]] @@ -1635,9 +1584,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.71" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b146dcf730474b4bcd16c311627b31ede9ab149045db4d6088b3becaea046462" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -1663,32 +1612,52 @@ dependencies = [ [[package]] name = "terminal_size" -version = "0.3.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +checksum = "5352447f921fda68cf61b4101566c0bdb5104eff6804d0678e5227580ab6a4e9" dependencies = [ "rustix", - "windows-sys 0.48.0", + "windows-sys 0.59.0", ] [[package]] name = "thiserror" -version = "1.0.62" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2675633b1499176c2dff06b0856a27976a8f9d436737b4cf4f312d4d91d8bbb" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490" +dependencies = [ + "thiserror-impl 2.0.4", ] [[package]] name = "thiserror-impl" -version = "1.0.62" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d20468752b09f49e909e55a5d338caa8bedf615594e9d80bc4c565d30faf798c" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.90", ] [[package]] @@ -1703,9 +1672,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -1724,9 +1693,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -1745,22 +1714,22 @@ dependencies = [ [[package]] name = "typed-builder" -version = "0.18.2" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77739c880e00693faef3d65ea3aad725f196da38b22fdc7ea6ded6e1ce4d3add" +checksum = "7e14ed59dc8b7b26cacb2a92bad2e8b1f098806063898ab42a3bd121d7d45e75" dependencies = [ "typed-builder-macro", ] [[package]] name = "typed-builder-macro" -version = "0.18.2" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f718dfaf347dcb5b983bfc87608144b0bad87970aebcbea5ce44d2a30c08e63" +checksum = "560b82d656506509d43abe30e0ba64c56b1953ab3d4fe7ba5902747a7a3cedd5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", ] [[package]] @@ -1771,15 +1740,21 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" + +[[package]] +name = "unicode-width" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "unicode-width" -version = "0.1.13" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" [[package]] name = "utf8parse" @@ -1789,9 +1764,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "wasi" @@ -1801,34 +1776,35 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" dependencies = 
[ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1836,28 +1812,38 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] [[package]] name = "wide" -version = "0.7.25" +version = "0.7.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2caba658a80831539b30698ae9862a72db6697dfdd7151e46920f5f2755c3ce2" +checksum = "58e6db2670d2be78525979e9a5f9c69d296fd7d670549fe9ebf70f8708cb5019" dependencies = [ "bytemuck", "safe_arch", @@ -1891,16 +1877,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", + "windows-targets", ] [[package]] @@ -1909,22 +1886,16 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.6", + "windows-targets", ] [[package]] -name = "windows-targets" -version = "0.48.5" +name = "windows-sys" +version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + 
"windows-targets", ] [[package]] @@ -1933,46 +1904,28 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", "windows_i686_gnullvm", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -1985,69 +1938,37 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "zerocopy" version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] @@ -2059,5 +1980,5 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.90", ] diff --git a/Cargo.toml b/Cargo.toml index 1145e34..80b2d0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,7 +35,7 @@ categories = ["command-line-utilities", "science"] [dependencies] # for local development, look in the libradicl git repository # but when published, pull the specified version -libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.9.0" } +libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.9.1" } anyhow = "1.0.86" arrayvec = "0.7.4" ahash = "0.8.11" @@ -43,9 +43,9 @@ bincode = "1.3.3" bstr = "1.9.1" crossbeam-channel = "0.5.13" crossbeam-queue = "0.3.11" -typed-builder = "0.18.2" +typed-builder = "0.20.0" indicatif = "0.17.8" -needletail = "0.5.1" +needletail = "0.6.0" petgraph = "0.6.5" flate2 = "1.0.30" scroll = "0.12.0" @@ -65,17 +65,17 @@ num-format = "0.4.4" num_cpus = "1.16.0" bio-types = { version = "1.0.2", default-features = true, features = ["serde"] } itertools = "0.13.0" -thiserror = "1.0.62" -statrs = "0.17.1" +thiserror = "2.0.4" +statrs = "0.18.0" sce = { git = "https://github.com/parazodiac/SingleCellExperiment", branch = "dev", version = "0.2.0" } # no shenanigans; clap makes breaking "fixes" too often to allow variability # in the version different from what we tested with clap = { version = "4.5.9", features = ["derive", "wrap_help", "cargo", "help", "usage", "string", "error-context"] } -noodles = { version = "0.77.0", features = ["bam", "bgzf", "sam"] } -noodles-util = { version = "0.48.0", features = ["alignment"] } +noodles = { version = "0.85.0", features = ["bam", "bgzf", "sam"] } dashmap = { version = "6.1.0", features = ["serde", "inline"] } +nalgebra = "0.33.2" [profile.release] #debug = true diff --git a/src/atac/cellfilter.rs b/src/atac/cellfilter.rs new file mode 100644 index 0000000..8c47633 --- /dev/null +++ b/src/atac/cellfilter.rs @@ -0,0 +1,542 @@ +use crate::atac::prog_opts::GenPermitListOpts; +use crate::utils as afutils; +use anyhow::{anyhow, Context}; +use bstr::io::BufReadExt; +// use indexmap::map::IndexMap; +use itertools::Itertools; +use libradicl::exit_codes; +use libradicl::rad_types; +use libradicl::rad_types::TagValue; +use libradicl::utils::has_data_left; +use libradicl::BarcodeLookupMap; +use libradicl::{ + chunk, + header::RadPrelude, + record::{AtacSeqReadRecord, AtacSeqRecordContext}, +}; +use num_format::{Locale, ToFormattedString}; +use serde::Serialize; +use serde_json::json; +use slog::crit; +use slog::info; +use std::collections::HashMap; +use std::fs::File; +use std::io::{BufReader, Read}; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; +use std::time::Instant; + +/// Initialize the index map with key being references and position +/// Take the largest reference length (from chromosome) +/// For each chromosome 
divide it into ranges of size `size_range`, up to the maximum reference length
+/// If we had ref length corresponding to each chromosome, we would divide the reference based on its length only
+pub fn initialize_rec_list(
+    b_lens: &mut [u64],
+    ref_lens: &[u64],
+    size_range: u64,
+) -> anyhow::Result<u64> {
+    let mut tot_zeros = 0;
+    let num_refs = ref_lens.len();
+
+    for r in 0..num_refs {
+        let nrange = (ref_lens[r] as f32 / size_range as f32).ceil() as u64;
+        let cum = b_lens[r] + nrange;
+        b_lens[r + 1] = cum;
+        tot_zeros += nrange;
+    }
+    Ok(tot_zeros)
+}
+
+#[derive(Clone, Debug, Serialize)]
+pub enum CellFilterMethod {
+    // use the distance method to
+    // automatically find the knee
+    // in the curve
+    KneeFinding,
+    // barcodes will be provided in the
+    // form of an *unfiltered* external
+    // permit list
+    UnfilteredExternalList(PathBuf, usize),
+}
+
+pub fn update_barcode_hist_unfiltered(
+    hist: &mut HashMap<u64, u64, ahash::RandomState>,
+    unmatched_bc: &mut Vec<u64>,
+    max_ambiguity_read: &mut usize,
+    chunk: &chunk::Chunk<AtacSeqReadRecord>,
+    bins: &mut [u64],
+    blens: &[u64],
+    size_range: u64,
+) -> usize {
+    let mut num_strand_compat_reads = 0usize;
+    for r in &chunk.reads {
+        num_strand_compat_reads += 1;
+        *max_ambiguity_read = r.refs.len().max(*max_ambiguity_read);
+        // lookup the barcode in the map of unfiltered known
+        // barcodes
+        match hist.get_mut(&r.bc) {
+            // if we find a match, increment the count
+            Some(c) => *c += 1,
+            // otherwise, push this into the unmatched list
+            None => {
+                unmatched_bc.push(r.bc);
+            }
+        }
+
+        // for (i,_j) in r.start_pos.iter().enumerate() {
+        if r.start_pos.len() == 1 {
+            let i = 0;
+            let ref_id = r.refs[i];
+            let sp = r.start_pos[i];
+            let bid = sp as u64 / size_range;
+            let ind: usize = (blens[ref_id as usize] + bid) as usize;
+            bins[ind] += 1;
+        }
+    }
+    num_strand_compat_reads
+}
+
+fn populate_unfiltered_barcode_map(
+    br: BufReader<File>,
+    first_bclen: &mut usize,
+    rev_bc: bool,
+) -> HashMap<u64, u64, ahash::RandomState> {
+    let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64);
+    let mut hm = HashMap::with_hasher(s);
+
+    // read through the external unfiltered barcode list
+    // and generate a vector of encoded barcodes
+    // let mut kv = Vec::<u64>::new();
+    for l in br.byte_lines().flatten() {
+        if *first_bclen == 0 {
+            *first_bclen = l.len();
+        } else {
+            assert_eq!(
+                *first_bclen,
+                l.len(),
+                "found barcodes of different lengths {} and {}",
+                *first_bclen,
+                l.len()
+            );
+        }
+        if let Some((_, km, _)) =
+            needletail::bitkmer::BitNuclKmer::new(&l[..], l.len() as u8, false).next()
+        {
+            if rev_bc {
+                let km_rev = needletail::bitkmer::reverse_complement(km);
+                hm.insert(km_rev.0, 0);
+            } else {
+                hm.insert(km.0, 0);
+            }
+        }
+    }
+    hm
+}
+
+#[allow(clippy::unnecessary_unwrap, clippy::too_many_arguments)]
+fn process_unfiltered(
+    mut hm: HashMap<u64, u64, ahash::RandomState>,
+    mut unmatched_bc: Vec<u64>,
+    file_tag_map: &rad_types::TagMap,
+    filter_meth: &CellFilterMethod,
+    output_dir: &PathBuf,
+    version: &str,
+    max_ambiguity_read: usize,
+    num_chunks: u32,
+    cmdline: &str,
+    log: &slog::Logger,
+    bmax: u64,
+    gpl_opts: &GenPermitListOpts,
+) -> anyhow::Result<u64> {
+    // the smallest number of reads we'll allow per barcode
+    let min_freq = match filter_meth {
+        CellFilterMethod::UnfilteredExternalList(_, min_reads) => {
+            info!(log, "minimum num reads for barcode pass = {}", *min_reads);
+            *min_reads as u64
+        }
+        _ => {
+            unimplemented!();
+        }
+    };
+
+    // the set of barcodes we'll keep
+    let mut kept_bc = Vec::<u64>::new();
+
+    // iterate over the count map
+    for (k, v) in hm.iter_mut() {
+        // if this satisfies our requirement for the minimum count
+        // then keep this barcode
+        if *v
>= min_freq { + kept_bc.push(*k); + } else { + // otherwise, we have to add this barcode's + // counts to our unmatched list + for _ in 0..*v { + unmatched_bc.push(*k); + } + // and then reset the counter for this barcode to 0 + *v = 0u64; + } + } + + // drop the absent barcodes from hm + hm.retain(|_, &mut v| v > 0); + + // how many we will keep + let num_passing = kept_bc.len(); + info!( + log, + "num_passing = {}", + num_passing.to_formatted_string(&Locale::en) + ); + let barcode_tag = file_tag_map + .get("cblen") + .expect("tag map must contain cblen"); + let barcode_len: u16 = barcode_tag.try_into()?; + + // let ref_lens = file_tag_map + // .get("ref_lengths") + // .expect("tag map must contain ref_lengths"); + // now, we create a second barcode map with just the barcodes + // for cells we will keep / rescue. + let bcmap2 = BarcodeLookupMap::new(kept_bc, barcode_len as u32); + info!( + log, + "found {} cells with non-trivial number of reads by exact barcode match", + bcmap2.barcodes.len().to_formatted_string(&Locale::en) + ); + + // finally, we'll go through the set of unmatched barcodes + // and try to rescue those that have a *unique* neighbor in the + // list of retained barcodes. + + //let mut found_exact = 0usize; + let mut found_approx = 0usize; + let mut ambig_approx = 0usize; + let mut not_found = 0usize; + + let start_unmatched_time = Instant::now(); + + unmatched_bc.sort_unstable(); + + let mut distinct_unmatched_bc = 0usize; + let mut distinct_recoverable_bc = 0usize; + + // mapping the uncorrected barcode to what it corrects to + let mut corrected_list = Vec::<(u64, u64)>::with_capacity(1_000_000); + + for (count, ubc) in unmatched_bc.iter().dedup_with_count() { + // try to find the unmatched barcode, but + // look up to 1 edit away + match bcmap2.find_neighbors(*ubc, false) { + // if we have a match + (Some(x), n) => { + let cbc = bcmap2.barcodes[x]; + // if the uncorrected barcode had a + // single, unique retained neighbor + if cbc != *ubc && n == 1 { + // then increment the count of this + // barcode by 1 (because we'll correct to it) + if let Some(c) = hm.get_mut(&cbc) { + *c += count as u64; + corrected_list.push((*ubc, cbc)); + } + // this counts as an approximate find + found_approx += count; + distinct_recoverable_bc += 1; + } + // if we had > 1 single-mismatch neighbor + // then don't keep the barcode, but remember + // the count of such events + if n > 1 { + ambig_approx += count; + } + } + // if we had no single-mismatch neighbor + // then this barcode is not_found and gets + // dropped. 
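+            // Tiny worked example (hypothetical barcodes): if the retained
+            // list is {ACGT}, the unmatched barcode ACGA has exactly one
+            // single-edit neighbor, so its counts are transferred to ACGT,
+            // while TTTT has no such neighbor and falls into the arm below.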
+ (None, _) => { + not_found += count; + } + } + distinct_unmatched_bc += 1; + } + let unmatched_duration = start_unmatched_time.elapsed(); + let num_corrected = distinct_recoverable_bc as u64; + + info!( + log, + "There were {} distinct unmatched barcodes, and {} that can be recovered", + distinct_unmatched_bc, + distinct_recoverable_bc + ); + info!( + log, + "Matching unmatched barcodes to retained barcodes took {:?}", unmatched_duration + ); + info!(log, "Of the unmatched barcodes\n============"); + info!( + log, + "\t{} had exactly 1 single-edit neighbor in the retained list", + found_approx.to_formatted_string(&Locale::en) + ); + info!( + log, + "\t{} had >1 single-edit neighbor in the retained list", + ambig_approx.to_formatted_string(&Locale::en) + ); + info!( + log, + "\t{} had no neighbor in the retained list", + not_found.to_formatted_string(&Locale::en) + ); + + let parent = std::path::Path::new(output_dir); + let o_path = parent.join("permit_freq.bin"); + + match afutils::write_permit_list_freq(&o_path, barcode_len, &hm) { + Ok(_) => {} + Err(error) => { + panic!("Error: {}", error); + } + }; + + /* + // don't need this right now + let s_path = parent.join("bcmap.bin"); + let s_file = std::fs::File::create(&s_path).expect("could not create serialization file."); + let mut s_writer = BufWriter::new(&s_file); + bincode::serialize_into(&mut s_writer, &bcmap2).expect("couldn't serialize barcode list."); + */ + + // now that we are done with using hm to count, we can repurpose it as + // the correction map. + for (k, v) in hm.iter_mut() { + // each present barcode corrects to itself + *v = *k; + } + for (uncorrected, corrected) in corrected_list.iter() { + hm.insert(*uncorrected, *corrected); + } + + let pm_path = parent.join("permit_map.bin"); + let pm_file = std::fs::File::create(pm_path).context("could not create serialization file.")?; + let mut pm_writer = BufWriter::new(&pm_file); + bincode::serialize_into(&mut pm_writer, &hm) + .context("couldn't serialize permit list mapping.")?; + + let meta_info = json!({ + "version_str" : version, + "max-ambig-record" : max_ambiguity_read, + "num-chunks" : num_chunks, + "cmd" : cmdline, + "permit-list-type" : "unfiltered", + "gpl_options" : &gpl_opts, + "max-rec-in-bin": bmax + }); + + let m_path = parent.join("generate_permit_list.json"); + let mut m_file = std::fs::File::create(m_path).context("could not create metadata file.")?; + + let meta_info_string = + serde_json::to_string_pretty(&meta_info).context("could not format json.")?; + m_file + .write_all(meta_info_string.as_bytes()) + .context("cannot write to generate_permit_list.json file")?; + + info!( + log, + "total number of distinct corrected barcodes : {}", + num_corrected.to_formatted_string(&Locale::en) + ); + + Ok(num_corrected) +} + +pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result { + let rad_dir = gpl_opts.input_dir; + let output_dir = gpl_opts.output_dir; + + let filter_meth = gpl_opts.fmeth.clone(); + let version = gpl_opts.version; + let cmdline = gpl_opts.cmdline; + let log = gpl_opts.log; + let rc = gpl_opts.rc; + let mut num_chunks = 0; + let size_range: u64 = 100000; + let i_dir = std::path::Path::new(&rad_dir); + let o_dir_path = std::path::Path::new(&output_dir); + // let ref_lens = read_ref_lengths(i_dir)?; + + // should we assume this condition was already checked + // during parsing? 
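+    // Re-checking here is cheap either way, and lets us fail fast with a
+    // clear log message rather than erroring later when the file is opened.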
+ if !i_dir.exists() { + crit!( + log, + "the input rad path {} does not exist", + rad_dir.display() + ); + // std::process::exit(1); + return Err(anyhow!("execution terminated unexpectedly")); + } + + let mut first_bclen = 0usize; + let mut unfiltered_bc_counts = None; + + let p_exi = o_dir_path + .try_exists() + .expect("Can't check existence of path"); + if !p_exi { + std::fs::create_dir_all(o_dir_path).with_context(|| { + format!( + "couldn't create path to output directory {}", + o_dir_path.display() + ) + })?; + } + + if let CellFilterMethod::UnfilteredExternalList(fname, _) = &filter_meth { + let i_file = File::open(fname).context("could not open input file")?; + let br = BufReader::new(i_file); + unfiltered_bc_counts = Some(populate_unfiltered_barcode_map(br, &mut first_bclen, rc)); + info!( + log, + "number of unfiltered bcs read = {}", + unfiltered_bc_counts + .as_ref() + .unwrap() + .len() + .to_formatted_string(&Locale::en) + ); + } + + let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?; + let mut br = BufReader::new(i_file); + let prelude = RadPrelude::from_bytes(&mut br)?; + let hdr = &prelude.hdr; + + info!( + log, + "paired : {:?}, ref_count : {}, num_chunks : {}", + hdr.is_paired != 0, + hdr.ref_count.to_formatted_string(&Locale::en), + hdr.num_chunks.to_formatted_string(&Locale::en), + ); + + // file-level + let fl_tags = &prelude.file_tags; + info!(log, "read {:?} file-level tags", fl_tags.tags.len()); + // read-level + let rl_tags = &prelude.read_tags; + info!(log, "read {:?} read-level tags", rl_tags.tags.len()); + + const BNAME: &str = "barcode"; + // let mut bct: Option = None; + + for rt in &rl_tags.tags { + // if this is one of our tags + if rt.name == BNAME && !rt.typeid.is_int_type() { + crit!( + log, + "currently only RAD types 1--4 are supported for 'b' tags." 
+            );
+            std::process::exit(exit_codes::EXIT_UNSUPPORTED_TAG_TYPE);
+            // if rt.name == BNAME {
+            //     bct = Some(rt.typeid);
+            // }
+        }
+    }
+
+    // alignment-level
+    let al_tags = &prelude.aln_tags;
+    info!(log, "read {:?} alignment-level tags", al_tags.tags.len());
+
+    let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br)?;
+    info!(log, "File-level tag values {:?}", file_tag_map);
+
+    let ref_tag = file_tag_map
+        .get("ref_lengths")
+        .expect("tag must contain ref lengths");
+    let TagValue::ArrayU64(ref_lens) = ref_tag else {
+        return Err(anyhow!("expected the ref_lengths tag to be an ArrayU64"));
+    };
+
+    let record_context = prelude.get_record_context::<AtacSeqRecordContext>()?;
+    let mut num_reads: usize = 0;
+
+    let mut blens: Vec<u64> = vec![0; ref_lens.len() + 1];
+    let tot_bins = initialize_rec_list(&mut blens, ref_lens, size_range)?;
+    let mut bins: Vec<u64> = vec![0; tot_bins as usize];
+
+    // if dealing with the unfiltered type
+    // the set of barcodes that are not an exact match for any known barcodes
+    let mut unmatched_bc: Vec<u64>;
+    // let mut num_orientation_compat_reads = 0usize;
+    let mut max_ambiguity_read = 0usize;
+    let mut num_orientation_compat_reads = 0usize;
+    // Tracking if a unique or a multihit
+
+    match filter_meth {
+        CellFilterMethod::UnfilteredExternalList(_, _min_reads) => {
+            unmatched_bc = Vec::with_capacity(10000000);
+            // the unfiltered_bc_count map must be valid in this branch
+
+            if let Some(mut hmu) = unfiltered_bc_counts {
+                while has_data_left(&mut br).expect("encountered error reading input file") {
+                    let c = chunk::Chunk::<AtacSeqReadRecord>::from_bytes(&mut br, &record_context);
+                    num_orientation_compat_reads += update_barcode_hist_unfiltered(
+                        &mut hmu,
+                        &mut unmatched_bc,
+                        &mut max_ambiguity_read,
+                        &c,
+                        &mut bins,
+                        &blens,
+                        size_range,
+                    );
+                    num_chunks += 1;
+                    num_reads += c.reads.len();
+                }
+
+                let bin_recs_path = output_dir.join("bin_recs.bin");
+                let br_file = std::fs::File::create(bin_recs_path)
+                    .expect("could not create serialization file.");
+                let mut br_writer = BufWriter::new(&br_file);
+                bincode::serialize_into(&mut br_writer, &bins)
+                    .expect("couldn't serialize bins recs.");
+
+                let bin_lens_path = output_dir.join("bin_lens.bin");
+                let bl_file = std::fs::File::create(bin_lens_path)
+                    .expect("could not create serialization file.");
+                let mut bl_writer = BufWriter::new(&bl_file);
+                bincode::serialize_into(&mut bl_writer, &blens)
+                    .expect("couldn't serialize bins lengths.");
+
+                info!(
+                    log,
+                    "observed {} reads ({} orientation consistent) in {} chunks --- max ambiguity read occurs in {} refs",
+                    num_reads.to_formatted_string(&Locale::en),
+                    num_orientation_compat_reads.to_formatted_string(&Locale::en),
+                    num_chunks.to_formatted_string(&Locale::en),
+                    max_ambiguity_read.to_formatted_string(&Locale::en)
+                );
+                process_unfiltered(
+                    hmu,
+                    unmatched_bc,
+                    &file_tag_map,
+                    &filter_meth,
+                    output_dir,
+                    version,
+                    max_ambiguity_read,
+                    num_chunks,
+                    cmdline,
+                    log,
+                    *bins.iter().max().unwrap(),
+                    &gpl_opts,
+                )
+            } else {
+                Ok(0)
+            }
+        }
+        _ => Ok(0),
+    }
+}
diff --git a/src/atac/collate.rs b/src/atac/collate.rs
new file mode 100644
index 0000000..48f70b5
--- /dev/null
+++ b/src/atac/collate.rs
@@ -0,0 +1,811 @@
+/*
+ * Copyright (c) 2020-2024 COMBINE-lab.
+ *
+ * This file is part of alevin-fry
+ * (see https://www.github.com/COMBINE-lab/alevin-fry).
+ * + * License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause + */ + +use anyhow::{anyhow, Context}; +use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; +use slog::{crit, info}; +//use anyhow::{anyhow, Result}; +use crate::constants as afconst; +use crate::utils::InternalVersionInfo; +use crossbeam_queue::ArrayQueue; +// use dashmap::DashMap; + +use libradicl::chunk; +use libradicl::header::{RadHeader, RadPrelude}; +use libradicl::rad_types; +use libradicl::record::AtacSeqReadRecord; +use libradicl::schema::{CollateKey, TempCellInfo}; + +use num_format::{Locale, ToFormattedString}; +use scroll::{Pread, Pwrite}; +use serde_json::json; +use std::collections::HashMap; +use std::fs::File; +use std::io::BufReader; +use std::io::{BufWriter, Cursor, Read, Seek, Write}; +use std::iter::FromIterator; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread; + +#[allow(clippy::too_many_arguments)] +pub fn collate( + input_dir: P1, + rad_dir: P2, + num_threads: u32, + max_records: u32, + compress_out: bool, + cmdline: &str, + version_str: &str, + //expected_ori: Strand, + log: &slog::Logger, +) -> anyhow::Result<()> +where + P1: Into, + P2: AsRef, +{ + let input_dir = input_dir.into(); + let parent = std::path::Path::new(input_dir.as_path()); + + // open the metadata file and read the json + let gpl_path = parent.join("generate_permit_list.json"); + let meta_data_file = File::open(&gpl_path) + .with_context(|| format!("Could not open the file {:?}.", gpl_path.display()))?; + let mdata: serde_json::Value = serde_json::from_reader(meta_data_file)?; + + let calling_version = InternalVersionInfo::from_str(version_str)?; + let vd: InternalVersionInfo; + match mdata.get("version_str") { + Some(vs) => match vs.as_str() { + Some(s) => { + vd = InternalVersionInfo::from_str(s)?; + } + None => { + return Err(anyhow!("The version_str field must be a string")); + } + }, + None => { + return Err(anyhow!("The generate_permit_list.json file does not contain a version_str field. 
Please re-run the generate-permit-list step with a newer version of alevin-fry")); + } + }; + + if let Err(es) = calling_version.is_compatible_with(&vd) { + return Err(anyhow!(es)); + } + + // if only an *old* version of the permit_freq is present, then complain and exit + if parent.join("permit_freq.tsv").exists() && !parent.join("permit_freq.bin").exists() { + crit!(log, "The file permit_freq.bin doesn't exist, please rerun alevin-fry generate-permit-list command."); + // std::process::exit(1); + return Err(anyhow!("execution terminated unexpectedly")); + } + + // open file + let freq_file = + std::fs::File::open(parent.join("permit_freq.bin")).context("couldn't open file")?; + + // header buffer + let mut rbuf = [0u8; 8]; + + // read header + let mut rdr = BufReader::new(&freq_file); + rdr.read_exact(&mut rbuf) + .context("couldn't read freq file header")?; + let freq_file_version = rbuf + .pread::(0) + .context("couldn't read freq file version")?; + // make sure versions match + if freq_file_version > afconst::PERMIT_FILE_VER { + crit!(log, + "The permit_freq.bin file had version {}, but this version of alevin-fry requires version {}", + freq_file_version, afconst::PERMIT_FILE_VER + ); + return Err(anyhow!("execution terminated unexpectedly")); + } + + // read the barcode length + rdr.read_exact(&mut rbuf) + .context("couldn't read freq file buffer")?; + let _bc_len = rbuf + .pread::(0) + .context("couldn't read freq file barcode length")?; + + // read the barcode -> frequency hashmap + let freq_hm: HashMap = + bincode::deserialize_from(rdr).context("couldn't deserialize barcode to frequency map.")?; + let total_to_collate = freq_hm.values().sum(); + let mut tsv_map = Vec::from_iter(freq_hm); + + // sort this so that we deal with largest cells (by # of reads) first + // sort in _descending_ order by count. + tsv_map.sort_unstable_by_key(|&a: &(u64, u64)| std::cmp::Reverse(a.1)); + + /* + let est_num_rounds = (total_to_collate as f64 / max_records as f64).ceil() as u64; + info!( + log, + "estimated that collation would require {} passes over input.", est_num_rounds + ); + // if est_num_rounds > 2 { + info!(log, "executing temporary file scatter-gather strategy."); + */ + + collate_with_temp( + input_dir, + rad_dir, + num_threads, + max_records, + tsv_map, + total_to_collate, + compress_out, + cmdline, + version_str, + log, + ) + + /*} else { + info!(log, "executing multi-pass strategy."); + collate_in_memory_multipass( + input_dir, + rad_dir, + num_threads, + max_records, + tsv_map, + total_to_collate, + log, + ) + }*/ +} + +#[derive(Debug)] +pub enum FilterType { + Filtered, + Unfiltered, +} + +pub fn get_filter_type(mdata: &serde_json::Value, log: &slog::Logger) -> FilterType { + if let Some(fts) = mdata.get("permit-list-type") { + let ft = match fts.as_str() { + Some("unfiltered") => FilterType::Unfiltered, + Some("filtered") => FilterType::Filtered, + _ => FilterType::Filtered, + }; + ft + } else { + info!( + log, + "permit-list-type key not present in JSON file; assuming list is filtered." + ); + FilterType::Filtered + } +} + +pub fn get_most_ambiguous_record(mdata: &serde_json::Value, log: &slog::Logger) -> usize { + if let Some(mar) = mdata.get("max-ambig-record") { + match mar.as_u64() { + Some(mv) => mv as usize, + _ => 2500_usize, + } + } else { + info!( + log, + "max-ambig-record key not present in JSON file; using default of 2,500. Please consider upgrading alevin-fry." 
+ ); + 2500_usize + } +} + +pub fn get_num_chunks(mdata: &serde_json::Value, log: &slog::Logger) -> anyhow::Result { + if let Some(mar) = mdata.get("num-chunks") { + match mar.as_u64() { + Some(mv) => Ok(mv), + _ => Err(anyhow!("Error parsing num-chunks")), + } + } else { + info!(log, "num-chunks key not present in JSON file;"); + Err(anyhow!("num-chunks key not present")) + } +} + +pub fn correct_unmapped_counts( + correct_map: &Arc>, + unmapped_file: &std::path::Path, + parent: &std::path::Path, +) { + let i_file = File::open(unmapped_file).unwrap(); + let mut br = BufReader::new(i_file); + + // enough to hold a key value pair (a u64 key and u32 value) + let mut rbuf = [0u8; std::mem::size_of::() + std::mem::size_of::()]; + + let mut unmapped_count: HashMap = HashMap::new(); + + // pre-populate the output map with all valid keys + // keys (corrected barcodes) with no unmapped reads + // will simply have a value of 0. + //for (&_ubc, &cbc) in correct_map.iter() { + // unmapped_count.entry(cbc).or_insert(0); + //} + + // collect all of the information from the existing + // serialized map (that may contain repeats) + while br.read_exact(&mut rbuf[..]).is_ok() { + let k = rbuf.pread::(0).unwrap(); + let v = rbuf.pread::(std::mem::size_of::()).unwrap(); + // get the corrected key for the raw key + if let Some((&_rk, &ck)) = correct_map.get_key_value(&k) { + *unmapped_count.entry(ck).or_insert(0) += v; + } + } + + let s_path = parent.join("unmapped_bc_count_collated.bin"); + let s_file = std::fs::File::create(s_path).expect("could not create serialization file."); + let mut s_writer = BufWriter::new(&s_file); + bincode::serialize_into(&mut s_writer, &unmapped_count) + .expect("couldn't serialize corrected unmapped bc count."); +} + +#[allow(clippy::too_many_arguments, clippy::manual_clamp)] +pub fn collate_with_temp( + input_dir: P1, + rad_dir: P2, + num_threads: u32, + max_records: u32, + tsv_map: Vec<(u64, u64)>, + total_to_collate: u64, + compress_out: bool, + cmdline: &str, + version: &str, + log: &slog::Logger, +) -> anyhow::Result<()> +where + P1: Into, + P2: AsRef, +{ + // the number of corrected cells we'll write + let expected_output_chunks = tsv_map.len() as u64; + // the parent input directory + let input_dir = input_dir.into(); + let parent = std::path::Path::new(input_dir.as_path()); + + let n_workers = if num_threads > 1 { + (num_threads - 1) as usize + } else { + 1 + }; + + // open the metadata file and read the json + let meta_data_file = File::open(parent.join("generate_permit_list.json")) + .context("could not open the generate_permit_list.json file.")?; + let mdata: serde_json::Value = serde_json::from_reader(&meta_data_file)?; + + let filter_type = get_filter_type(&mdata, log); + let most_ambig_record = get_most_ambiguous_record(&mdata, log); + let num_chunks = get_num_chunks(&mdata, log)?; + + // log the filter type + info!(log, "filter_type = {:?}", filter_type); + info!( + log, + "collated rad file {} be compressed", + if compress_out { "will" } else { "will not" } + ); + // because : + // https://superuser.com/questions/865710/write-to-newfile-vs-overwriting-performance-issue + let cfname = if compress_out { + "map.collated.rad.sz" + } else { + "map.collated.rad" + }; + + // writing the collate metadata + { + let collate_meta = json!({ + "cmd" : cmdline, + "version_str" : version, + "compressed_output" : compress_out, + }); + let cm_path = parent.join("collate.json"); + + let mut cm_file = + std::fs::File::create(cm_path).context("could not create metadata file.")?; + 
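+        // (pretty-printed JSON keeps this provenance record easy to inspect
+        // by hand and to diff between runs)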
+ let cm_info_string = + serde_json::to_string_pretty(&collate_meta).context("could not format json.")?; + cm_file + .write_all(cm_info_string.as_bytes()) + .context("cannot write to collate.json file")?; + } + + let oname = parent.join(cfname); + if oname.exists() { + std::fs::remove_file(&oname) + .with_context(|| format!("could not remove {}", oname.display()))?; + } + + let ofile = File::create(parent.join(cfname)) + .with_context(|| format!("couldn't create directory {}", cfname))?; + let owriter = Arc::new(Mutex::new(BufWriter::with_capacity(1048576, ofile))); + + let i_dir = std::path::Path::new(rad_dir.as_ref()); + + if !i_dir.exists() { + crit!( + log, + "the input RAD path {:?} does not exist", + rad_dir.as_ref() + ); + return Err(anyhow!("invalid input")); + } + + let input_rad_path = i_dir.join("map.rad"); + let i_file = File::open(&input_rad_path).context("couldn't open input RAD file")?; + let mut br = BufReader::new(i_file); + + let hdr = RadHeader::from_bytes(&mut br)?; + + // the exact position at the end of the header, + // precisely sizeof(u64) bytes beyond the num_chunks field. + let end_header_pos = br.get_ref().stream_position().unwrap() - (br.buffer().len() as u64); + + info!( + log, + "paired : {:?}, ref_count : {}, num_chunks : {}", + hdr.is_paired != 0, + hdr.ref_count.to_formatted_string(&Locale::en), + num_chunks.to_formatted_string(&Locale::en) + ); + + // file-level + let fl_tags = rad_types::TagSection::from_bytes(&mut br)?; + info!(log, "read {:?} file-level tags", fl_tags.tags.len()); + // read-level + let rl_tags = rad_types::TagSection::from_bytes(&mut br)?; + info!(log, "read {:?} read-level tags", rl_tags.tags.len()); + // alignment-level + let al_tags = rad_types::TagSection::from_bytes(&mut br)?; + info!(log, "read {:?} alignment-level tags", al_tags.tags.len()); + + // create the prelude and rebind the variables we need + let prelude = RadPrelude::from_header_and_tag_sections(hdr, fl_tags, rl_tags, al_tags); + let rl_tags = &prelude.read_tags; + + let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br); + info!(log, "File-level tag values {:?}", file_tag_map); + + let bct = rl_tags.tags[0].typeid; + + // the exact position at the end of the header + file tags + let pos = br.get_ref().stream_position().unwrap() - (br.buffer().len() as u64); + + // copy the header + { + // we want to copy up to the end of the header + // minus the num chunks (sizeof u64), and then + // write the actual number of chunks we expect. + let chunk_bytes = std::mem::size_of::() as u64; + let take_pos = end_header_pos - chunk_bytes; + + // This temporary file pointer and buffer will be dropped + // at the end of this block (scope). 
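+        // We then patch the trailing num_chunks field (the final u64 of the
+        // header) so the copied header advertises the number of corrected
+        // cells we expect to write, rather than the input's chunk count.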
+ let mut rfile = File::open(&input_rad_path).context("Couldn't open input RAD file")?; + let mut hdr_buf = Cursor::new(vec![0u8; pos as usize]); + + rfile + .read_exact(hdr_buf.get_mut()) + .context("couldn't read input file header")?; + hdr_buf.set_position(take_pos); + hdr_buf + .write_all(&expected_output_chunks.to_le_bytes()) + .context("couldn't write num_chunks")?; + hdr_buf.set_position(0); + + // compress the header buffer to a compressed buffer + if compress_out { + let mut compressed_buf = + snap::write::FrameEncoder::new(Cursor::new(Vec::::with_capacity(pos as usize))); + compressed_buf + .write_all(hdr_buf.get_ref()) + .context("could not compress the output header.")?; + hdr_buf = compressed_buf + .into_inner() + .context("couldn't unwrap the FrameEncoder.")?; + hdr_buf.set_position(0); + } + + if let Ok(mut oput) = owriter.lock() { + oput.write_all(hdr_buf.get_ref()) + .context("could not write the output header.")?; + } + } + + // get the correction map + let cmfile = std::fs::File::open(parent.join("permit_map.bin")) + .context("couldn't open output permit_map.bin file")?; + let correct_map: Arc> = Arc::new(bincode::deserialize_from(&cmfile).unwrap()); + + // NOTE: the assumption of where the unmapped file will be + // should be robustified + let unmapped_file = i_dir.join("unmapped_bc_count.bin"); + correct_unmapped_counts(&correct_map, &unmapped_file, parent); + + info!( + log, + "deserialized correction map of length : {}", + correct_map.len().to_formatted_string(&Locale::en) + ); + + let cc = chunk::ChunkConfigAtac { + num_chunks, + bc_type: libradicl::rad_types::encode_type_tag(bct).expect("valid barcode tag type"), + }; + + // TODO: see if we can do this without the Arc + let mut output_cache = Arc::new(HashMap::>::new()); + + // max_records is the max size of each intermediate file + let mut total_allocated_records = 0; + let mut allocated_records = 0; + let mut temp_buckets = vec![( + 0, + 0, + Arc::new(libradicl::TempBucket::from_id_and_parent(0, parent)), + )]; + + let max_records_per_thread = (max_records / n_workers as u32) + 1; + // The tsv_map tells us, for each "true" barcode + // how many records belong to it. We can scan this information + // to determine what true barcodes we will keep in memory. + let mut num_bucket_chunks = 0u32; + { + let moutput_cache = Arc::make_mut(&mut output_cache); + for rec in tsv_map.iter() { + // corrected barcode points to the bucket + // file. 
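+            // Buckets are sealed greedily: once a bucket has been assigned
+            // roughly max_records / n_workers records we start a new one, so
+            // the gather phase later receives similarly sized units of work.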
+ moutput_cache.insert(rec.0, temp_buckets.last().unwrap().2.clone()); + allocated_records += rec.1; + num_bucket_chunks += 1; + if allocated_records >= (max_records_per_thread as u64) { + temp_buckets.last_mut().unwrap().0 = num_bucket_chunks; + temp_buckets.last_mut().unwrap().1 = allocated_records as u32; + let tn = temp_buckets.len() as u32; + temp_buckets.push(( + 0, + 0, + Arc::new(libradicl::TempBucket::from_id_and_parent(tn, parent)), + )); + total_allocated_records += allocated_records; + allocated_records = 0; + num_bucket_chunks = 0; + } + } + } + if num_bucket_chunks > 0 { + temp_buckets.last_mut().unwrap().0 = num_bucket_chunks; + temp_buckets.last_mut().unwrap().1 = allocated_records as u32; + } + total_allocated_records += allocated_records; + info!(log, "Generated {} temporary buckets.", temp_buckets.len()); + + let sty = ProgressStyle::default_bar() + .template( + "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos:>7}/{len:7} {msg}", + ) + .expect("ProgressStyle template was invalid") + .progress_chars("╢▌▌░╟"); + + let pbar_inner = ProgressBar::with_draw_target( + Some(cc.num_chunks), + ProgressDrawTarget::stderr_with_hz(5u8), // update at most 5 times/sec. + ); + + pbar_inner.set_style(sty.clone()); + pbar_inner.tick(); + + // create a thread-safe queue based on the number of worker threads + let q = Arc::new(ArrayQueue::<(usize, Vec)>::new(4 * n_workers)); + + // the number of cells left to process + let chunks_to_process = Arc::new(AtomicUsize::new(cc.num_chunks as usize)); + + let mut thread_handles: Vec> = Vec::with_capacity(n_workers); + + let min_rec_len = 24usize; // smallest size an individual record can be loaded in memory + let max_rec = max_records as usize; + let num_buckets = temp_buckets.len(); + let num_threads = n_workers; + let loc_buffer_size = (min_rec_len + (most_ambig_record * 4_usize) - 4_usize).max( + (1000_usize.max((min_rec_len * max_rec) / (num_buckets * num_threads))).min(262_144_usize), + ); //131072_usize); + + // for each worker, spawn off a thread + for _worker in 0..n_workers { + // each thread will need to access the work queue + let in_q = q.clone(); + // the output cache and correction map + let oc = output_cache.clone(); + let correct_map = correct_map.clone(); + // the number of chunks remaining to be processed + let chunks_remaining = chunks_to_process.clone(); + // and knowledge of the UMI and BC types + let bc_type = rad_types::decode_int_type_tag(cc.bc_type).expect("unknown barcode type id."); + + let nbuckets = temp_buckets.len(); + let loc_temp_buckets = temp_buckets.clone(); + //let owrite = owriter.clone(); + // now, make the worker thread + let handle = std::thread::spawn(move || { + // old code + //let mut local_buffers = vec![Cursor::new(vec![0u8; loc_buffer_size]); nbuckets]; + + // new approach (how much does this extra complexity matter?) + // to avoid having a vector of cursors, where each cursor points to + // a completely different vector (thus scattering memory of threads + // and incurring the extra overhead for the capacity of the inner + // vectors), we will have one backing chunk of memory. 
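+            // Rough layout sketch (hypothetical sizes): with nbuckets = 3 and
+            // loc_buffer_size = 4096, the backing vector holds 12288 bytes and
+            // the per-bucket cursors view [0, 4096), [4096, 8192), [8192, 12288).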
+ // NOTE: once stabilized, maybe using as_chunks_mut here + // will be simpler (https://doc.rust-lang.org/std/primitive.slice.html#method.as_chunks_mut) + + // the memory that will back our temporary buffers + let mut local_buffer_backing = vec![0u8; loc_buffer_size * nbuckets]; + // the vector of cursors we will use to write into our temporary buffers + let mut local_buffers: Vec> = Vec::with_capacity(nbuckets); + // The below is a bit tricky in rust but we basically break off each mutable slice + // piece by piece. Since `as_mut_slice(n)` returns the slices [0,n), [n,end) we + // expect to chop off a first part of size `loc_buffer_size` a total of `nbuckets` + // times. + let mut tslice = local_buffer_backing.as_mut_slice(); + for _ in 0..nbuckets { + let (first, rest) = tslice.split_at_mut(loc_buffer_size); + //let brange = (bn*loc_buffer_size..(bn+1)*loc_buffer_size); + local_buffers.push(Cursor::new(first)); + tslice = rest; + } + + // pop from the work queue until everything is + // processed + while chunks_remaining.load(Ordering::SeqCst) > 0 { + if let Some((_chunk_num, buf)) = in_q.pop() { + chunks_remaining.fetch_sub(1, Ordering::SeqCst); + let mut nbr = BufReader::new(&buf[..]); + libradicl::dump_corrected_cb_chunk_to_temp_file_atac( + &mut nbr, + &bc_type, + &correct_map, + &oc, + &mut local_buffers, + loc_buffer_size, + CollateKey::Barcode, + ); + } + } + + // empty any remaining local buffers + for (bucket_id, lb) in local_buffers.iter().enumerate() { + let len = lb.position() as usize; + if len > 0 { + let mut filebuf = loc_temp_buckets[bucket_id].2.bucket_writer.lock().unwrap(); + filebuf.write_all(&lb.get_ref()[0..len]).unwrap(); + } + } + // return something more meaningful + 0 + }); + + thread_handles.push(handle); + } // for each worker + + // read each chunk + pbar_inner.reset(); + let pb_msg = format!( + "processing {} / {} total records", + total_allocated_records, total_to_collate + ); + pbar_inner.set_message(pb_msg); + + // read chunks from the input file and pass them to the + // worker threads. + let mut buf = vec![0u8; 65536]; + for cell_num in 0..(cc.num_chunks as usize) { + let (nbytes_chunk, nrec_chunk) = chunk::Chunk::::read_header(&mut br); + buf.resize(nbytes_chunk as usize, 0); + buf.pwrite::(nbytes_chunk, 0)?; + buf.pwrite::(nrec_chunk, 4)?; + br.read_exact(&mut buf[8..]).unwrap(); + + let mut bclone = (cell_num, buf.clone()); + // keep trying until we can push this payload + while let Err(t) = q.push(bclone) { + bclone = t; + // no point trying to push if the queue is full + while q.is_full() {} + } + pbar_inner.inc(1); + } + pbar_inner.finish(); + + // wait for the worker threads to finish + for h in thread_handles.drain(0..) { + match h.join() { + Ok(_) => {} + Err(_e) => { + info!(log, "thread panicked"); + } + } + } + pbar_inner.finish_with_message("partitioned records into temporary files."); + drop(q); + + // At this point, we are done with the "scatter" + // phase of writing the records to the corresponding + // intermediate files. Now, we'll begin the gather + // phase of collating the temporary files and merging + // them into the final output file. 
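+    // First, flush every temporary bucket and sanity-check that the record
+    // and byte counts observed by the writers match what we allocated.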
+ + for (i, temp_bucket) in temp_buckets.iter().enumerate() { + // make sure we flush each temp bucket + temp_bucket + .2 + .bucket_writer + .lock() + .unwrap() + .flush() + .context("could not flush temporary output file!")?; + // a sanity check that we have the correct number of records + // and the expected number of bytes in each file + let expected = temp_bucket.1; + let observed = temp_bucket.2.num_records_written.load(Ordering::SeqCst); + assert_eq!(expected, observed); + + let md = std::fs::metadata(parent.join(format!("bucket_{}.tmp", i)))?; + let expected_bytes = temp_bucket.2.num_bytes_written.load(Ordering::SeqCst); + let observed_bytes = md.len(); + assert_eq!(expected_bytes, observed_bytes); + } + + //std::process::exit(1); + + // to hold the temp buckets threads will process + let slack = (n_workers / 2).max(1_usize); + let temp_bucket_queue_size = slack + n_workers; + let fq = Arc::new(ArrayQueue::<( + u32, + u32, + std::sync::Arc, + )>::new(temp_bucket_queue_size)); + // the number of cells left to process + let buckets_to_process = Arc::new(AtomicUsize::new(temp_buckets.len())); + + let pbar_gather = ProgressBar::new(temp_buckets.len() as u64); + pbar_gather.set_style(sty); + pbar_gather.tick(); + + // for each worker, spawn off a thread + for _worker in 0..n_workers { + // each thread will need to access the work queue + let in_q = fq.clone(); + // the output cache and correction map + let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64); + let mut cmap = HashMap::::with_hasher(s); + // alternative strategy + // let mut cmap = HashMap::::with_hasher(s); + + // the number of chunks remaining to be processed + let buckets_remaining = buckets_to_process.clone(); + // and knowledge of the BC types + let bc_type = + rad_types::decode_int_type_tag(cc.bc_type).context("unknown barcode type id.")?; + + // have access to the input directory + let input_dir: PathBuf = input_dir.clone(); + // the output file + let owriter = owriter.clone(); + // and the progress bar + let pbar_gather = pbar_gather.clone(); + + // now, make the worker threads + let handle = std::thread::spawn(move || { + let mut local_chunks = 0u64; + let parent = std::path::Path::new(&input_dir); + // pop from the work queue until everything is + // processed + while buckets_remaining.load(Ordering::SeqCst) > 0 { + if let Some(temp_bucket) = in_q.pop() { + buckets_remaining.fetch_sub(1, Ordering::SeqCst); + cmap.clear(); + + let fname = parent.join(format!("bucket_{}.tmp", temp_bucket.2.bucket_id)); + // create a new handle for reading + let tfile = std::fs::File::open(&fname).expect("couldn't open temporary file."); + let mut treader = BufReader::new(tfile); + + local_chunks += libradicl::collate_temporary_bucket_twopass_atac( + &mut treader, + &bc_type, + temp_bucket.1, + &owriter, + compress_out, + &mut cmap, + ) as u64; + + // we don't need the file or reader anymore + drop(treader); + std::fs::remove_file(fname).expect("could not delete temporary file."); + + pbar_gather.inc(1); + } + } + local_chunks + }); + thread_handles.push(handle); + } // for each worker + + // push the temporary buckets onto the work queue to be dispatched + // by the worker threads. 
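+    // (ArrayQueue::push hands the payload back on failure, so we simply
+    // retry until a worker frees up space)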
+    for temp_bucket in temp_buckets {
+        let mut bclone = temp_bucket.clone();
+        // keep trying until we can push this payload
+        while let Err(t) = fq.push(bclone) {
+            bclone = t;
+            // no point trying to push if the queue is full
+            while fq.is_full() {}
+        }
+        let expected = temp_bucket.1;
+        let observed = temp_bucket.2.num_records_written.load(Ordering::SeqCst);
+        assert_eq!(expected, observed);
+    }
+
+    // wait for all of the workers to finish
+    let mut num_output_chunks = 0u64;
+    for h in thread_handles.drain(0..) {
+        match h.join() {
+            Ok(c) => {
+                num_output_chunks += c;
+            }
+            Err(_e) => {
+                info!(log, "thread panicked");
+            }
+        }
+    }
+    pbar_gather.finish_with_message("gathered all temp files.");
+
+    // make sure we wrote the same number of records that our
+    // file suggested we should.
+    assert_eq!(total_allocated_records, total_to_collate);
+
+    info!(
+        log,
+        "writing num output chunks ({}) to header",
+        num_output_chunks.to_formatted_string(&Locale::en)
+    );
+
+    info!(
+        log,
+        "expected number of output chunks {}",
+        expected_output_chunks.to_formatted_string(&Locale::en)
+    );
+
+    assert_eq!(
+        expected_output_chunks, num_output_chunks,
+        "expected to write {} chunks but wrote {}",
+        expected_output_chunks.to_formatted_string(&Locale::en),
+        num_output_chunks.to_formatted_string(&Locale::en),
+    );
+
+    owriter.lock().unwrap().flush()?;
+    info!(
+        log,
+        "finished collating input rad file {:?}.",
+        i_dir.join("map.rad")
+    );
+    Ok(())
+}
diff --git a/src/atac/deduplicate.rs b/src/atac/deduplicate.rs
new file mode 100644
index 0000000..ff4963d
--- /dev/null
+++ b/src/atac/deduplicate.rs
@@ -0,0 +1,271 @@
+use crate::atac::prog_opts::DeduplicateOpts;
+use anyhow::Context;
+use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
+use libradicl::header::RadPrelude;
+use libradicl::record::AtacSeqReadRecord;
+use num_format::ToFormattedString;
+use slog::info;
+use std::fs::File;
+use std::io::{BufRead, BufReader, Write};
+use std::sync::{atomic::AtomicU32, atomic::Ordering, Arc, Mutex};
+use std::thread;
+pub type MetaChunk = (usize, usize, u32, u32, Vec<u8>);
+use crate::atac::sort::HitInfo;
+use crate::atac::utils as atac_utils;
+use itertools::Itertools;
+use num_format::Locale;
+
+pub fn write_bed(
+    bd_writer_lock: &Arc<Mutex<File>>,
+    hit_info_vec: &[HitInfo],
+    ref_names: &[String],
+    rev: bool,
+    bc_len: u8,
+    count_frag: &Arc<AtomicU32>,
+) {
+    let mut s = String::new();
+    for i in 0..hit_info_vec.len() {
+        if hit_info_vec[i].frag_len < 2000 {
+            let s2 = [
+                ref_names[hit_info_vec[i].chr as usize].clone(),
+                hit_info_vec[i].start.to_string(),
+                (hit_info_vec[i].start + hit_info_vec[i].frag_len as u32).to_string(),
+                atac_utils::get_bc_string(&hit_info_vec[i].barcode, rev, bc_len),
+                hit_info_vec[i].count.to_string(),
+            ]
+            .join("\t");
+            s.push_str(&s2);
+            s.push('\n');
+        } else {
+            count_frag.fetch_add(1, Ordering::SeqCst);
+        }
+    }
+
+    let bd_lock = bd_writer_lock.lock();
+    let writer = &mut *bd_lock.unwrap();
+    writer.write_all(s.as_bytes()).unwrap();
+}
+
+pub fn deduplicate(dedup_opts: DeduplicateOpts) -> anyhow::Result<()> {
+    let parent = std::path::Path::new(dedup_opts.input_dir);
+    let log = dedup_opts.log;
+    let collate_md_file =
+        File::open(parent.join("collate.json")).context("could not open the collate.json file.")?;
+    let collate_md: serde_json::Value = serde_json::from_reader(&collate_md_file)?;
+
+    // is the collated RAD file compressed?
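+    // (collate records this in the `compressed_output` field of collate.json,
+    // so we can pick the right decoder without sniffing the file itself)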
+    let compressed_input = collate_md["compressed_output"]
+        .as_bool()
+        .context("could not read compressed_output field from collate metadata.")?;
+
+    if compressed_input {
+        let i_file =
+            File::open(parent.join("map.collated.rad.sz")).context("run collate before quant")?;
+        // let metadata = i_file.metadata()?;
+        let br = BufReader::new(snap::read::FrameDecoder::new(&i_file));
+        // let file_len = metadata.len();
+        info!(
+            log,
+            "quantifying from compressed, collated RAD file {:?}", i_file
+        );
+        // Ok(())
+        do_deduplicate(br, dedup_opts)
+    } else {
+        let i_file =
+            File::open(parent.join("map.collated.rad")).context("run collate before quant")?;
+        // let metadata = i_file.metadata()?;
+        // let file_len = metadata.len();
+        let br = BufReader::new(i_file);
+
+        info!(
+            log,
+            "quantifying from uncompressed, collated RAD file {:?}",
+            parent.join("map.collated.rad")
+        );
+
+        do_deduplicate(br, dedup_opts)
+    }
+}
+
+pub fn do_deduplicate<T: BufRead>(mut br: T, dedup_opts: DeduplicateOpts) -> anyhow::Result<()> {
+    let num_threads = dedup_opts.num_threads;
+
+    let n_workers = if num_threads > 1 {
+        (num_threads - 1) as usize
+    } else {
+        1
+    };
+
+    let prelude = RadPrelude::from_bytes(&mut br).unwrap();
+    let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br).unwrap();
+    let log = &dedup_opts.log;
+
+    let hdr = &prelude.hdr;
+    let refs = &hdr.ref_names;
+    let num_multimappings = Arc::new(AtomicU32::new(0_u32));
+    let num_dedup = Arc::new(AtomicU32::new(0_u32));
+    let num_frag_counts = Arc::new(AtomicU32::new(0_u32)); // fragments larger than 2000
+    let num_non_mapped_pair = Arc::new(AtomicU32::new(0_u32)); // records that did not map as a proper pair
+
+    if let Ok(summary) = prelude.summary(None) {
+        println!("{}", summary);
+    }
+    info!(
+        log,
+        "paired : {:?}, ref_count : {}, num_chunks : {}",
+        hdr.is_paired != 0,
+        hdr.ref_count.to_formatted_string(&Locale::en),
+        hdr.num_chunks.to_formatted_string(&Locale::en)
+    );
+
+    let parent = std::path::Path::new(dedup_opts.input_dir);
+    let bed_path = parent.join("map.bed");
+    let num_chunks = hdr.num_chunks;
+
+    // let bc_unmapped_file = File::open(parent.join("unmapped_bc_count_collated.bin")).unwrap();
+    // let bc_unmapped_map: Arc<HashMap<u64, u32>> =
+    //     Arc::new(bincode::deserialize_from(&bc_unmapped_file).unwrap());
+
+    let fl_tags = &prelude.file_tags;
+    info!(log, "read {:?} file-level tags", fl_tags.tags.len());
+    // read-level
+    let rl_tags = &prelude.read_tags;
+    info!(log, "read {:?} read-level tags", rl_tags.tags.len());
+    // alignment-level
+    let al_tags = &prelude.aln_tags;
+    info!(log, "read {:?} alignment-level tags", al_tags.tags.len());
+
+    info!(log, "File-level tag values {:?}", file_tag_map);
+
+    let barcode_tag = file_tag_map
+        .get("cblen")
+        .expect("tag map must contain cblen");
+    let barcode_len: u16 = barcode_tag.try_into()?;
+
+    let pbar = ProgressBar::with_draw_target(
+        Some(num_chunks),
+        ProgressDrawTarget::stderr_with_hz(5u8), // update at most 5 times/sec.
+ ); + pbar.set_style( + ProgressStyle::default_bar() + .template( + "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos:>7}/{len:7} {msg}", + ) + .expect("ProgressStyle template was invalid.") + .progress_chars("╢▌▌░╟"), + ); + + let bed_writer = Arc::new(Mutex::new(File::create(bed_path).unwrap())); + let mut thread_handles: Vec> = Vec::with_capacity(n_workers); + + let chunk_reader = libradicl::readers::ParallelChunkReader::::new( + &prelude, + std::num::NonZeroUsize::new(n_workers).unwrap(), + ); + + for _worker in 0..n_workers { + let rd = chunk_reader.is_done(); + let q = chunk_reader.get_queue(); + let bd = bed_writer.clone(); + let refs = refs.clone(); + let num_multimappings = num_multimappings.clone(); + let num_dedup = num_dedup.clone(); + let num_frag_counts = num_frag_counts.clone(); + let num_non_mapped_pair = num_non_mapped_pair.clone(); + + let handle = std::thread::spawn(move || { + let mut nrec_processed = 0_usize; + while !rd.load(Ordering::SeqCst) { + while let Some(meta_chunk) = q.pop() { + for c in meta_chunk.iter() { + nrec_processed += c.nrec as usize; + let mut hit_info_vec: Vec = Vec::with_capacity(c.nrec as usize); + // println!("Chunk :: nbytes: {}, nrecs: {}", c.nbytes, c.nrec); + assert_eq!(c.nrec as usize, c.reads.len()); + for r in c.reads.iter() { + let na = r.refs.len(); + // add a field tracking such counts + if na == 1 && r.map_type[0] == 4 { + hit_info_vec.push(HitInfo { + chr: r.refs[0], + start: r.start_pos[0], + frag_len: r.frag_lengths[0], + barcode: r.bc, + count: 0, + }) + } else if na > 1 { + num_multimappings.fetch_add(1, Ordering::SeqCst); + continue; + } else { + num_non_mapped_pair.fetch_add(1, Ordering::SeqCst); + } + } + hit_info_vec.sort_unstable(); + let mut h_updated: Vec = Vec::with_capacity(hit_info_vec.len()); + for (count, hv) in hit_info_vec.iter_mut().dedup_with_count() { + hv.count = count as u16; + h_updated.push(*hv); + if count > 1 { + num_dedup.fetch_add(1, Ordering::SeqCst); + } + } + drop(hit_info_vec); + write_bed( + &bd, + &h_updated, + &refs, + dedup_opts.rev, + barcode_len as u8, + &num_frag_counts, + ); + } + } + } + nrec_processed + }); + thread_handles.push(handle); + } + // let header_offset = rad_reader.get_byte_offset(); + // let pbar = ProgressBar::new(file_len - header_offset); + // pbar.set_style( + // ProgressStyle::with_template( + // "[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}", + // ) + // .unwrap() + // .progress_chars("##-"), + // ); + pbar.set_draw_target(ProgressDrawTarget::stderr_with_hz(5)); + // let cb = |new_bytes: u64, _new_rec: u64| { + // pbar.inc(new_bytes); + // }; + // let _ = rad_reader.start_chunk_parsing(Some(cb)); //libradicl::readers::EMPTY_METACHUNK_CALLBACK); + let mut total_processed = 0; + for handle in thread_handles { + total_processed += handle.join().expect("The parsing thread panicked"); + } + pbar.finish_with_message(format!( + "finished parsing RAD file; processed {} total records\n", + total_processed + )); + info!( + log, + "Number of records with greater than 1 mapping {}", + num_multimappings.load(Ordering::SeqCst) + ); + info!( + log, + "Number of records that are deduplicated {}", + num_dedup.load(Ordering::SeqCst) + ); + info!( + log, + "Number of records that are not mapped pairs {}", + num_non_mapped_pair.load(Ordering::SeqCst) + ); + info!( + log, + "Number of records that have frag length > 2000 {}", + num_frag_counts.load(Ordering::SeqCst) + ); + Ok(()) +} diff --git a/src/atac/mod.rs b/src/atac/mod.rs new file mode 100644 index 
0000000..76cea06
--- /dev/null
+++ b/src/atac/mod.rs
@@ -0,0 +1,7 @@
+mod cellfilter;
+mod collate;
+mod deduplicate;
+mod prog_opts;
+pub mod run;
+mod sort;
+mod utils;
diff --git a/src/atac/prog_opts.rs b/src/atac/prog_opts.rs
new file mode 100644
index 0000000..3614045
--- /dev/null
+++ b/src/atac/prog_opts.rs
@@ -0,0 +1,28 @@
+use crate::atac::cellfilter::CellFilterMethod;
+use serde::Serialize;
+use slog;
+use std::path::PathBuf;
+use typed_builder::TypedBuilder;
+
+#[derive(TypedBuilder, Debug, Serialize)]
+pub struct GenPermitListOpts<'a, 'b, 'c, 'd, 'e> {
+    pub input_dir: &'a PathBuf,
+    pub output_dir: &'b PathBuf,
+    pub fmeth: CellFilterMethod,
+    pub rc: bool,
+    pub cmdline: &'c str,
+    pub version: &'d str,
+    #[serde(skip_serializing)]
+    pub log: &'e slog::Logger,
+}
+
+#[derive(TypedBuilder, Debug, Serialize)]
+pub struct DeduplicateOpts<'a, 'b, 'c, 'd> {
+    pub input_dir: &'a PathBuf,
+    pub num_threads: u32,
+    pub rev: bool,
+    pub cmdline: &'b str,
+    pub version: &'c str,
+    #[serde(skip_serializing)]
+    pub log: &'d slog::Logger,
+}
diff --git a/src/atac/run.rs b/src/atac/run.rs
new file mode 100644
index 0000000..0645c07
--- /dev/null
+++ b/src/atac/run.rs
@@ -0,0 +1,127 @@
+use crate::atac::cellfilter::generate_permit_list;
+use crate::atac::cellfilter::CellFilterMethod;
+use crate::atac::collate::collate;
+use crate::atac::deduplicate::deduplicate;
+use crate::atac::prog_opts::{DeduplicateOpts, GenPermitListOpts};
+use crate::atac::sort::sort;
+use anyhow::bail;
+use clap::ArgMatches;
+use slog::{crit, info, o, warn, Logger};
+use std::path::PathBuf;
+
+pub fn run(opts: &ArgMatches, version: &str, cmdline: &str, log: &Logger) -> anyhow::Result<()> {
+    if let Some(t) = opts.subcommand_matches("generate-permit-list") {
+        let input_dir: &PathBuf = t.get_one("input").expect("no input directory specified");
+        let output_dir: &PathBuf = t
+            .get_one("output-dir")
+            .expect("no output directory specified");
+        let mut fmeth = CellFilterMethod::KneeFinding;
+
+        if let Some(v) = t.get_one::<PathBuf>("unfiltered-pl") {
+            let min_reads: usize = *t
+                .get_one("min-reads")
+                .expect("min-reads must be a valid integer");
+            if min_reads < 1 {
+                crit!(
+                    log,
+                    "min-reads < 1 is not supported, the value {} was provided",
+                    min_reads
+                );
+                std::process::exit(1);
+            }
+            fmeth = CellFilterMethod::UnfilteredExternalList(v.clone(), min_reads);
+        };
+        let rc: bool = *t.get_one("rev-comp").expect("reverse comp must be boolean");
+
+        let gpl_opts = GenPermitListOpts::builder()
+            .input_dir(input_dir)
+            .output_dir(output_dir)
+            .fmeth(fmeth)
+            .rc(rc)
+            .version(version)
+            .cmdline(cmdline)
+            .log(log)
+            .build();
+
+        match generate_permit_list(gpl_opts) {
+            Ok(0) => {
+                warn!(log, "found 0 corrected barcodes; please check the input.");
+            }
+            Err(e) => return Err(e),
+            _ => (),
+        };
+    }
+
+    if let Some(t) = opts.subcommand_matches("collate") {
+        let input_dir: &PathBuf = t.get_one("input-dir").unwrap();
+        let rad_dir: &PathBuf = t.get_one("rad-dir").unwrap();
+        let num_threads = *t.get_one("threads").unwrap();
+        let compress_out = t.get_flag("compress");
+        let max_records: u32 = *t.get_one("max-records").unwrap();
+        collate(
+            input_dir,
+            rad_dir,
+            num_threads,
+            max_records,
+            compress_out,
+            cmdline,
+            version,
+            log,
+        )
+        .expect("could not collate.");
+    }
+
+    if let Some(t) = opts.subcommand_matches("sort") {
+        let input_dir: &PathBuf = t.get_one("input-dir").unwrap();
+        let rad_dir: &PathBuf = t.get_one("rad-dir").unwrap();
+        let num_threads: u32 = *t.get_one("threads").unwrap();
+        let compress_out = t.get_flag("compress");
+        let max_records: u32 = *t.get_one("max-records").unwrap();
+
+        sort(
+            input_dir,
+            rad_dir,
+            num_threads,
+            max_records,
+            compress_out,
+            cmdline,
+            version,
+            log,
+        )
+        .expect("could not sort.");
+    }
+
+    if let Some(t) = opts.subcommand_matches("deduplicate") {
+        let input_dir: &PathBuf = t.get_one("input-dir").unwrap();
+        let num_threads = *t.get_one("threads").unwrap();
+        let rc: bool = *t.get_one("rev-comp").expect("reverse comp must be boolean");
+
+        let dedup_opts = DeduplicateOpts::builder()
+            .input_dir(input_dir)
+            .num_threads(num_threads)
+            .rev(rc)
+            .cmdline(&cmdline)
+            .version(version)
+            .log(&log)
+            .build();
+
+        let parent = std::path::Path::new(&input_dir);
+        let json_path = parent.join("generate_permit_list.json");
+        let col_json_path = parent.join("collate.json");
+
+        if json_path.exists() && col_json_path.exists() {
+            match deduplicate(dedup_opts) {
+                Ok(_) => {}
+                Err(_e) => {
+                    panic!("Could not deduplicate rad file");
+                }
+            };
+        } else {
+            crit!(log,
+                "The provided input directory lacks a generate_permit_list.json or collate.json file; this should not happen."
+            );
+            bail!("The provided input directory lacks a generate_permit_list.json or collate.json file; this should not happen.");
+        }
+    }
+    Ok(())
+}
diff --git a/src/atac/sort.rs b/src/atac/sort.rs
new file mode 100644
index 0000000..ea2e44c
--- /dev/null
+++ b/src/atac/sort.rs
@@ -0,0 +1,837 @@
+use crate::constants as afconst;
+use crate::utils::InternalVersionInfo;
+use anyhow::{anyhow, Context};
+use crossbeam_queue::ArrayQueue;
+use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
+use slog::{crit, info};
+
+use libradicl::chunk;
+use libradicl::header::{RadHeader, RadPrelude};
+use libradicl::rad_types;
+use libradicl::record::AtacSeqReadRecord;
+use libradicl::schema::CollateKey;
+
+use crate::atac::collate::{
+    correct_unmapped_counts, get_filter_type, get_most_ambiguous_record, get_num_chunks,
+};
+use crate::atac::utils as atac_utils;
+use flate2::write::GzEncoder;
+use flate2::Compression;
+use num_format::{Locale, ToFormattedString};
+use scroll::{Pread, Pwrite};
+use serde_json::json;
+use std::cmp;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io;
+use std::io::{BufReader, Cursor, Read, Seek, Write};
+use std::iter::FromIterator;
+use std::path::{Path, PathBuf};
+use std::str::FromStr;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+use std::thread;
+
+#[derive(Debug, PartialEq, Eq, Copy, Clone)]
+pub struct HitInfo {
+    pub chr: u32,
+    pub start: u32,
+    pub frag_len: u16,
+    pub barcode: u64,
+    pub count: u16,
+    // rec_id: u64,
+}
+
+impl Ord for HitInfo {
+    fn cmp(&self, other: &Self) -> cmp::Ordering {
+        if self.chr != other.chr {
+            self.chr.cmp(&other.chr)
+        } else if self.start != other.start {
+            self.start.cmp(&other.start)
+        } else if self.frag_len != other.frag_len {
+            self.frag_len.cmp(&other.frag_len)
+        } else {
+            self.barcode.cmp(&other.barcode)
+        }
+    }
+}
+impl PartialOrd for HitInfo {
+    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+pub fn get_bed_string(
+    hit_info_vec: &[HitInfo],
+    ref_names: &[String],
+    bc_len: u16,
+    rev: bool,
+) -> String {
+    let bc_len: u8 = bc_len.try_into().unwrap();
+    let mut s = "".to_string();
+    let mut count = 1;
+    let mut i = 0;
+
+    // guard against an empty input; `hit_info_vec.len() - 1` below would
+    // underflow, and the indexing after the loop would panic, on an empty slice
+    if hit_info_vec.is_empty() {
+        return s;
+    }
+
+    while i < (hit_info_vec.len() - 1) {
+        if hit_info_vec[i].frag_len < 2000 {
+            if hit_info_vec[i] == hit_info_vec[i + 1] {
+                count += 1;
+            } else {
+                let s2 = [
+                    ref_names[hit_info_vec[i].chr as usize].clone(),
+
hit_info_vec[i].start.to_string(), + (hit_info_vec[i].start + hit_info_vec[i].frag_len as u32).to_string(), + atac_utils::get_bc_string(&hit_info_vec[i].barcode, rev, bc_len), + count.to_string(), + ] + .join("\t"); + s.push_str(&s2); + s.push('\n'); + count = 1; + } + } else { + count = 1; + } + i += 1; + } + if hit_info_vec[i].frag_len < 2000 { + let s2 = [ + ref_names[hit_info_vec[i].chr as usize].clone(), + hit_info_vec[i].start.to_string(), + (hit_info_vec[i].start + hit_info_vec[i].frag_len as u32).to_string(), + atac_utils::get_bc_string(&hit_info_vec[i].barcode, rev, bc_len), + count.to_string(), + ] + .join("\t"); + s.push_str(&s2); + s.push('\n'); + } + s +} + +#[allow(clippy::too_many_arguments, clippy::manual_clamp)] +pub fn sort_temp_bucket( + reader: &mut BufReader, + bct: &rad_types::RadIntId, + barcode_len: u16, + rc: bool, + ref_names: &[String], + nrec: u32, + parent: &Path, + buck_id: u32, + compress: bool, +) -> anyhow::Result<()> { + let mut hit_info_vec: Vec = Vec::with_capacity(nrec as usize); + for _ in 0..(nrec as usize) { + // read the header of the record + // we don't bother reading the whole thing here + // because we will just copy later as need be + let tup = AtacSeqReadRecord::from_bytes_record_header(reader, bct); + // read the alignment records from the input file + if tup.1 > 1 { + continue; + } + let rr = AtacSeqReadRecord::from_bytes_with_header(reader, tup.0, tup.1); + hit_info_vec.push(HitInfo { + chr: rr.refs[0], + start: rr.start_pos[0], + frag_len: rr.frag_lengths[0], + barcode: rr.bc, + count: 0, + }) + } + hit_info_vec.sort_unstable(); + + let bed_string: String = get_bed_string(&hit_info_vec, ref_names, barcode_len, rc); + if compress { + let bd = File::create(parent.join(format!("{}.bed.gz", buck_id))).unwrap(); + let mut encoder = GzEncoder::new(bd, Compression::default()); + encoder.write_all(bed_string.as_bytes()).unwrap(); + encoder.finish()?; + } else { + let mut bd = File::create(parent.join(format!("{}.bed", buck_id))).unwrap(); + bd.write_all(bed_string.as_bytes())?; + } + Ok(()) + // write_bed(&mut bd, &h_updated, &ref_names, barcode_len, rc); +} + +#[allow(clippy::too_many_arguments)] +pub fn sort( + input_dir: P1, + rad_dir: P2, + num_threads: u32, + max_records: u32, + compress_out: bool, + cmdline: &str, + version_str: &str, + //expected_ori: Strand, + log: &slog::Logger, +) -> anyhow::Result<()> +where + P1: Into, + P2: AsRef, +{ + let input_dir = input_dir.into(); + let parent = std::path::Path::new(input_dir.as_path()); + + let gpl_path = parent.join("generate_permit_list.json"); + let meta_data_file = File::open(&gpl_path) + .with_context(|| format!("Could not open the file {:?}.", gpl_path.display()))?; + let mdata: serde_json::Value = serde_json::from_reader(meta_data_file)?; + + let calling_version = InternalVersionInfo::from_str(version_str)?; + let vd: InternalVersionInfo; + match mdata.get("version_str") { + Some(vs) => match vs.as_str() { + Some(s) => { + vd = InternalVersionInfo::from_str(s)?; + } + None => { + return Err(anyhow!("The version_str field must be a string")); + } + }, + None => { + return Err(anyhow!("The generate_permit_list.json file does not contain a version_str field. 
Please re-run the generate-permit-list step with a newer version of alevin-fry")); + } + }; + + let rc: bool = mdata["gpl_options"]["rc"].as_bool().unwrap(); + + if let Err(es) = calling_version.is_compatible_with(&vd) { + return Err(anyhow!(es)); + } + + if !parent.join("bin_recs.bin").exists() || !parent.join("bin_lens.bin").exists() { + crit!(log, "bin file containing records does not exist"); + // std::process::exit(1); + return Err(anyhow!("execution terminated unexpectedly")); + } + + let bin_count_file = + std::fs::File::open(parent.join("bin_recs.bin")).context("couldn't open file")?; + let bin_rec_counts: Vec = + bincode::deserialize_from(bin_count_file).context("couldn't open bin counts file.")?; + + let bin_len_file = + std::fs::File::open(parent.join("bin_lens.bin")).context("couldn't open file")?; + let bin_lens: Vec = + bincode::deserialize_from(bin_len_file).context("couldn't open bin length file.")?; + + let freq_file = + std::fs::File::open(parent.join("permit_freq.bin")).context("couldn't open file")?; + + // header buffer + let mut rbuf = [0u8; 8]; + + let mut rdr = BufReader::new(&freq_file); + rdr.read_exact(&mut rbuf) + .context("couldn't read freq file header")?; + let freq_file_version = rbuf + .pread::(0) + .context("couldn't read freq file version")?; + // make sure versions match + if freq_file_version > afconst::PERMIT_FILE_VER { + crit!(log, + "The permit_freq.bin file had version {}, but this version of alevin-fry requires version {}", + freq_file_version, afconst::PERMIT_FILE_VER + ); + return Err(anyhow!("execution terminated unexpectedly")); + } + + // read the barcode length + rdr.read_exact(&mut rbuf) + .context("couldn't read freq file buffer")?; + let _bc_len = rbuf + .pread::(0) + .context("couldn't read freq file barcode length")?; + + // read the barcode -> frequency hashmap + let freq_hm: HashMap = + bincode::deserialize_from(rdr).context("couldn't deserialize barcode to frequency map.")?; + let total_to_collate: u64 = freq_hm.values().sum(); + let tsv_map = Vec::from_iter(freq_hm); + sort_with_temp( + input_dir, + rad_dir, + num_threads, + max_records, + bin_rec_counts, + bin_lens, + tsv_map, + total_to_collate, + compress_out, + cmdline, + version_str, + rc, + log, + )?; + Ok(()) +} + +#[allow(clippy::too_many_arguments, clippy::manual_clamp)] +pub fn sort_with_temp( + input_dir: P1, + rad_dir: P2, + num_threads: u32, + max_records: u32, + bin_recs: Vec, + bin_lens: Vec, + tsv_map: Vec<(u64, u64)>, + _total_to_collate: u64, + compress_out: bool, + cmdline: &str, + version: &str, + rc: bool, + log: &slog::Logger, +) -> anyhow::Result<()> +where + P1: Into, + P2: AsRef, +{ + // the number of corrected cells we'll write + let expected_output_chunks = tsv_map.len() as u64; + // the parent input directory + let input_dir = input_dir.into(); + let parent = std::path::Path::new(input_dir.as_path()); + + let n_workers = if num_threads > 1 { + (num_threads - 1) as usize + } else { + 1 + }; + + // open the metadata file and read the json + let meta_data_file = File::open(parent.join("generate_permit_list.json")) + .context("could not open the generate_permit_list.json file.")?; + let mdata: serde_json::Value = serde_json::from_reader(&meta_data_file)?; + + let filter_type = get_filter_type(&mdata, log); + let most_ambig_record = get_most_ambiguous_record(&mdata, log); + let num_chunks = get_num_chunks(&mdata, log)?; + + // log the filter type + info!(log, "filter_type = {:?}", filter_type); + info!( + log, + "sorted bed file {} be compressed", + if 
compress_out { "will" } else { "will not" } + ); + // because : + // https://superuser.com/questions/865710/write-to-newfile-vs-overwriting-performance-issue + + // writing the collate metadata + { + let collate_meta = json!({ + "cmd" : cmdline, + "version_str" : version, + "compressed_output" : compress_out, + }); + let cm_path = parent.join("sort.json"); + + let mut cm_file = + std::fs::File::create(cm_path).context("could not create metadata file.")?; + + let cm_info_string = + serde_json::to_string_pretty(&collate_meta).context("could not format json.")?; + cm_file + .write_all(cm_info_string.as_bytes()) + .context("cannot write to sort.json file")?; + } + + let i_dir = std::path::Path::new(rad_dir.as_ref()); + + if !i_dir.exists() { + crit!( + log, + "the input RAD path {:?} does not exist", + rad_dir.as_ref() + ); + return Err(anyhow!("invalid input")); + } + + let input_rad_path = i_dir.join("map.rad"); + let i_file = File::open(&input_rad_path).context("couldn't open input RAD file")?; + let mut br = BufReader::new(i_file); + + let hdr = RadHeader::from_bytes(&mut br)?; + + // the exact position at the end of the header, + // precisely sizeof(u64) bytes beyond the num_chunks field. + let end_header_pos = br.get_ref().stream_position().unwrap() - (br.buffer().len() as u64); + let ref_names = hdr.ref_names.clone(); + info!( + log, + "paired : {:?}, ref_count : {}, num_chunks : {}", + hdr.is_paired != 0, + hdr.ref_count.to_formatted_string(&Locale::en), + num_chunks.to_formatted_string(&Locale::en) + ); + + // file-level + let fl_tags = rad_types::TagSection::from_bytes(&mut br)?; + info!(log, "read {:?} file-level tags", fl_tags.tags.len()); + // read-level + let rl_tags = rad_types::TagSection::from_bytes(&mut br)?; + info!(log, "read {:?} read-level tags", rl_tags.tags.len()); + // alignment-level + let al_tags = rad_types::TagSection::from_bytes(&mut br)?; + info!(log, "read {:?} alignment-level tags", al_tags.tags.len()); + + // create the prelude and rebind the variables we need + let prelude = RadPrelude::from_header_and_tag_sections(hdr, fl_tags, rl_tags, al_tags); + let rl_tags = &prelude.read_tags; + + let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br); + info!(log, "File-level tag values {:?}", file_tag_map); + + let binding = file_tag_map?; + let barcode_tag = binding.get("cblen").expect("tag map must contain cblen"); + let barcode_len: u16 = barcode_tag.try_into()?; + + let bct = rl_tags.tags[0].typeid; + + // the exact position at the end of the header + file tags + let pos = br.get_ref().stream_position().unwrap() - (br.buffer().len() as u64); + + // copy the header + { + // we want to copy up to the end of the header + // minus the num chunks (sizeof u64), and then + // write the actual number of chunks we expect. + let chunk_bytes = std::mem::size_of::() as u64; + let take_pos = end_header_pos - chunk_bytes; + + // This temporary file pointer and buffer will be dropped + // at the end of this block (scope). 
+ let mut rfile = File::open(&input_rad_path).context("Couldn't open input RAD file")?; + let mut hdr_buf = Cursor::new(vec![0u8; pos as usize]); + + rfile + .read_exact(hdr_buf.get_mut()) + .context("couldn't read input file header")?; + hdr_buf.set_position(take_pos); + hdr_buf + .write_all(&expected_output_chunks.to_le_bytes()) + .context("couldn't write num_chunks")?; + hdr_buf.set_position(0); + + // compress the header buffer to a compressed buffer + if compress_out { + let mut compressed_buf = + snap::write::FrameEncoder::new(Cursor::new(Vec::::with_capacity(pos as usize))); + compressed_buf + .write_all(hdr_buf.get_ref()) + .context("could not compress the output header.")?; + hdr_buf = compressed_buf + .into_inner() + .context("couldn't unwrap the FrameEncoder.")?; + hdr_buf.set_position(0); + } + } + + // get the correction map + let cmfile = std::fs::File::open(parent.join("permit_map.bin")) + .context("couldn't open output permit_map.bin file")?; + let correct_map: Arc> = Arc::new(bincode::deserialize_from(&cmfile).unwrap()); + + // NOTE: the assumption of where the unmapped file will be + // should be robustified + let unmapped_file = i_dir.join("unmapped_bc_count.bin"); + correct_unmapped_counts(&correct_map, &unmapped_file, parent); + + info!( + log, + "deserialized correction map of length : {}", + correct_map.len().to_formatted_string(&Locale::en) + ); + + let cc = chunk::ChunkConfigAtac { + num_chunks, + bc_type: libradicl::rad_types::encode_type_tag(bct).expect("valid barcode tag type"), + }; + + // TODO: see if we can do this without the Arc + let mut output_cache = Arc::new(HashMap::>::new()); + + // max_records is the max size of each intermediate file + // let mut total_allocated_records = 0; + let mut allocated_records = 0; + let mut temp_buckets = vec![( + 0, + 0, + Arc::new(libradicl::TempBucket::from_id_and_parent(0, parent)), + )]; + + let max_records_per_thread = (max_records / n_workers as u32) + 1; + // The tsv_map tells us, for each "true" barcode + // how many records belong to it. We can scan this information + // to determine what true barcodes we will keep in memory. + let mut num_bucket_chunks = 0u32; + { + let moutput_cache = Arc::make_mut(&mut output_cache); + for (i, nrec) in bin_recs.iter().enumerate() { + // corrected barcode points to the bucket + // file. + moutput_cache.insert(i as u64, temp_buckets.last().unwrap().2.clone()); + allocated_records += nrec; + num_bucket_chunks += 1; + if allocated_records >= (max_records_per_thread as u64) { + temp_buckets.last_mut().unwrap().0 = num_bucket_chunks; + temp_buckets.last_mut().unwrap().1 = allocated_records as u32; + let tn = temp_buckets.len() as u32; + temp_buckets.push(( + 0, + 0, + Arc::new(libradicl::TempBucket::from_id_and_parent(tn, parent)), + )); + // total_allocated_records += allocated_records; + allocated_records = 0; + num_bucket_chunks = 0; + } + } + } + if num_bucket_chunks > 0 { + temp_buckets.last_mut().unwrap().0 = num_bucket_chunks; + temp_buckets.last_mut().unwrap().1 = allocated_records as u32; + } + // total_allocated_records += allocated_records; + info!(log, "Generated {} temporary buckets.", temp_buckets.len()); + + let sty = ProgressStyle::default_bar() + .template( + "{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos:>7}/{len:7} {msg}", + ) + .expect("ProgressStyle template was invalid") + .progress_chars("╢▌▌░╟"); + + let pbar_inner = ProgressBar::with_draw_target( + Some(cc.num_chunks), + ProgressDrawTarget::stderr_with_hz(5u8), // update at most 5 times/sec. 
+    );
+
+    pbar_inner.set_style(sty.clone());
+    pbar_inner.tick();
+
+    // create a thread-safe queue based on the number of worker threads
+    let q = Arc::new(ArrayQueue::<(usize, Vec<u8>)>::new(4 * n_workers));
+
+    // the number of chunks left to process
+    let chunks_to_process = Arc::new(AtomicUsize::new(cc.num_chunks as usize));
+
+    let mut thread_handles: Vec<thread::JoinHandle<u64>> = Vec::with_capacity(n_workers);
+
+    let min_rec_len = 24usize; // smallest size an individual record can be loaded in memory
+    let max_rec = max_records as usize;
+    let num_buckets = temp_buckets.len();
+    let num_threads = n_workers;
+    let loc_buffer_size = (min_rec_len + (most_ambig_record * 4_usize) - 4_usize).max(
+        (1000_usize.max((min_rec_len * max_rec) / (num_buckets * num_threads))).min(262_144_usize),
+    ); //131072_usize);
+
+    // for each worker, spawn off a thread
+    for _worker in 0..n_workers {
+        // each thread will need to access the work queue
+        let in_q = q.clone();
+        // the output cache and correction map
+        let oc = output_cache.clone();
+        let correct_map = correct_map.clone();
+        // the number of chunks remaining to be processed
+        let chunks_remaining = chunks_to_process.clone();
+        // and knowledge of the BC type
+        let bc_type = rad_types::decode_int_type_tag(cc.bc_type).expect("unknown barcode type id.");
+
+        let nbuckets = temp_buckets.len();
+        let loc_temp_buckets = temp_buckets.clone();
+        let loc_bin_lens = bin_lens.clone();
+
+        //let owrite = owriter.clone();
+        // now, make the worker thread
+        let handle = std::thread::spawn(move || {
+            // old code
+            // let mut local_buffers = vec![Cursor::new(vec![0u8; loc_buffer_size]); nbuckets];
+
+            // new approach (how much does this extra complexity matter?)
+            // to avoid having a vector of cursors, where each cursor points to
+            // a completely different vector (thus scattering the memory of threads
+            // and incurring the extra overhead for the capacity of the inner
+            // vectors), we will have one backing chunk of memory.
+            // NOTE: once stabilized, maybe using as_chunks_mut here
+            // will be simpler (https://doc.rust-lang.org/std/primitive.slice.html#method.as_chunks_mut)
+
+            // the memory that will back our temporary buffers
+            let mut local_buffer_backing = vec![0u8; loc_buffer_size * nbuckets];
+            // the vector of cursors we will use to write into our temporary buffers
+            let mut local_buffers: Vec<Cursor<&mut [u8]>> = Vec::with_capacity(nbuckets);
+            // The below is a bit tricky in rust, but we basically break off each mutable slice
+            // piece by piece. Since `split_at_mut(n)` returns the slices [0,n), [n,end) we
+            // expect to chop off a first part of size `loc_buffer_size` a total of `nbuckets`
+            // times.
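+            // For intuition, with hypothetical sizes loc_buffer_size = 4 and
+            // nbuckets = 2, backing = [b0 b1 b2 b3 b4 b5 b6 b7]:
+            //   split_at_mut(4) -> (&mut backing[0..4], &mut backing[4..8])
+            // The first piece becomes bucket 0's cursor and we repeat on the rest.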
+ let mut tslice = local_buffer_backing.as_mut_slice(); + for _ in 0..nbuckets { + let (first, rest) = tslice.split_at_mut(loc_buffer_size); + //let brange = (bn*loc_buffer_size..(bn+1)*loc_buffer_size); + local_buffers.push(Cursor::new(first)); + tslice = rest; + } + + let size_range = 100000; + let closure_get_bin_id = |pos: u32, ref_id: usize| { + atac_utils::get_bin_id(pos, ref_id, size_range, &loc_bin_lens.clone()) + }; + + // pop from the work queue until everything is + // processed + while chunks_remaining.load(Ordering::SeqCst) > 0 { + if let Some((_chunk_num, buf)) = in_q.pop() { + chunks_remaining.fetch_sub(1, Ordering::SeqCst); + let mut nbr = BufReader::new(&buf[..]); + libradicl::dump_corrected_cb_chunk_to_temp_file_atac( + &mut nbr, + &bc_type, + &correct_map, + &oc, + &mut local_buffers, + loc_buffer_size, + CollateKey::Pos(Box::new(closure_get_bin_id)), + ); + } + } + + // empty any remaining local buffers + for (bucket_id, lb) in local_buffers.iter().enumerate() { + let len = lb.position() as usize; + if len > 0 { + let mut filebuf = loc_temp_buckets[bucket_id].2.bucket_writer.lock().unwrap(); + filebuf.write_all(&lb.get_ref()[0..len]).unwrap(); + } + } + // return something more meaningful + 0 + }); + + thread_handles.push(handle); + } // for each worker + + // read each chunk + pbar_inner.reset(); + // let pb_msg = format!( + // "processing {} / {} total records", + // total_allocated_records, total_to_collate + // ); + // pbar_inner.set_message(pb_msg); + + // read chunks from the input file and pass them to the + // worker threads. + let mut buf = vec![0u8; 65536]; + for cell_num in 0..(cc.num_chunks as usize) { + let (nbytes_chunk, nrec_chunk) = chunk::Chunk::::read_header(&mut br); + buf.resize(nbytes_chunk as usize, 0); + buf.pwrite::(nbytes_chunk, 0)?; + buf.pwrite::(nrec_chunk, 4)?; + br.read_exact(&mut buf[8..]).unwrap(); + + let mut bclone = (cell_num, buf.clone()); + // keep trying until we can push this payload + while let Err(t) = q.push(bclone) { + bclone = t; + // no point trying to push if the queue is full + while q.is_full() {} + } + pbar_inner.inc(1); + } + pbar_inner.finish(); + + // wait for the worker threads to finish + // let mut num_output_chunks = 0u64; + for h in thread_handles.drain(0..) { + match h.join() { + Ok(_) => {} + Err(_e) => { + info!(log, "thread panicked"); + } + } + } + pbar_inner.finish_with_message("partitioned records into temporary files."); + drop(q); + + // At this point, we are done with the "scatter" + // phase of writing the records to the corresponding + // intermediate files. Now, we'll begin the gather + // phase of collating the temporary files and merging + // them into the final output file. 
+ + for temp_bucket in temp_buckets.iter() { + // make sure we flush each temp bucket + temp_bucket + .2 + .bucket_writer + .lock() + .unwrap() + .flush() + .context("could not flush temporary output file!")?; + // a sanity check that we have the correct number of records + // and the expected number of bytes in each file + // let expected = temp_bucket.1; + // let observed = temp_bucket.2.num_records_written.load(Ordering::SeqCst); + // assert_eq!(expected, observed); + + // let md = std::fs::metadata(parent.join(format!("bucket_{}.tmp", i)))?; + // let expected_bytes = temp_bucket.2.num_bytes_written.load(Ordering::SeqCst); + // let observed_bytes = md.len(); + // assert_eq!(expected_bytes, observed_bytes); + } + + //std::process::exit(1); + + // to hold the temp buckets threads will process + let slack = (n_workers / 2).max(1_usize); + let temp_bucket_queue_size = slack + n_workers; + let fq = Arc::new(ArrayQueue::<( + u32, + u32, + std::sync::Arc, + )>::new(temp_bucket_queue_size)); + // the number of cells left to process + let buckets_to_process = Arc::new(AtomicUsize::new(temp_buckets.len())); + + let pbar_gather = ProgressBar::new(temp_buckets.len() as u64); + pbar_gather.set_style(sty); + pbar_gather.tick(); + + // // for each worker, spawn off a thread + for _worker in 0..n_workers { + // each thread will need to access the work queue + let in_q = fq.clone(); + // let barcode_len = barcode_len.clone(); + // the output cache and correction map + // let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64); + // let mut cmap = HashMap::::with_hasher(s); + // alternative strategy + // let mut cmap = HashMap::::with_hasher(s); + + // the number of chunks remaining to be processed + let buckets_remaining = buckets_to_process.clone(); + // and knowledge of the BC types + let bc_type = + rad_types::decode_int_type_tag(cc.bc_type).context("unknown barcode type id.")?; + + // have access to the input directory + let input_dir: PathBuf = input_dir.clone(); + // the output file + + let r_names = ref_names.clone(); + // and the progress bar + let pbar_gather = pbar_gather.clone(); + + // now, make the worker threads + let handle = std::thread::spawn(move || { + let local_chunks = 0u64; + let parent = std::path::Path::new(&input_dir); + // pop from the work queue until everything is + // processed + while buckets_remaining.load(Ordering::SeqCst) > 0 { + if let Some(temp_bucket) = in_q.pop() { + buckets_remaining.fetch_sub(1, Ordering::SeqCst); + // cmap.clear(); + + let fname = parent.join(format!("bucket_{}.tmp", temp_bucket.2.bucket_id)); + // create a new handle for reading + let tfile = std::fs::File::open(&fname).expect("couldn't open temporary file."); + let mut treader = BufReader::new(tfile); + + let _ = sort_temp_bucket( + &mut treader, + &bc_type, + barcode_len, + rc, + &r_names, + temp_bucket.2.num_records_written.load(Ordering::SeqCst), + parent, + temp_bucket.2.bucket_id, + compress_out, + ); + + // we don't need the file or reader anymore + drop(treader); + std::fs::remove_file(fname).expect("could not delete temporary file."); + + pbar_gather.inc(1); + } + } + local_chunks + }); + thread_handles.push(handle); + } // for each worker + + // push the temporary buckets onto the work queue to be dispatched + // by the worker threads. 
+ for temp_bucket in &temp_buckets { + let mut bclone = temp_bucket.clone(); + // keep trying until we can push this payload + while let Err(t) = fq.push(bclone) { + bclone = t; + // no point trying to push if the queue is full + while fq.is_full() {} + } + } + + // wait for all of the workers to finish + // let mut num_output_chunks = 0u64; + for h in thread_handles.drain(0..) { + match h.join() { + Ok(_c) => { + // num_output_chunks += c; + } + Err(_e) => { + info!(log, "thread panicked"); + } + } + } + pbar_gather.finish_with_message("gathered all temp files."); + + // make sure we wrote the same number of records that our + // file suggested we should. + // assert_eq!(total_allocated_records, total_to_collate); + + // info!( + // log, + // "writing num output chunks ({}) to header", + // num_output_chunks.to_formatted_string(&Locale::en) + // ); + + info!( + log, + "expected number of output chunks {}", + expected_output_chunks.to_formatted_string(&Locale::en) + ); + if compress_out { + let out_bed_file = File::create(parent.join("map.bed.gz")).unwrap(); + let mut encoder = GzEncoder::new(out_bed_file, Compression::default()); + for i in 0..temp_buckets.len() { + let temp_bed_name = parent.join(format!("{}.bed.gz", i)); + let mut input = File::open(&temp_bed_name)?; + io::copy(&mut input, &mut encoder)?; + std::fs::remove_file(&temp_bed_name)?; + } + encoder.finish()?; + } else { + let mut out_bed_file = File::create(parent.join("map.bed")).unwrap(); + for i in 0..temp_buckets.len() { + let temp_bed_name = parent.join(format!("{}.bed", i)); + let mut input = File::open(&temp_bed_name)?; + io::copy(&mut input, &mut out_bed_file)?; + std::fs::remove_file(&temp_bed_name)?; + } + } + // assert_eq!( + // expected_output_chunks, + // num_output_chunks, + // "expected to write {} chunks but wrote {}", + // expected_output_chunks.to_formatted_string(&Locale::en), + // num_output_chunks.to_formatted_string(&Locale::en), + // ); + + info!(log, "merging temp files"); + Ok(()) +} diff --git a/src/atac/utils.rs b/src/atac/utils.rs new file mode 100644 index 0000000..3a78466 --- /dev/null +++ b/src/atac/utils.rs @@ -0,0 +1,18 @@ +use needletail::bitkmer::*; + +pub fn get_bin_id(pos: u32, ref_id: usize, size_range: u32, blens: &[u64]) -> usize { + let bid = pos / size_range; + let ind: usize = (blens[ref_id] + bid as u64) as usize; + ind +} + +pub fn get_bc_string(kmerseq: &BitKmerSeq, reverse_barcode: bool, bc_len: u8) -> String { + let kmseq = *kmerseq; + let mut km: BitKmer = (kmseq, bc_len); + if reverse_barcode { + km = needletail::bitkmer::reverse_complement(km); + } + let bytes = needletail::bitkmer::bitmer_to_bytes(km); + let seq: String = String::from_utf8(bytes).expect("Invalid barcode"); + seq +} diff --git a/src/cellfilter.rs b/src/cellfilter.rs index c85db15..c103bf5 100644 --- a/src/cellfilter.rs +++ b/src/cellfilter.rs @@ -12,7 +12,7 @@ use dashmap::DashMap; use slog::crit; use slog::{info, warn}; -use crate::diagnostics; +//use crate::diagnostics; use crate::prog_opts::GenPermitListOpts; use crate::utils as afutils; #[allow(unused_imports)] @@ -816,7 +816,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result max_ambiguity_read.to_formatted_string(&Locale::en) ); let valid_thresh = 0.3f64; - match diagnostics::likely_valid_permit_list( + /*match diagnostics::likely_valid_permit_list( unmatched_bc.len(), num_reads, valid_thresh, @@ -830,6 +830,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result warn!(log, "{:?}", e); } } + */ 
process_unfiltered( hmu.unwrap(), diff --git a/src/convert.rs b/src/convert.rs index 5d4c83e..25866c3 100644 --- a/src/convert.rs +++ b/src/convert.rs @@ -402,9 +402,7 @@ where let flags = rec.flags()?; let is_reverse = flags.is_reverse_complemented(); - let qname_str = str::from_utf8(rec.name().expect("valid name").as_bytes()) - .unwrap() - .to_owned(); + let qname_str = rec.name().expect("valid name").to_string().to_owned(); let qname = qname_str; let mut tid = rec.reference_sequence_id(&hdrv).unwrap().unwrap() as u32; if qname == old_qname { diff --git a/src/em.rs b/src/em.rs index 526dc48..aa810ef 100644 --- a/src/em.rs +++ b/src/em.rs @@ -11,6 +11,7 @@ use crate::eq_class::IndexedEqList; #[allow(unused_imports)] use ahash::{AHasher, RandomState}; +use nalgebra::base::OVector; use rand::{thread_rng, Rng}; #[allow(unused_imports)] use slog::info; @@ -452,7 +453,7 @@ pub(crate) fn run_bootstrap_subset( .iter() .map(|x| (x.1 as f64) / (total_fragments as f64)) .collect(); - let dist = Multinomial::new(&eq_counts[..], total_fragments as u64).unwrap(); + let dist = Multinomial::new(eq_counts, total_fragments as u64).unwrap(); // store bootstraps let mut bootstrap_counts = Vec::with_capacity(cell_data.len()); @@ -469,7 +470,7 @@ pub(crate) fn run_bootstrap_subset( // let mut old_resampled_counts = Vec::new(); for _bs_num in 0..num_bootstraps { // resample from multinomial - let resampled_counts = thread_rng().sample(dist.clone()); + let resampled_counts: OVector = thread_rng().sample(dist.clone()); for (idx, (eq_id, _orig_count)) in cell_data.iter().enumerate() { bootstrap_counts.push((*eq_id, resampled_counts[idx].round() as u32)); } @@ -611,7 +612,7 @@ pub fn run_bootstrap_old( let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64); let mut eqclass_bootstrap: HashMap, u32, ahash::RandomState> = HashMap::with_hasher(s); // define a multinomial - let dist = Multinomial::new(&eq_counts, total_fragments).unwrap(); + let dist = Multinomial::new(eq_counts, total_fragments).unwrap(); // store bootstraps let mut bootstraps = Vec::new(); @@ -620,7 +621,7 @@ pub fn run_bootstrap_old( // let mut old_resampled_counts = Vec::new(); for _bs_num in 0..num_bootstraps { // resample from multinomial - let resampled_counts = thread_rng().sample(dist.clone()); + let resampled_counts: OVector = thread_rng().sample(dist.clone()); for (eq_id, labels) in &eqclasses_serialize { eqclass_bootstrap.insert(labels.to_vec(), resampled_counts[*eq_id].round() as u32); diff --git a/src/lib.rs b/src/lib.rs index 0e75560..6864b2e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,8 @@ pub mod cmd_parse_utils; pub mod collate; pub mod constants; pub mod convert; -pub mod diagnostics; +//pub mod diagnostics; +pub mod atac; pub mod em; pub mod eq_class; pub mod infer; diff --git a/src/main.rs b/src/main.rs index 15ddfb4..5dd761f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,7 +15,7 @@ use csv::ErrorKind; use itertools::Itertools; use mimalloc::MiMalloc; use rand::Rng; -use slog::{crit, o, warn, Drain}; +use slog::{crit, info, o, warn, Drain}; use std::path::PathBuf; use alevin_fry::cellfilter::{generate_permit_list, CellFilterMethod}; @@ -44,6 +44,101 @@ fn gen_random_kmer(k: usize) -> String { s } +fn atac_sub_commands() -> Command { + let num_hardware_threads = num_cpus::get() as u32; + let max_num_threads: String = (num_cpus::get() as u32).to_string(); + let max_num_collate_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); + let max_num_gpl_threads: String = 
(8_u32.min(num_hardware_threads).max(2_u32)).to_string(); + let max_num_sort_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); + + let crate_authors = crate_authors!("\n"); + let version = crate_version!(); + + let gen_app = Command::new("generate-permit-list") + .about("Generate a permit list of barcodes from a whitelist file") + .version(version) + .author(crate_authors) + .arg(arg!(-i --input "input directory containing the map.rad file") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-o --"output-dir" "output directory") + .required(true) + .value_parser(value_parser!(PathBuf)) + ) + .arg( + arg!(-u --"unfiltered-pl" "uses an unfiltered external permit list") + .value_parser(pathbuf_file_exists_validator) + ) + .group(ArgGroup::new("filter-method") + .args(["unfiltered-pl"]) + .required(true) + ) + .arg( + arg!(-m --"min-reads" "minimum read count threshold; only used with --unfiltered-pl") + .value_parser(value_parser!(usize)) + .default_value("10")) + .arg( + arg!(-r --"rev-comp" "reverse complement the barcode") + .value_parser(clap::builder::BoolishValueParser::new()) + .default_value("true") + ); + + let collate_app = Command::new("collate") + .about("Collate a RAD file with corrected cell barcode") + .version(version) + .author(crate_authors) + .arg(arg!(-i --"input-dir" "output directory made by generate-permit-list") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-r --"rad-dir" "the directory containing the map.rad file which will be collated (typically produced as an output of the mapping)") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_collate_threads.clone())) + .arg(arg!(-c --compress "compress the output collated RAD file")) + .arg(arg!(-m --"max-records" "the maximum number of read records to keep in memory at once") + .value_parser(value_parser!(u32)) + .default_value("30000000")); + + let sort_app = Command::new("sort") + .about("Produce coordinate sorted bed file") + .version(version) + .author(crate_authors) + .arg(arg!(-i --"input-dir" "output directory made by generate-permit-list") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-r --"rad-dir" "the directory containing the map.rad file which will be sorted (typically produced as an output of the mapping)") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_sort_threads)) + .arg(arg!(-c --compress "compress the output of the sorted RAD file")) + .arg(arg!(-m --"max-records" "the maximum number of read records to keep in memory at once") + .value_parser(value_parser!(u32)) + .default_value("30000000")); + + let deduplicate_app = Command::new("deduplicate") + .about("Deduplicate the RAD file and output a BED file") + .version(version) + .author(crate_authors) + .arg(arg!(-i --"input-dir" "input directory made by generate-permit-list that also contains the output of collate") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_threads)) + .arg( + arg!(-r --"rev-comp" "reverse complement") + .value_parser(clap::builder::BoolishValueParser::new()) + 
.default_value("true") + ); + + Command::new("atac") + .about("Deduplicate the RAD file and output a BED file") + .version(version) + .author(crate_authors) + .subcommand(gen_app) + .subcommand(sort_app) + .subcommand(collate_app) + .subcommand(deduplicate_app) +} + #[allow(clippy::manual_clamp)] fn main() -> anyhow::Result<()> { let num_hardware_threads = num_cpus::get() as u32; @@ -213,6 +308,8 @@ fn main() -> anyhow::Result<()> { .arg(arg!(--"use-mtx" "flag for writing output matrix in matrix market format (default)")) .arg(arg!(--"use-eds" "flag for writing output matrix in EDS format").conflicts_with("use-mtx")); + let atac_app = atac_sub_commands(); + let opts = Command::new("alevin-fry") .subcommand_required(true) .arg_required_else_help(true) @@ -225,6 +322,7 @@ fn main() -> anyhow::Result<()> { .subcommand(infer_app) .subcommand(convert_app) .subcommand(view_app) + .subcommand(atac_app) .get_matches(); let decorator = slog_term::TermDecorator::new().build(); @@ -236,9 +334,14 @@ fn main() -> anyhow::Result<()> { .build() .fuse(); let drain = slog_async::Async::new(drain).build().fuse(); - let log = slog::Logger::root(drain, o!()); + use alevin_fry::atac; + if let Some(t) = opts.subcommand_matches("atac") { + info!(&log, "scATAC-seq mode"); + atac::run::run(t, VERSION, &cmdline, &log)?; + } + // You can handle information about subcommands by requesting their matches by name // (as below), requesting just the name used, or both at the same time if let Some(t) = opts.subcommand_matches("generate-permit-list") { From 7ef1bf2110c17ba30c8551ef720450ad7eb8c7fc Mon Sep 17 00:00:00 2001 From: rob-p Date: Tue, 3 Dec 2024 16:16:34 -0500 Subject: [PATCH 04/16] more atac cleanup --- src/atac/deduplicate.rs | 9 +- src/atac/run.rs | 2 +- src/cellfilter.rs | 2 +- src/main.rs | 190 ++++++++++++++++++++-------------------- 4 files changed, 101 insertions(+), 102 deletions(-) diff --git a/src/atac/deduplicate.rs b/src/atac/deduplicate.rs index ff4963d..0e1e26b 100644 --- a/src/atac/deduplicate.rs +++ b/src/atac/deduplicate.rs @@ -1,19 +1,18 @@ use crate::atac::prog_opts::DeduplicateOpts; +use crate::atac::sort::HitInfo; +use crate::atac::utils as atac_utils; use anyhow::Context; use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; +use itertools::Itertools; use libradicl::header::RadPrelude; use libradicl::record::AtacSeqReadRecord; +use num_format::Locale; use num_format::ToFormattedString; use slog::info; use std::fs::File; use std::io::{BufRead, BufReader, Write}; use std::sync::{atomic::AtomicU32, atomic::Ordering, Arc, Mutex}; use std::thread; -pub type MetaChunk = (usize, usize, u32, u32, Vec); -use crate::atac::sort::HitInfo; -use crate::atac::utils as atac_utils; -use itertools::Itertools; -use num_format::Locale; pub fn write_bed( bd_writer_lock: &Arc>, diff --git a/src/atac/run.rs b/src/atac/run.rs index 0645c07..af62a2a 100644 --- a/src/atac/run.rs +++ b/src/atac/run.rs @@ -6,7 +6,7 @@ use crate::atac::prog_opts::{DeduplicateOpts, GenPermitListOpts}; use crate::atac::sort::sort; use anyhow::bail; use clap::ArgMatches; -use slog::{crit, info, o, warn, Logger}; +use slog::{crit, warn, Logger}; use std::path::PathBuf; pub fn run(opts: &ArgMatches, version: &str, cmdline: &str, log: &Logger) -> anyhow::Result<()> { diff --git a/src/cellfilter.rs b/src/cellfilter.rs index c103bf5..f25cd8f 100644 --- a/src/cellfilter.rs +++ b/src/cellfilter.rs @@ -815,7 +815,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result 
num_chunks.expect("nonzero").to_formatted_string(&Locale::en), max_ambiguity_read.to_formatted_string(&Locale::en) ); - let valid_thresh = 0.3f64; + let _valid_thresh = 0.3f64; /*match diagnostics::likely_valid_permit_list( unmatched_bc.len(), num_reads, diff --git a/src/main.rs b/src/main.rs index 5dd761f..38688f0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -44,101 +44,6 @@ fn gen_random_kmer(k: usize) -> String { s } -fn atac_sub_commands() -> Command { - let num_hardware_threads = num_cpus::get() as u32; - let max_num_threads: String = (num_cpus::get() as u32).to_string(); - let max_num_collate_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); - let max_num_gpl_threads: String = (8_u32.min(num_hardware_threads).max(2_u32)).to_string(); - let max_num_sort_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); - - let crate_authors = crate_authors!("\n"); - let version = crate_version!(); - - let gen_app = Command::new("generate-permit-list") - .about("Generate a permit list of barcodes from a whitelist file") - .version(version) - .author(crate_authors) - .arg(arg!(-i --input "input directory containing the map.rad file") - .required(true) - .value_parser(pathbuf_directory_exists_validator)) - .arg(arg!(-o --"output-dir" "output directory") - .required(true) - .value_parser(value_parser!(PathBuf)) - ) - .arg( - arg!(-u --"unfiltered-pl" "uses an unfiltered external permit list") - .value_parser(pathbuf_file_exists_validator) - ) - .group(ArgGroup::new("filter-method") - .args(["unfiltered-pl"]) - .required(true) - ) - .arg( - arg!(-m --"min-reads" "minimum read count threshold; only used with --unfiltered-pl") - .value_parser(value_parser!(usize)) - .default_value("10")) - .arg( - arg!(-r --"rev-comp" "reverse complement the barcode") - .value_parser(clap::builder::BoolishValueParser::new()) - .default_value("true") - ); - - let collate_app = Command::new("collate") - .about("Collate a RAD file with corrected cell barcode") - .version(version) - .author(crate_authors) - .arg(arg!(-i --"input-dir" "output directory made by generate-permit-list") - .required(true) - .value_parser(pathbuf_directory_exists_validator)) - .arg(arg!(-r --"rad-dir" "the directory containing the map.rad file which will be collated (typically produced as an output of the mapping)") - .required(true) - .value_parser(pathbuf_directory_exists_validator)) - .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_collate_threads.clone())) - .arg(arg!(-c --compress "compress the output collated RAD file")) - .arg(arg!(-m --"max-records" "the maximum number of read records to keep in memory at once") - .value_parser(value_parser!(u32)) - .default_value("30000000")); - - let sort_app = Command::new("sort") - .about("Produce coordinate sorted bed file") - .version(version) - .author(crate_authors) - .arg(arg!(-i --"input-dir" "output directory made by generate-permit-list") - .required(true) - .value_parser(pathbuf_directory_exists_validator)) - .arg(arg!(-r --"rad-dir" "the directory containing the map.rad file which will be sorted (typically produced as an output of the mapping)") - .required(true) - .value_parser(pathbuf_directory_exists_validator)) - .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_sort_threads)) - .arg(arg!(-c --compress "compress the output of the sorted RAD file")) - .arg(arg!(-m --"max-records" "the maximum number 
of read records to keep in memory at once") - .value_parser(value_parser!(u32)) - .default_value("30000000")); - - let deduplicate_app = Command::new("deduplicate") - .about("Deduplicate the RAD file and output a BED file") - .version(version) - .author(crate_authors) - .arg(arg!(-i --"input-dir" "input directory made by generate-permit-list that also contains the output of collate") - .required(true) - .value_parser(pathbuf_directory_exists_validator)) - .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_threads)) - .arg( - arg!(-r --"rev-comp" "reverse complement") - .value_parser(clap::builder::BoolishValueParser::new()) - .default_value("true") - ); - - Command::new("atac") - .about("Deduplicate the RAD file and output a BED file") - .version(version) - .author(crate_authors) - .subcommand(gen_app) - .subcommand(sort_app) - .subcommand(collate_app) - .subcommand(deduplicate_app) -} - #[allow(clippy::manual_clamp)] fn main() -> anyhow::Result<()> { let num_hardware_threads = num_cpus::get() as u32; @@ -708,3 +613,98 @@ fn main() -> anyhow::Result<()> { } Ok(()) } + +fn atac_sub_commands() -> Command { + let num_hardware_threads = num_cpus::get() as u32; + let max_num_threads: String = (num_cpus::get() as u32).to_string(); + let max_num_collate_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); + let _max_num_gpl_threads: String = (8_u32.min(num_hardware_threads).max(2_u32)).to_string(); + let max_num_sort_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); + + let crate_authors = crate_authors!("\n"); + let version = crate_version!(); + + let gen_app = Command::new("generate-permit-list") + .about("Generate a permit list of barcodes from a whitelist file") + .version(version) + .author(crate_authors) + .arg(arg!(-i --input "input directory containing the map.rad file") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-o --"output-dir" "output directory") + .required(true) + .value_parser(value_parser!(PathBuf)) + ) + .arg( + arg!(-u --"unfiltered-pl" "uses an unfiltered external permit list") + .value_parser(pathbuf_file_exists_validator) + ) + .group(ArgGroup::new("filter-method") + .args(["unfiltered-pl"]) + .required(true) + ) + .arg( + arg!(-m --"min-reads" "minimum read count threshold; only used with --unfiltered-pl") + .value_parser(value_parser!(usize)) + .default_value("10")) + .arg( + arg!(-r --"rev-comp" "reverse complement the barcode") + .value_parser(clap::builder::BoolishValueParser::new()) + .default_value("true") + ); + + let collate_app = Command::new("collate") + .about("Collate a RAD file with corrected cell barcode") + .version(version) + .author(crate_authors) + .arg(arg!(-i --"input-dir" "output directory made by generate-permit-list") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-r --"rad-dir" "the directory containing the map.rad file which will be collated (typically produced as an output of the mapping)") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_collate_threads.clone())) + .arg(arg!(-c --compress "compress the output collated RAD file")) + .arg(arg!(-m --"max-records" "the maximum number of read records to keep in memory at once") + .value_parser(value_parser!(u32)) + .default_value("30000000")); + + let sort_app = 
Command::new("sort") + .about("Produce coordinate sorted bed file") + .version(version) + .author(crate_authors) + .arg(arg!(-i --"input-dir" "output directory made by generate-permit-list") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-r --"rad-dir" "the directory containing the map.rad file which will be sorted (typically produced as an output of the mapping)") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_sort_threads)) + .arg(arg!(-c --compress "compress the output of the sorted RAD file")) + .arg(arg!(-m --"max-records" "the maximum number of read records to keep in memory at once") + .value_parser(value_parser!(u32)) + .default_value("30000000")); + + let deduplicate_app = Command::new("deduplicate") + .about("Deduplicate the RAD file and output a BED file") + .version(version) + .author(crate_authors) + .arg(arg!(-i --"input-dir" "input directory made by generate-permit-list that also contains the output of collate") + .required(true) + .value_parser(pathbuf_directory_exists_validator)) + .arg(arg!(-t --threads "number of threads to use for processing").value_parser(value_parser!(u32)).default_value(max_num_threads)) + .arg( + arg!(-r --"rev-comp" "reverse complement") + .value_parser(clap::builder::BoolishValueParser::new()) + .default_value("true") + ); + + Command::new("atac") + .about("Deduplicate the RAD file and output a BED file") + .version(version) + .author(crate_authors) + .subcommand(gen_app) + .subcommand(sort_app) + .subcommand(collate_app) + .subcommand(deduplicate_app) +} From ad704b72491345614bff734c2d6b3e4d03ee5333 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 3 Dec 2024 21:40:59 -0500 Subject: [PATCH 05/16] message --- src/cellfilter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cellfilter.rs b/src/cellfilter.rs index c85db15..215dae7 100644 --- a/src/cellfilter.rs +++ b/src/cellfilter.rs @@ -789,7 +789,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result unmatched_bc.extend_from_slice(&ubc); max_ambiguity_read = max_ambiguity_read.max(mar); } - pbar.finish_with_message(format!("finished parsing RAD file\n",)); + pbar.finish_with_message("finished parsing RAD file\n"); // return the hash map we no longer need std::sync::Arc::>::into_inner(hmu) }); @@ -884,11 +884,11 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result num_orientation_compat_reads += nocr; max_ambiguity_read = max_ambiguity_read.max(mar); } + pbar.finish_with_message("finished parsing RAD file\n"); // return the hash map we no longer need Arc::>::into_inner(hm) .expect("unique reference to DashMap") }); - pbar.finish_with_message(format!("finished parsing RAD file\n",)); info!( log, "observed {} reads in {} chunks --- max ambiguity read occurs in {} refs", From 8a2b189c7d75c2c661c9da0f7438773c668d73a5 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Tue, 3 Dec 2024 21:41:43 -0500 Subject: [PATCH 06/16] add back diagnostics --- src/cellfilter.rs | 7 +++---- src/diagnostics.rs | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 src/diagnostics.rs diff --git a/src/cellfilter.rs b/src/cellfilter.rs index 5922285..215dae7 100644 --- a/src/cellfilter.rs +++ b/src/cellfilter.rs @@ -12,7 +12,7 @@ use dashmap::DashMap; use slog::crit; use slog::{info, warn}; -//use crate::diagnostics; +use 
crate::diagnostics;
 use crate::prog_opts::GenPermitListOpts;
 use crate::utils as afutils;
 #[allow(unused_imports)]
@@ -815,8 +815,8 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result
         num_chunks.expect("nonzero").to_formatted_string(&Locale::en),
         max_ambiguity_read.to_formatted_string(&Locale::en)
     );
-    let _valid_thresh = 0.3f64;
-    /*match diagnostics::likely_valid_permit_list(
+    let valid_thresh = 0.3f64;
+    match diagnostics::likely_valid_permit_list(
         unmatched_bc.len(),
         num_reads,
         valid_thresh,
@@ -830,7 +830,6 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result
             warn!(log, "{:?}", e);
         }
     }
-    */
 
     process_unfiltered(
         hmu.unwrap(),
diff --git a/src/diagnostics.rs b/src/diagnostics.rs
new file mode 100644
index 0000000..c43e251
--- /dev/null
+++ b/src/diagnostics.rs
@@ -0,0 +1,22 @@
+use anyhow;
+
+pub(crate) fn likely_valid_permit_list(
+    num_unmatched: usize,
+    total_mapped: usize,
+    thresh: f64,
+) -> anyhow::Result {
+    if total_mapped > 0 {
+        let unmatched_frac = (num_unmatched as f64) / (total_mapped as f64);
+        if unmatched_frac < thresh {
+            anyhow::Ok(unmatched_frac)
+        } else {
+            anyhow::bail!(
+                "Percentage of mapped reads not matching a known barcode exactly ({}%) is > the suggested fraction ({}%)",
+                unmatched_frac * 100.0f64,
+                thresh * 100.0f64
+            )
+        }
+    } else {
+        anyhow::bail!("Cannot determine (likely) valid permit list if no reads are mapped")
+    }
+}

From 4e43f1e912408d7008c2524908e829610b2639e5 Mon Sep 17 00:00:00 2001
From: Rob Patro 
Date: Tue, 3 Dec 2024 21:42:33 -0500
Subject: [PATCH 07/16] declare diagnostics

---
 src/lib.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 6864b2e..a5e0cca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,13 +7,13 @@
  * License: 3-clause BSD, see https://opensource.org/licenses/BSD-3-Clause
  */

+pub mod atac;
 pub mod cellfilter;
 pub mod cmd_parse_utils;
 pub mod collate;
 pub mod constants;
 pub mod convert;
-//pub mod diagnostics;
-pub mod atac;
+pub mod diagnostics;
 pub mod em;
 pub mod eq_class;
 pub mod infer;

From a6e53c13d5e366a997beb2ea48ee29eca56d452d Mon Sep 17 00:00:00 2001
From: rob-p 
Date: Thu, 5 Dec 2024 22:27:24 -0500
Subject: [PATCH 08/16] multithreaded maybe

---
 src/atac/cellfilter.rs | 219 +++++++++++++++++++++++++++++++----------
 src/atac/prog_opts.rs  |   1 +
 src/atac/run.rs        |   4 +
 src/main.rs            |   3 +-
 4 files changed, 173 insertions(+), 54 deletions(-)

diff --git a/src/atac/cellfilter.rs b/src/atac/cellfilter.rs
index 8c47633..4fcfe5c 100644
--- a/src/atac/cellfilter.rs
+++ b/src/atac/cellfilter.rs
@@ -2,6 +2,13 @@ use crate::atac::prog_opts::GenPermitListOpts;
 use crate::utils as afutils;
 use anyhow::{anyhow, Context};
 use bstr::io::BufReadExt;
+use dashmap::DashMap;
+use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
+use std::num::NonZeroUsize;
+use std::sync::{
+    atomic::{AtomicU64, Ordering as AtomicOrdering},
+    Arc,
+};
 // use indexmap::map::IndexMap;
 use itertools::Itertools;
 use libradicl::exit_codes;
@@ -60,11 +67,11 @@ pub enum CellFilterMethod {
 }

 pub fn update_barcode_hist_unfiltered(
-    hist: &mut HashMap,
+    hist: &DashMap,
     unmatched_bc: &mut Vec,
     max_ambiguity_read: &mut usize,
     chunk: &chunk::Chunk,
-    bins: &mut [u64],
+    bins: &[AtomicU64],
     blens: &[u64],
     size_range: u64,
 ) -> usize {
@@ -76,7 +83,7 @@
         // barcodes
         match hist.get_mut(&r.bc) {
             // if we find a match, increment the count
-            Some(c) => *c += 1,
+            Some(mut c) => *c += 1,
             // otherwise, push this into the 
unmatched list None => { unmatched_bc.push(r.bc); @@ -90,7 +97,7 @@ pub fn update_barcode_hist_unfiltered( let sp = r.start_pos[i]; let bid = sp as u64 / size_range; let ind: usize = (blens[ref_id as usize] + bid) as usize; - bins[ind] += 1; + bins[ind].fetch_add(1, AtomicOrdering::AcqRel); } } num_strand_compat_reads @@ -100,9 +107,9 @@ fn populate_unfiltered_barcode_map( br: BufReader, first_bclen: &mut usize, rev_bc: bool, -) -> HashMap { +) -> DashMap { let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64); - let mut hm = HashMap::with_hasher(s); + let mut hm = DashMap::with_hasher(s); // read through the external unfiltered barcode list // and generate a vector of encoded barcodes @@ -135,7 +142,7 @@ fn populate_unfiltered_barcode_map( #[allow(clippy::unnecessary_unwrap, clippy::too_many_arguments)] fn process_unfiltered( - mut hm: HashMap, + hm: DashMap, mut unmatched_bc: Vec, file_tag_map: &rad_types::TagMap, filter_meth: &CellFilterMethod, @@ -163,19 +170,19 @@ fn process_unfiltered( let mut kept_bc = Vec::::new(); // iterate over the count map - for (k, v) in hm.iter_mut() { + for mut kvp in hm.iter_mut() { // if this satisfies our requirement for the minimum count // then keep this barcode - if *v >= min_freq { - kept_bc.push(*k); + if *kvp.value() >= min_freq { + kept_bc.push(*kvp.key()); } else { // otherwise, we have to add this barcode's // counts to our unmatched list - for _ in 0..*v { - unmatched_bc.push(*k); + for _ in 0..*kvp.value() { + unmatched_bc.push(*kvp.key()); } // and then reset the counter for this barcode to 0 - *v = 0u64; + *kvp.value_mut() = 0u64; } } @@ -237,7 +244,7 @@ fn process_unfiltered( if cbc != *ubc && n == 1 { // then increment the count of this // barcode by 1 (because we'll correct to it) - if let Some(c) = hm.get_mut(&cbc) { + if let Some(mut c) = hm.get_mut(&cbc) { *c += count as u64; corrected_list.push((*ubc, cbc)); } @@ -294,6 +301,8 @@ fn process_unfiltered( let parent = std::path::Path::new(output_dir); let o_path = parent.join("permit_freq.bin"); + // convert the DashMap to a HashMap + let mut hm: HashMap = hm.into_iter().collect(); match afutils::write_permit_list_freq(&o_path, barcode_len, &hm) { Ok(_) => {} Err(error) => { @@ -410,11 +419,18 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result ); } + let nworkers: usize = gpl_opts.threads; let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?; - let mut br = BufReader::new(i_file); - let prelude = RadPrelude::from_bytes(&mut br)?; - let hdr = &prelude.hdr; + let metadata = i_file.metadata()?; + let file_len = metadata.len(); + let ifile = BufReader::new(i_file); + let mut rad_reader = + libradicl::readers::ParallelRadReader::>::new( + ifile, + NonZeroUsize::new(nworkers).unwrap(), + ); + let hdr = &rad_reader.prelude.hdr; info!( log, "paired : {:?}, ref_count : {}, num_chunks : {}", @@ -424,10 +440,10 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result ); // file-level - let fl_tags = &prelude.file_tags; + let fl_tags = &rad_reader.prelude.file_tags; info!(log, "read {:?} file-level tags", fl_tags.tags.len()); // read-level - let rl_tags = &prelude.read_tags; + let rl_tags = &rad_reader.prelude.read_tags; info!(log, "read {:?} read-level tags", rl_tags.tags.len()); const BNAME: &str = "barcode"; @@ -448,25 +464,23 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result } // alignment-level - let al_tags = &prelude.aln_tags; - info!(log, "read {:?} alignment-level tags", 
al_tags.tags.len());
-
-    let file_tag_map = prelude.file_tags.parse_tags_from_bytes(&mut br)?;
-    info!(log, "File-level tag values {:?}", file_tag_map);
-
-    let ref_tag = file_tag_map
-        .get("ref_lengths")
-        .expect("tag must contain ref lengths");
-    let TagValue::ArrayU64(ref_lens) = ref_tag else {
-        todo!()
-    };
+    let al_tags = &rad_reader.prelude.aln_tags;
+    info!(log, "read {:?} alignment-level tags", al_tags.tags.len());
+
+    let mut ref_lens;
+    {
+        let file_tag_map = &rad_reader.file_tag_map;
+        info!(log, "File-level tag values {:?}", file_tag_map);
+        ref_lens = match file_tag_map.get("ref_lengths") {
+            Some(TagValue::ArrayU64(v)) => v.clone(),
+            _ => panic!("expected \"ref_lengths\" to be an ArrayU64, but didn't find that"),
+        };
+    }
 
-    let record_context = prelude.get_record_context::()?;
     let mut num_reads: usize = 0;
 
-    let mut blens: Vec = vec![0; ref_lens.len() + 1];
-    let tot_bins = initialize_rec_list(&mut blens, ref_lens, size_range);
-    let mut bins: Vec = vec![0; tot_bins.unwrap() as usize];
+    //let record_context = prelude.get_record_context::()?;
+    let mut num_reads: usize = 0;
 
     // if dealing with the unfiltered type
     // the set of barcodes that are not an exact match for any known barcodes
     let mut unmatched_bc: Vec;
     // let mut num_orientation_compat_reads = 0usize;
     let mut max_ambiguity_read = 0usize;
@@ -476,26 +490,121 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result
     let mut num_orientation_compat_reads = 0usize;
 
     // Tracking if a unique or a multihit
+    // for progress bar
+    let header_offset = rad_reader.get_byte_offset();
+    let pbar = ProgressBar::new(file_len - header_offset);
+    pbar.set_style(
+        ProgressStyle::with_template(
+            "[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}",
+        )
+        .unwrap()
+        .progress_chars("##-"),
+    );
+    pbar.set_draw_target(ProgressDrawTarget::stderr_with_hz(5));
+    let cb = |new_bytes: u64, _new_chunks: u64| {
+        pbar.inc(new_bytes);
+    };
+
     match filter_meth {
         CellFilterMethod::UnfilteredExternalList(_, _min_reads) => {
             unmatched_bc = Vec::with_capacity(10000000);
             // the unfiltered_bc_count map must be valid in this branch
-            if let Some(mut hmu) = unfiltered_bc_counts {
-                while has_data_left(&mut br).expect("encountered error reading input file") {
-                    let c = chunk::Chunk::::from_bytes(&mut br, &record_context);
-                    num_orientation_compat_reads += update_barcode_hist_unfiltered(
-                        &mut hmu,
-                        &mut unmatched_bc,
-                        &mut max_ambiguity_read,
-                        &c,
-                        &mut bins,
-                        &blens,
-                        size_range,
+            // the unfiltered_bc_count map must be valid in this branch
+            if unfiltered_bc_counts.is_some() {
+                let (hmu, bins, blens, num_chunks) = std::thread::scope(|s| {
+                    let mut blens: Vec = vec![0; ref_lens.len() + 1];
+                    let tot_bins = initialize_rec_list(&mut blens, &ref_lens, size_range);
+                    let blens = blens;
+                    let mut bins: Arc> = Arc::new(
+                        vec![0; tot_bins.unwrap() as usize]
+                            .iter()
+                            .map(|x| AtomicU64::new(*x))
+                            .collect(),
                     );
-                    num_chunks += 1;
-                    num_reads += c.reads.len();
-                }
+
+                    let hmu = std::sync::Arc::new(unfiltered_bc_counts.unwrap());
+                    let mut num_chunks = 0usize;
+                    let mut handles = Vec::<
+                        std::thread::ScopedJoinHandle<(usize, usize, Vec, usize, usize)>,
+                    >::new();
+                    for _ in 0..nworkers {
+                        let rd = rad_reader.is_done();
+                        let q = rad_reader.get_queue();
+                        let hmu = hmu.clone();
+                        let mut blens = blens.clone();
+                        let mut bins = bins.clone();
+                        let handle = s.spawn(move || {
+                            let mut unmatched_bc = Vec::::new();
+                            let mut max_ambiguity_read = 0usize;
+                            let mut num_reads = 0;
+                            let mut num_chunks = 0;
+                            let mut num_orientation_compat_reads = 0;
+                            while !rd.load(AtomicOrdering::SeqCst) {
+                                while let Some(meta_chunk) = q.pop() { 
+ for c in meta_chunk.iter() { + num_orientation_compat_reads += + update_barcode_hist_unfiltered( + &hmu, + &mut unmatched_bc, + &mut max_ambiguity_read, + &c, + &bins, + &blens, + size_range, + ); + num_reads += c.reads.len(); + num_chunks += 1; + } + } + } + ( + num_reads, + num_orientation_compat_reads, + unmatched_bc, + max_ambiguity_read, + num_chunks, + ) + }); + handles.push(handle); + } + let _ = rad_reader.start_chunk_parsing(Some(cb)); //libradicl::readers::EMPTY_METACHUNK_CALLBACK); + for handle in handles { + let (nr, nocr, ubc, mar, nc) = + handle.join().expect("The parsing thread panicked"); + num_reads += nr; + num_orientation_compat_reads += nocr; + unmatched_bc.extend_from_slice(&ubc); + max_ambiguity_read = max_ambiguity_read.max(mar); + num_chunks += nc; + } + pbar.finish_with_message("finished parsing RAD file\n"); + // return the hash map we no longer need + ( + Arc::>::into_inner(hmu), + Arc::>::into_inner(bins), + blens, + num_chunks, + ) + }); + + /* + if let Some(mut hmu) = unfiltered_bc_counts { + while has_data_left(&mut br).expect("encountered error reading input file") { + let c = chunk::Chunk::::from_bytes(&mut br, &record_context); + num_orientation_compat_reads += update_barcode_hist_unfiltered( + &mut hmu, + &mut unmatched_bc, + &mut max_ambiguity_read, + &c, + &mut bins, + &blens, + size_range, + ); + num_chunks += 1; + num_reads += c.reads.len(); + } + */ let bin_recs_path = output_dir.join("bin_recs.bin"); let br_file = std::fs::File::create(bin_recs_path) @@ -520,17 +629,21 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result max_ambiguity_read.to_formatted_string(&Locale::en) ); process_unfiltered( - hmu, + hmu.expect("barcode map should be defined"), unmatched_bc, - &file_tag_map, + &rad_reader.file_tag_map, &filter_meth, output_dir, version, max_ambiguity_read, - num_chunks, + num_chunks as u32, cmdline, log, - *bins.iter().max().unwrap(), + bins.expect("invalid bins") + .into_iter() + .map(|v| v.load(AtomicOrdering::SeqCst)) + .max() + .unwrap(), &gpl_opts, ) } else { diff --git a/src/atac/prog_opts.rs b/src/atac/prog_opts.rs index 3614045..930f809 100644 --- a/src/atac/prog_opts.rs +++ b/src/atac/prog_opts.rs @@ -9,6 +9,7 @@ pub struct GenPermitListOpts<'a, 'b, 'c, 'd, 'e> { pub input_dir: &'a PathBuf, pub output_dir: &'b PathBuf, pub fmeth: CellFilterMethod, + pub threads: usize, pub rc: bool, pub cmdline: &'c str, pub version: &'d str, diff --git a/src/atac/run.rs b/src/atac/run.rs index af62a2a..7520454 100644 --- a/src/atac/run.rs +++ b/src/atac/run.rs @@ -32,12 +32,16 @@ pub fn run(opts: &ArgMatches, version: &str, cmdline: &str, log: &Logger) -> any fmeth = CellFilterMethod::UnfilteredExternalList(v.clone(), min_reads); }; let rc: bool = *t.get_one("rev-comp").expect("reverse comp must be boolean"); + let threads = *t + .get_one::("threads") + .expect("number of threads should be given") as usize; let gpl_opts = GenPermitListOpts::builder() .input_dir(input_dir) .output_dir(output_dir) .fmeth(fmeth) .rc(rc) + .threads(threads) .version(version) .cmdline(cmdline) .log(log) diff --git a/src/main.rs b/src/main.rs index 38688f0..346e1c5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -618,7 +618,7 @@ fn atac_sub_commands() -> Command { let num_hardware_threads = num_cpus::get() as u32; let max_num_threads: String = (num_cpus::get() as u32).to_string(); let max_num_collate_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); - let _max_num_gpl_threads: String = 
(8_u32.min(num_hardware_threads).max(2_u32)).to_string(); + let max_num_gpl_threads: String = (8_u32.min(num_hardware_threads).max(2_u32)).to_string(); let max_num_sort_threads: String = (16_u32.min(num_hardware_threads).max(2_u32)).to_string(); let crate_authors = crate_authors!("\n"); @@ -635,6 +635,7 @@ fn atac_sub_commands() -> Command { .required(true) .value_parser(value_parser!(PathBuf)) ) + .arg(arg!(-t --threads "number of threads to use for the first phase of permit-list generation").value_parser(value_parser!(u32)).default_value(max_num_gpl_threads)) .arg( arg!(-u --"unfiltered-pl" "uses an unfiltered external permit list") .value_parser(pathbuf_file_exists_validator) From 8f27f3164f9094175079d12658a372b3a4ed0284 Mon Sep 17 00:00:00 2001 From: rob-p Date: Fri, 6 Dec 2024 00:44:39 -0500 Subject: [PATCH 09/16] same output --- src/atac/cellfilter.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/atac/cellfilter.rs b/src/atac/cellfilter.rs index 4fcfe5c..61523dd 100644 --- a/src/atac/cellfilter.rs +++ b/src/atac/cellfilter.rs @@ -610,6 +610,11 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result let br_file = std::fs::File::create(bin_recs_path) .expect("could not create serialization file."); let mut br_writer = BufWriter::new(&br_file); + let bins: Vec = bins + .expect("bins Option should be Some") + .into_iter() + .map(|x| x.load(AtomicOrdering::SeqCst)) + .collect(); bincode::serialize_into(&mut br_writer, &bins) .expect("couldn't serialize bins recs."); @@ -639,11 +644,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result num_chunks as u32, cmdline, log, - bins.expect("invalid bins") - .into_iter() - .map(|v| v.load(AtomicOrdering::SeqCst)) - .max() - .unwrap(), + *bins.iter().max().expect("bins should not be empty"), &gpl_opts, ) } else { From c253fc56a4b9af6a4c449b41bc46691fd5dec012 Mon Sep 17 00:00:00 2001 From: rob-p Date: Fri, 6 Dec 2024 00:50:41 -0500 Subject: [PATCH 10/16] clippy is happy --- src/atac/cellfilter.rs | 20 ++++++-------------- src/atac/run.rs | 4 ++-- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/atac/cellfilter.rs b/src/atac/cellfilter.rs index 61523dd..77df765 100644 --- a/src/atac/cellfilter.rs +++ b/src/atac/cellfilter.rs @@ -14,13 +14,8 @@ use itertools::Itertools; use libradicl::exit_codes; use libradicl::rad_types; use libradicl::rad_types::TagValue; -use libradicl::utils::has_data_left; use libradicl::BarcodeLookupMap; -use libradicl::{ - chunk, - header::RadPrelude, - record::{AtacSeqReadRecord, AtacSeqRecordContext}, -}; +use libradicl::{chunk, record::AtacSeqReadRecord}; use num_format::{Locale, ToFormattedString}; use serde::Serialize; use serde_json::json; @@ -109,7 +104,7 @@ fn populate_unfiltered_barcode_map( rev_bc: bool, ) -> DashMap { let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64); - let mut hm = DashMap::with_hasher(s); + let hm = DashMap::with_hasher(s); // read through the external unfiltered barcode list // and generate a vector of encoded barcodes @@ -371,7 +366,6 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result let cmdline = gpl_opts.cmdline; let log = gpl_opts.log; let rc = gpl_opts.rc; - let mut num_chunks = 0; let size_range: u64 = 100000; let i_dir = std::path::Path::new(&rad_dir); let o_dir_path = std::path::Path::new(&output_dir); @@ -467,7 +461,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result let al_tags = &rad_reader.prelude.aln_tags; info!(log, 
"read {:?} alignemnt-level tags", al_tags.tags.len()); - let mut ref_lens; + let ref_lens; { let file_tag_map = &rad_reader.file_tag_map; info!(log, "File-level tag values {:?}", file_tag_map); @@ -477,8 +471,6 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result }; } - let mut num_reads: usize = 0; - //let record_context = prelude.get_record_context::()?; let mut num_reads: usize = 0; @@ -516,7 +508,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result let mut blens: Vec = vec![0; ref_lens.len() + 1]; let tot_bins = initialize_rec_list(&mut blens, &ref_lens, size_range); let blens = blens; - let mut bins: Arc> = Arc::new( + let bins: Arc> = Arc::new( vec![0; tot_bins.unwrap() as usize] .iter() .map(|x| AtomicU64::new(*x)) @@ -532,8 +524,8 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result let rd = rad_reader.is_done(); let q = rad_reader.get_queue(); let hmu = hmu.clone(); - let mut blens = blens.clone(); - let mut bins = bins.clone(); + let blens = blens.clone(); + let bins = bins.clone(); let handle = s.spawn(move || { let mut unmatched_bc = Vec::::new(); let mut max_ambiguity_read = 0usize; diff --git a/src/atac/run.rs b/src/atac/run.rs index 7520454..cee0071 100644 --- a/src/atac/run.rs +++ b/src/atac/run.rs @@ -104,9 +104,9 @@ pub fn run(opts: &ArgMatches, version: &str, cmdline: &str, log: &Logger) -> any .input_dir(input_dir) .num_threads(num_threads) .rev(rc) - .cmdline(&cmdline) + .cmdline(cmdline) .version(version) - .log(&log) + .log(log) .build(); let parent = std::path::Path::new(&input_dir); From 915ec2be7aa554f9b8324db0e1f692d243235253 Mon Sep 17 00:00:00 2001 From: rob-p Date: Fri, 6 Dec 2024 16:23:49 -0500 Subject: [PATCH 11/16] cleanup --- src/atac/cellfilter.rs | 120 ++++++++++++++++------------------------- src/atac/sort.rs | 31 +++++++++-- src/main.rs | 2 +- src/utils.rs | 9 ++++ 4 files changed, 81 insertions(+), 81 deletions(-) diff --git a/src/atac/cellfilter.rs b/src/atac/cellfilter.rs index 77df765..33f791e 100644 --- a/src/atac/cellfilter.rs +++ b/src/atac/cellfilter.rs @@ -1,12 +1,12 @@ use crate::atac::prog_opts::GenPermitListOpts; use crate::utils as afutils; -use anyhow::{anyhow, Context}; +use anyhow::{bail, Context}; use bstr::io::BufReadExt; use dashmap::DashMap; -use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; +use indicatif::{ProgressBar, ProgressDrawTarget}; use std::num::NonZeroUsize; use std::sync::{ - atomic::{AtomicU64, Ordering as AtomicOrdering}, + atomic::{AtomicU64, AtomicUsize, Ordering as AtomicOrdering}, Arc, }; // use indexmap::map::IndexMap; @@ -28,6 +28,9 @@ use std::io::{BufWriter, Write}; use std::path::PathBuf; use std::time::Instant; +type ParBCMap = DashMap; +type BCMap = HashMap; + /// Initialize the index map with key being references and position /// Take the largest reference length (from chromosome) /// For each chromosome divide into ranges of size_range uptil max_size @@ -62,7 +65,7 @@ pub enum CellFilterMethod { } pub fn update_barcode_hist_unfiltered( - hist: &DashMap, + hist: &ParBCMap, unmatched_bc: &mut Vec, max_ambiguity_read: &mut usize, chunk: &chunk::Chunk, @@ -102,9 +105,9 @@ fn populate_unfiltered_barcode_map( br: BufReader, first_bclen: &mut usize, rev_bc: bool, -) -> DashMap { +) -> ParBCMap { let s = ahash::RandomState::with_seeds(2u64, 7u64, 1u64, 8u64); - let hm = DashMap::with_hasher(s); + let hm = ParBCMap::with_hasher(s); // read through the external unfiltered barcode list // and generate a vector of encoded 
barcodes @@ -137,7 +140,7 @@ fn populate_unfiltered_barcode_map( #[allow(clippy::unnecessary_unwrap, clippy::too_many_arguments)] fn process_unfiltered( - hm: DashMap, + hm: ParBCMap, mut unmatched_bc: Vec, file_tag_map: &rad_types::TagMap, filter_meth: &CellFilterMethod, @@ -196,11 +199,6 @@ fn process_unfiltered( .expect("tag map must contain cblen"); let barcode_len: u16 = barcode_tag.try_into()?; - // let ref_lens = file_tag_map - // .get("ref_lengths") - // .expect("tag map must contain ref_lengths"); - // now, we create a second barcode map with just the barcodes - // for cells we will keep / rescue. let bcmap2 = BarcodeLookupMap::new(kept_bc, barcode_len as u32); info!( log, @@ -297,7 +295,7 @@ fn process_unfiltered( let o_path = parent.join("permit_freq.bin"); // convert the DashMap to a HashMap - let mut hm: HashMap = hm.into_iter().collect(); + let mut hm: BCMap = hm.into_iter().collect(); match afutils::write_permit_list_freq(&o_path, barcode_len, &hm) { Ok(_) => {} Err(error) => { @@ -380,7 +378,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result rad_dir.display() ); // std::process::exit(1); - return Err(anyhow!("execution terminated unexpectedly")); + bail!("execution terminated unexpectedly"); } let mut first_bclen = 0usize; @@ -413,10 +411,13 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result ); } - let nworkers: usize = gpl_opts.threads; + let nworkers: usize = gpl_opts.threads.saturating_sub(1).max(1); + // open the input rad file and get the total file length from metadata to support + // a progress bar let i_file = File::open(i_dir.join("map.rad")).context("could not open input rad file")?; let metadata = i_file.metadata()?; let file_len = metadata.len(); + let ifile = BufReader::new(i_file); let mut rad_reader = libradicl::readers::ParallelRadReader::>::new( @@ -430,7 +431,10 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result "paired : {:?}, ref_count : {}, num_chunks : {}", hdr.is_paired != 0, hdr.ref_count.to_formatted_string(&Locale::en), - hdr.num_chunks.to_formatted_string(&Locale::en), + match hdr.num_chunks() { + None => String::from("unknown"), + Some(v) => v.to_formatted_string(&Locale::en), + } ); // file-level @@ -441,19 +445,14 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result info!(log, "read {:?} read-level tags", rl_tags.tags.len()); const BNAME: &str = "barcode"; - // let mut bct: Option = None; - for rt in &rl_tags.tags { // if this is one of our tags if rt.name == BNAME && !rt.typeid.is_int_type() { crit!( log, - "currently only RAD types 1--4 are supported for 'b' tags." + "currently only RAD types 1--4 are supported for 'b' tags. exiting." ); std::process::exit(exit_codes::EXIT_UNSUPPORTED_TAG_TYPE); - // if rt.name == BNAME { - // bct = Some(rt.typeid); - // } } } @@ -463,34 +462,33 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result let ref_lens; { + info!(log, "reading reference lengths from file-level tag map"); let file_tag_map = &rad_reader.file_tag_map; - info!(log, "File-level tag values {:?}", file_tag_map); ref_lens = match file_tag_map.get("ref_lengths") { Some(TagValue::ArrayU64(v)) => v.clone(), - _ => panic!("expected \"ref_lengths\" to be an ArrayU64, but didn't find that"), + _ => bail!( + "expected the \"ref_lengths\" tag value to exist and to be an ArrayU64, but didn't find that; cannot proceed." 
+ ), }; } - //let record_context = prelude.get_record_context::()?; - let mut num_reads: usize = 0; - + let num_reads = Arc::new(AtomicUsize::new(0)); // if dealing with the unfiltered type // the set of barcodes that are not an exact match for any known barcodes let mut unmatched_bc: Vec; // let mut num_orientation_compat_reads = 0usize; let mut max_ambiguity_read = 0usize; let mut num_orientation_compat_reads = 0usize; - // Tracking if a unique or a multihit // for progress bar let header_offset = rad_reader.get_byte_offset(); let pbar = ProgressBar::new(file_len - header_offset); pbar.set_style( - ProgressStyle::with_template( - "[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}", + indicatif::ProgressStyle::with_template( + "[{elapsed_precise}] {bar:40.green/blue} {human_pos:>12}/{human_len:12} {msg}", ) .unwrap() - .progress_chars("##-"), + .progress_chars("⠁⠁⠉⠙⠚⠒⠂⠂⠒⠲⠴⠤⠄⠄⠤⠠⠠⠤⠦⠖⠒⠐⠐⠒⠓⠋⠉⠈⠈"), ); pbar.set_draw_target(ProgressDrawTarget::stderr_with_hz(5)); let cb = |new_bytes: u64, _new_chunks: u64| { @@ -499,8 +497,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result match filter_meth { CellFilterMethod::UnfilteredExternalList(_, _min_reads) => { - unmatched_bc = Vec::with_capacity(10000000); - // the unfiltered_bc_count map must be valid in this branch + unmatched_bc = Vec::with_capacity(10_000_000); // the unfiltered_bc_count map must be valid in this branch if unfiltered_bc_counts.is_some() { @@ -508,29 +505,25 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result let mut blens: Vec = vec![0; ref_lens.len() + 1]; let tot_bins = initialize_rec_list(&mut blens, &ref_lens, size_range); let blens = blens; - let bins: Arc> = Arc::new( - vec![0; tot_bins.unwrap() as usize] - .iter() - .map(|x| AtomicU64::new(*x)) - .collect(), - ); + let mut bin_vec = vec![]; + bin_vec.resize_with(tot_bins.unwrap() as usize, || AtomicU64::new(0)); + let bins: Arc> = Arc::new(bin_vec); let hmu = std::sync::Arc::new(unfiltered_bc_counts.unwrap()); - let mut num_chunks = 0usize; - let mut handles = Vec::< - std::thread::ScopedJoinHandle<(usize, usize, Vec, usize, usize)>, - >::new(); + let num_chunks = Arc::new(AtomicUsize::new(0)); + let mut handles = + Vec::, usize)>>::new(); for _ in 0..nworkers { let rd = rad_reader.is_done(); let q = rad_reader.get_queue(); let hmu = hmu.clone(); let blens = blens.clone(); let bins = bins.clone(); + let num_reads = num_reads.clone(); + let num_chunks = num_chunks.clone(); let handle = s.spawn(move || { let mut unmatched_bc = Vec::::new(); let mut max_ambiguity_read = 0usize; - let mut num_reads = 0; - let mut num_chunks = 0; let mut num_orientation_compat_reads = 0; while !rd.load(AtomicOrdering::SeqCst) { while let Some(meta_chunk) = q.pop() { @@ -545,59 +538,36 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result &blens, size_range, ); - num_reads += c.reads.len(); - num_chunks += 1; + num_reads.fetch_add(c.reads.len(), AtomicOrdering::AcqRel); + num_chunks.fetch_add(1, AtomicOrdering::AcqRel); } } } ( - num_reads, num_orientation_compat_reads, unmatched_bc, max_ambiguity_read, - num_chunks, ) }); handles.push(handle); } - let _ = rad_reader.start_chunk_parsing(Some(cb)); //libradicl::readers::EMPTY_METACHUNK_CALLBACK); + let _ = rad_reader.start_chunk_parsing(Some(cb)); for handle in handles { - let (nr, nocr, ubc, mar, nc) = - handle.join().expect("The parsing thread panicked"); - num_reads += nr; + let (nocr, ubc, mar) = handle.join().expect("The parsing thread panicked"); 
num_orientation_compat_reads += nocr; unmatched_bc.extend_from_slice(&ubc); max_ambiguity_read = max_ambiguity_read.max(mar); - num_chunks += nc; } pbar.finish_with_message("finished parsing RAD file\n"); // return the hash map we no longer need ( - Arc::>::into_inner(hmu), + Arc::::into_inner(hmu), Arc::>::into_inner(bins), blens, - num_chunks, + num_chunks.load(AtomicOrdering::Acquire), ) }); - /* - if let Some(mut hmu) = unfiltered_bc_counts { - while has_data_left(&mut br).expect("encountered error reading input file") { - let c = chunk::Chunk::::from_bytes(&mut br, &record_context); - num_orientation_compat_reads += update_barcode_hist_unfiltered( - &mut hmu, - &mut unmatched_bc, - &mut max_ambiguity_read, - &c, - &mut bins, - &blens, - size_range, - ); - num_chunks += 1; - num_reads += c.reads.len(); - } - */ - let bin_recs_path = output_dir.join("bin_recs.bin"); let br_file = std::fs::File::create(bin_recs_path) .expect("could not create serialization file."); @@ -620,7 +590,7 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result info!( log, "observed {} reads ({} orientation consistent) in {} chunks --- max ambiguity read occurs in {} refs", - num_reads.to_formatted_string(&Locale::en), + num_reads.load(AtomicOrdering::Acquire).to_formatted_string(&Locale::en), num_orientation_compat_reads.to_formatted_string(&Locale::en), num_chunks.to_formatted_string(&Locale::en), max_ambiguity_read.to_formatted_string(&Locale::en) diff --git a/src/atac/sort.rs b/src/atac/sort.rs index ea2e44c..6a4546c 100644 --- a/src/atac/sort.rs +++ b/src/atac/sort.rs @@ -1,5 +1,6 @@ use crate::constants as afconst; -use crate::utils::InternalVersionInfo; +use crate::utils as afutils; +use afutils::InternalVersionInfo; use anyhow::{anyhow, Context}; use crossbeam_queue::ArrayQueue; use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle}; @@ -144,12 +145,18 @@ pub fn sort_temp_bucket( let bed_string: String = get_bed_string(&hit_info_vec, ref_names, barcode_len, rc); if compress { - let bd = File::create(parent.join(format!("{}.bed.gz", buck_id))).unwrap(); + let bname = parent.join(format!("{}.bed.gz", buck_id)); + afutils::remove_file_if_exists(&bname)?; + let bd = File::create(&bname) + .with_context(|| format!("could not create temporary bed file {}", bname.display()))?; let mut encoder = GzEncoder::new(bd, Compression::default()); encoder.write_all(bed_string.as_bytes()).unwrap(); encoder.finish()?; } else { - let mut bd = File::create(parent.join(format!("{}.bed", buck_id))).unwrap(); + let bname = parent.join(format!("{}.bed", buck_id)); + afutils::remove_file_if_exists(&bname)?; + let mut bd = File::create(&bname) + .with_context(|| format!("could not create temporary bed file {}", bname.display()))?; bd.write_all(bed_string.as_bytes())?; } Ok(()) @@ -806,7 +813,14 @@ where expected_output_chunks.to_formatted_string(&Locale::en) ); if compress_out { - let out_bed_file = File::create(parent.join("map.bed.gz")).unwrap(); + let bedname = parent.join("map.bed.gz"); + afutils::remove_file_if_exists(&bedname)?; + let out_bed_file = File::create(&bedname).with_context(|| { + format!( + "could not create target output bed file {}", + bedname.display() + ) + })?; let mut encoder = GzEncoder::new(out_bed_file, Compression::default()); for i in 0..temp_buckets.len() { let temp_bed_name = parent.join(format!("{}.bed.gz", i)); @@ -816,7 +830,14 @@ where } encoder.finish()?; } else { - let mut out_bed_file = File::create(parent.join("map.bed")).unwrap(); + let bedname = 
parent.join("map.bed"); + afutils::remove_file_if_exists(&bedname)?; + let mut out_bed_file = File::create(&bedname).with_context(|| { + format!( + "could not create target output bed file {}", + bedname.display() + ) + })?; for i in 0..temp_buckets.len() { let temp_bed_name = parent.join(format!("{}.bed", i)); let mut input = File::open(&temp_bed_name)?; diff --git a/src/main.rs b/src/main.rs index 346e1c5..1e53437 100644 --- a/src/main.rs +++ b/src/main.rs @@ -701,7 +701,7 @@ fn atac_sub_commands() -> Command { ); Command::new("atac") - .about("Deduplicate the RAD file and output a BED file") + .about("subcommand for processing scATAC-seq RAD files") .version(version) .author(crate_authors) .subcommand(gen_app) diff --git a/src/utils.rs b/src/utils.rs index 6777be6..857eb72 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -18,6 +18,7 @@ use std::collections::{HashMap, HashSet}; use std::error::Error; use std::fs::File; use std::io::{BufReader, BufWriter, Write}; +use std::path::Path; use std::path::PathBuf; use std::str::FromStr; use thiserror::Error; @@ -40,6 +41,14 @@ struct QuantArguments { } */ +pub(crate) fn remove_file_if_exists(fname: &Path) -> anyhow::Result<()> { + if fname.exists() { + std::fs::remove_file(fname) + .with_context(|| format!("could not remove {}", fname.display()))?; + } + Ok(()) +} + /// FROM https://github.com/10XGenomics/rust-debruijn/blob/master/src/dna_string.rs /// count Hamming distance between 2 2-bit DNA packed u64s pub(super) fn count_diff_2_bit_packed(a: u64, b: u64) -> usize { From 2dfae94928018f8d7306a6a6130428d40bdfad40 Mon Sep 17 00:00:00 2001 From: rob-p Date: Fri, 6 Dec 2024 16:39:33 -0500 Subject: [PATCH 12/16] add diagnostic about barcode matching --- src/atac/cellfilter.rs | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/atac/cellfilter.rs b/src/atac/cellfilter.rs index 33f791e..1fc1efc 100644 --- a/src/atac/cellfilter.rs +++ b/src/atac/cellfilter.rs @@ -1,4 +1,5 @@ use crate::atac::prog_opts::GenPermitListOpts; +use crate::diagnostics; use crate::utils as afutils; use anyhow::{bail, Context}; use bstr::io::BufReadExt; @@ -19,8 +20,7 @@ use libradicl::{chunk, record::AtacSeqReadRecord}; use num_format::{Locale, ToFormattedString}; use serde::Serialize; use serde_json::json; -use slog::crit; -use slog::info; +use slog::{crit, info, warn}; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Read}; @@ -587,10 +587,27 @@ pub fn generate_permit_list(gpl_opts: GenPermitListOpts) -> anyhow::Result bincode::serialize_into(&mut bl_writer, &blens) .expect("couldn't serialize bins lengths."); + let num_reads = num_reads.load(AtomicOrdering::Acquire); + let valid_thresh = 0.3f64; + match diagnostics::likely_valid_permit_list( + unmatched_bc.len(), + num_reads, + valid_thresh, + ) { + Ok(f) => { + info!(log, + "The percentage of mapped reads not matching a known barcode exactly is {:.3}%, which is < the warning threshold {:.3}%", + f * 100f64, valid_thresh * 100f64); + } + Err(e) => { + warn!(log, "{:?}", e); + } + } + info!( log, "observed {} reads ({} orientation consistent) in {} chunks --- max ambiguity read occurs in {} refs", - num_reads.load(AtomicOrdering::Acquire).to_formatted_string(&Locale::en), + num_reads.to_formatted_string(&Locale::en), num_orientation_compat_reads.to_formatted_string(&Locale::en), num_chunks.to_formatted_string(&Locale::en), max_ambiguity_read.to_formatted_string(&Locale::en) From fed7a800267c006efe7971150d366c42e988a1e3 Mon Sep 17 00:00:00 2001 From: rob-p 
Date: Fri, 6 Dec 2024 17:35:38 -0500 Subject: [PATCH 13/16] update cargo dist --- .github/workflows/release.yml | 123 +++++++++++++++++++--------------- Cargo.lock | 14 ++-- Cargo.toml | 37 ++++++---- 3 files changed, 102 insertions(+), 72 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d271dfd..0e9bca8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,20 +1,21 @@ -# Copyright 2022-2023, axodotdev +# This file was autogenerated by dist: https://opensource.axo.dev/cargo-dist/ +# +# Copyright 2022-2024, axodotdev # SPDX-License-Identifier: MIT or Apache-2.0 # # CI that: # # * checks for a Git Tag that looks like a release -# * builds artifacts with cargo-dist (archives, installers, hashes) +# * builds artifacts with dist (archives, installers, hashes) # * uploads those artifacts to temporary workflow zip -# * on success, uploads the artifacts to a Github Release +# * on success, uploads the artifacts to a GitHub Release # -# Note that the Github Release will be created with a generated +# Note that the GitHub Release will be created with a generated # title/body based on your changelogs. name: Release - permissions: - contents: write + "contents": "write" # This task will run whenever you push a git tag that looks like a version # like "1.0.0", "v0.1.0-prerelease.1", "my-app/0.1.0", "releases/v1.0.0", etc. @@ -23,30 +24,30 @@ permissions: # must be a Cargo-style SemVer Version (must have at least major.minor.patch). # # If PACKAGE_NAME is specified, then the announcement will be for that -# package (erroring out if it doesn't have the given version or isn't cargo-dist-able). +# package (erroring out if it doesn't have the given version or isn't dist-able). # # If PACKAGE_NAME isn't specified, then the announcement will be for all -# (cargo-dist-able) packages in the workspace with that version (this mode is +# (dist-able) packages in the workspace with that version (this mode is # intended for workspaces with only one dist-able package, or with all dist-able # packages versioned/released in lockstep). # # If you push multiple tags at once, separate instances of this workflow will -# spin up, creating an independent announcement for each one. However Github +# spin up, creating an independent announcement for each one. However, GitHub # will hard limit this to 3 tags per commit, as it will assume more tags is a # mistake. # # If there's a prerelease-style suffix to the version, then the release(s) # will be marked as a prerelease. on: + pull_request: push: tags: - '**[0-9]+.[0-9]+.[0-9]+*' - pull_request: jobs: - # Run 'cargo dist plan' (or host) to determine what tasks we need to do + # Run 'dist plan' (or host) to determine what tasks we need to do plan: - runs-on: ubuntu-latest + runs-on: "ubuntu-20.04" outputs: val: ${{ steps.plan.outputs.manifest }} tag: ${{ !github.event.pull_request && github.ref_name || '' }} @@ -58,11 +59,16 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - name: Install cargo-dist + - name: Install dist # we specify bash to get pipefail; it guards against the `curl` command # failing. 
otherwise `sh` won't catch that `curl` returned non-0 shell: bash - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.11.1/cargo-dist-installer.sh | sh" + run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.25.1/cargo-dist-installer.sh | sh" + - name: Cache dist + uses: actions/upload-artifact@v4 + with: + name: cargo-dist-cache + path: ~/.cargo/bin/dist # sure would be cool if github gave us proper conditionals... # so here's a doubly-nested ternary-via-truthiness to try to provide the best possible # functionality based on whether this is a pull_request, and whether it's from a fork. @@ -70,8 +76,8 @@ jobs: # but also really annoying to build CI around when it needs secrets to work right.) - id: plan run: | - cargo dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json - echo "cargo dist ran successfully" + dist ${{ (!github.event.pull_request && format('host --steps=create --tag={0}', github.ref_name)) || 'plan' }} --output-format=json > plan-dist-manifest.json + echo "dist ran successfully" cat plan-dist-manifest.json echo "manifest=$(jq -c "." plan-dist-manifest.json)" >> "$GITHUB_OUTPUT" - name: "Upload dist-manifest.json" @@ -89,12 +95,12 @@ jobs: if: ${{ fromJson(needs.plan.outputs.val).ci.github.artifacts_matrix.include != null && (needs.plan.outputs.publishing == 'true' || fromJson(needs.plan.outputs.val).ci.github.pr_run_mode == 'upload') }} strategy: fail-fast: false - # Target platforms/runners are computed by cargo-dist in create-release. + # Target platforms/runners are computed by dist in create-release. # Each member of the matrix has the following arguments: # # - runner: the github runner - # - dist-args: cli flags to pass to cargo dist - # - install-dist: expression to run to install cargo-dist on the runner + # - dist-args: cli flags to pass to dist + # - install-dist: expression to run to install dist on the runner # # Typically there will be: # - 1 "global" task that builds universal installers @@ -105,11 +111,13 @@ jobs: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} BUILD_MANIFEST_NAME: target/distrib/${{ join(matrix.targets, '-') }}-dist-manifest.json steps: + - name: enable windows longpaths + run: | + git config --global core.longpaths true - uses: actions/checkout@v4 with: submodules: recursive - - uses: swatinem/rust-cache@v2 - - name: Install cargo-dist + - name: Install dist run: ${{ matrix.install_dist }} # Get the dist-manifest - name: Fetch local artifacts @@ -124,8 +132,8 @@ jobs: - name: Build artifacts run: | # Actually do builds and make zips and whatnot - cargo dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json - echo "cargo dist ran successfully" + dist build ${{ needs.plan.outputs.tag-flag }} --print=linkage --output-format=json ${{ matrix.dist_args }} > dist-manifest.json + echo "dist ran successfully" - id: cargo-dist name: Post-build # We force bash here just because github makes it really hard to get values up @@ -135,7 +143,7 @@ jobs: run: | # Parse out what we just built and upload it to scratch storage echo "paths<> "$GITHUB_OUTPUT" - jq --raw-output ".artifacts[]?.path | select( . 
!= null )" dist-manifest.json >> "$GITHUB_OUTPUT" + jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" cp dist-manifest.json "$BUILD_MANIFEST_NAME" @@ -160,9 +168,12 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - name: Install cargo-dist - shell: bash - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.11.1/cargo-dist-installer.sh | sh" + - name: Install cached dist + uses: actions/download-artifact@v4 + with: + name: cargo-dist-cache + path: ~/.cargo/bin/ + - run: chmod +x ~/.cargo/bin/dist # Get all the local artifacts for the global tasks to use (for e.g. checksums) - name: Fetch local artifacts uses: actions/download-artifact@v4 @@ -173,12 +184,12 @@ jobs: - id: cargo-dist shell: bash run: | - cargo dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json - echo "cargo dist ran successfully" + dist build ${{ needs.plan.outputs.tag-flag }} --output-format=json "--artifacts=global" > dist-manifest.json + echo "dist ran successfully" # Parse out what we just built and upload it to scratch storage echo "paths<> "$GITHUB_OUTPUT" - jq --raw-output ".artifacts[]?.path | select( . != null )" dist-manifest.json >> "$GITHUB_OUTPUT" + jq --raw-output ".upload_files[]" dist-manifest.json >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" cp dist-manifest.json "$BUILD_MANIFEST_NAME" @@ -206,8 +217,12 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - name: Install cargo-dist - run: "curl --proto '=https' --tlsv1.2 -LsSf https://github.com/axodotdev/cargo-dist/releases/download/v0.11.1/cargo-dist-installer.sh | sh" + - name: Install cached dist + uses: actions/download-artifact@v4 + with: + name: cargo-dist-cache + path: ~/.cargo/bin/ + - run: chmod +x ~/.cargo/bin/dist # Fetch artifacts from scratch-storage - name: Fetch artifacts uses: actions/download-artifact@v4 @@ -215,11 +230,10 @@ jobs: pattern: artifacts-* path: target/distrib/ merge-multiple: true - # This is a harmless no-op for Github Releases, hosting for that happens in "announce" - id: host shell: bash run: | - cargo dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json + dist host ${{ needs.plan.outputs.tag-flag }} --steps=upload --steps=release --output-format=json > dist-manifest.json echo "artifacts uploaded and released successfully" cat dist-manifest.json echo "manifest=$(jq -c "." 
dist-manifest.json)" >> "$GITHUB_OUTPUT" @@ -229,8 +243,29 @@ jobs: # Overwrite the previous copy name: artifacts-dist-manifest path: dist-manifest.json + # Create a GitHub Release while uploading all files to it + - name: "Download GitHub Artifacts" + uses: actions/download-artifact@v4 + with: + pattern: artifacts-* + path: artifacts + merge-multiple: true + - name: Cleanup + run: | + # Remove the granular manifests + rm -f artifacts/*-dist-manifest.json + - name: Create GitHub Release + env: + PRERELEASE_FLAG: "${{ fromJson(steps.host.outputs.manifest).announcement_is_prerelease && '--prerelease' || '' }}" + ANNOUNCEMENT_TITLE: "${{ fromJson(steps.host.outputs.manifest).announcement_title }}" + ANNOUNCEMENT_BODY: "${{ fromJson(steps.host.outputs.manifest).announcement_github_body }}" + RELEASE_COMMIT: "${{ github.sha }}" + run: | + # Write and read notes from a file to avoid quoting breaking things + echo "$ANNOUNCEMENT_BODY" > $RUNNER_TEMP/notes.txt + + gh release create "${{ needs.plan.outputs.tag }}" --target "$RELEASE_COMMIT" $PRERELEASE_FLAG --title "$ANNOUNCEMENT_TITLE" --notes-file "$RUNNER_TEMP/notes.txt" artifacts/* - # Create a Github Release while uploading all files to it announce: needs: - plan @@ -246,21 +281,3 @@ jobs: - uses: actions/checkout@v4 with: submodules: recursive - - name: "Download Github Artifacts" - uses: actions/download-artifact@v4 - with: - pattern: artifacts-* - path: artifacts - merge-multiple: true - - name: Cleanup - run: | - # Remove the granular manifests - rm -f artifacts/*-dist-manifest.json - - name: Create Github Release - uses: ncipollo/release-action@v1 - with: - tag: ${{ needs.plan.outputs.tag }} - name: ${{ fromJson(needs.host.outputs.val).announcement_title }} - body: ${{ fromJson(needs.host.outputs.val).announcement_github_body }} - prerelease: ${{ fromJson(needs.host.outputs.val).announcement_is_prerelease }} - artifacts: "artifacts/*" diff --git a/Cargo.lock b/Cargo.lock index 84cf4c0..72edcfa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -32,7 +32,7 @@ dependencies = [ [[package]] name = "alevin-fry" -version = "0.10.0" +version = "0.11.0" dependencies = [ "ahash", "anyhow", @@ -323,9 +323,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" dependencies = [ "clap_builder", "clap_derive", @@ -333,9 +333,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.21" +version = "4.5.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" dependencies = [ "anstream", "anstyle", @@ -358,9 +358,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "colorchoice" diff --git a/Cargo.toml b/Cargo.toml index 80b2d0e..0319f12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,13 @@ [package] -name = "alevin-fry" -version = "0.10.0" authors = [ - "Avi Srivastava ", - "Hirak Sarkar ", - "Dongze He ", - "Mohsen Zakeri ", + "Avi Srivastava ", + 
"Hirak Sarkar ", + "Dongze He .zakeri@gmail.com>", + "Noor Pratap Singh ", "Rob Patro ", ] +name = "alevin-fry" +version = "0.11.0" edition = "2021" description = "A suite of tools for the rapid, accurate and memory-frugal processing single-cell and single-nucleus sequencing data." license-file = "LICENSE" @@ -27,6 +27,7 @@ keywords = [ "single-cell", "preprocessing", "RNA-seq", + "ATAC-seq", "single-nucleus", "RNA-velocity", ] @@ -71,7 +72,15 @@ sce = { git = "https://github.com/parazodiac/SingleCellExperiment", branch = "de # no shenanigans; clap makes breaking "fixes" too often to allow variability # in the version different from what we tested with -clap = { version = "4.5.9", features = ["derive", "wrap_help", "cargo", "help", "usage", "string", "error-context"] } +clap = { version = "4.5.23", features = [ + "derive", + "wrap_help", + "cargo", + "help", + "usage", + "string", + "error-context", +] } noodles = { version = "0.85.0", features = ["bam", "bgzf", "sam"] } dashmap = { version = "6.1.0", features = ["serde", "inline"] } @@ -88,18 +97,22 @@ opt-level = 3 inherits = "release" lto = "thin" -# Config for 'cargo dist' +# Config for 'dist' [workspace.metadata.dist] -# The preferred cargo-dist version to use in CI (Cargo.toml SemVer syntax) -cargo-dist-version = "0.11.1" +# The preferred dist version to use in CI (Cargo.toml SemVer syntax) +cargo-dist-version = "0.25.1" # CI backends to support -ci = ["github"] +ci = "github" # The installers to generate for each app installers = ["shell"] # Target platforms to build apps for (Rust target-triple syntax) targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu"] -# Publish jobs to run in CI +# Which actions to run on pull requests pr-run-mode = "plan" +# Path that installers should place binaries in +install-path = "CARGO_HOME" +# Whether to install an updater program +install-updater = true [workspace.metadata.dist.github-custom-runners] aarch64-apple-darwin = "macos-14" From 0f53de058be09a87e7fc62b8a67db3128932f55e Mon Sep 17 00:00:00 2001 From: rob-p Date: Fri, 6 Dec 2024 17:35:57 -0500 Subject: [PATCH 14/16] update deps --- Cargo.lock | 57 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72edcfa..18cf487 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -717,9 +717,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" +checksum = "06d7a061b7feb8a4b233a4d90280d13e0965c4e0181566e9ad61af98e210ca9d" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -730,9 +730,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" +checksum = "0029bdee2a94a6c4393a86f7e6921c90f234218fa4f2154bc001c92bc51e8bf5" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -741,9 +741,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" +checksum = 
"440a2398a08def518ff962b69e7146246c53bad8090e2b75d95fd5a469338958" dependencies = [ "lexical-util", "static_assertions", @@ -751,18 +751,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" +checksum = "3100209587e35b13881068ce5a41241b112e0500b4d847ba16be172829c112ff" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" +checksum = "27adf08e2f91ff44ab54bbac0c4579303f0865730870f91b58c044df821f114c" dependencies = [ "lexical-util", "lexical-write-integer", @@ -771,9 +771,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" +checksum = "8b4e5d27d742da13f013765f849efc0c4b6173e0e64404546475eb5ee0931e2c" dependencies = [ "lexical-util", "static_assertions", @@ -954,9 +954,9 @@ dependencies = [ [[package]] name = "needletail" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29a3c5015d6985f33318d154fa0c41315eb2e7df29432c844c74a83434bfe21" +checksum = "de3de09e373770238e3d30eb1a9f09f4754134d0ef354d0570bc1203d2517257" dependencies = [ "buffer-redux", "bytecount", @@ -964,6 +964,7 @@ dependencies = [ "flate2", "liblzma", "memchr", + "zstd", ] [[package]] @@ -1982,3 +1983,31 @@ dependencies = [ "quote", "syn 2.0.90", ] + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.13+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" +dependencies = [ + "cc", + "pkg-config", +] From c73059695aae0d715253bc3bfa44f3ee9676f31e Mon Sep 17 00:00:00 2001 From: rob-p Date: Fri, 6 Dec 2024 17:44:09 -0500 Subject: [PATCH 15/16] add sanitize --- .github/workflows/sanitize-cargo.yml | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 .github/workflows/sanitize-cargo.yml diff --git a/.github/workflows/sanitize-cargo.yml b/.github/workflows/sanitize-cargo.yml new file mode 100644 index 0000000..b9722db --- /dev/null +++ b/.github/workflows/sanitize-cargo.yml @@ -0,0 +1,38 @@ +name: Sanitize Cargo +on: + push: + branches: + - main + +permissions: + "contents": "write" + +jobs: + sanitize_cargo_file: + if: "contains(github.event.head_commit.message, '[do_tag]')" + runs-on: ubuntu-latest + # This is optional; it exposes the plan to your job as an environment variable + #env: + # PLAN: ${{ inputs.plan }} + steps: + - uses: actions/checkout@v4 + with: + token: ${{ secrets.PAT_TOK }} + - uses: dtolnay/rust-toolchain@stable + - name: Install cargo-sanitize + run: | + cargo install cargo-sanitize 
+ - name: Sanitize Cargo.toml file + run: | + cp Cargo.toml Cargo.toml.cs_orig + cargo-sanitize -i Cargo.toml.cs_orig -o Cargo.toml + - name: Tag and push new commit + run: | + export VERSION_TAG=`cargo read-manifest | jq ".version" | tr -d '"'` + git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git add Cargo.toml.cs_orig + git add Cargo.toml + git commit -m "create sanitized release" + git tag --force -a v${VERSION_TAG} -m "version ${VERSION_TAG}" + git push --force origin v${VERSION_TAG} From 091c2bff0a0795f68ae15ac25bec9029be70ccdb Mon Sep 17 00:00:00 2001 From: rob-p Date: Fri, 6 Dec 2024 18:36:29 -0500 Subject: [PATCH 16/16] bump versions --- Cargo.lock | 4 ++-- Cargo.toml | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18cf487..01d1bf8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -823,8 +823,8 @@ dependencies = [ [[package]] name = "libradicl" -version = "0.9.1" -source = "git+https://github.com/COMBINE-lab/libradicl?branch=develop#5dea2f8f2fc15bf79d3f873066c9bf29595c7820" +version = "0.10.0" +source = "git+https://github.com/COMBINE-lab/libradicl?branch=develop#07d42bedf341f43d9388e27dab3eb370589389ab" dependencies = [ "ahash", "anyhow", diff --git a/Cargo.toml b/Cargo.toml index 0319f12..122a1be 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,14 +29,13 @@ keywords = [ "RNA-seq", "ATAC-seq", "single-nucleus", - "RNA-velocity", ] categories = ["command-line-utilities", "science"] [dependencies] # for local development, look in the libradicl git repository # but when published, pull the specified version -libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.9.1" } +libradicl = { git = "https://github.com/COMBINE-lab/libradicl", branch = "develop", version = "0.10.0" } anyhow = "1.0.86" arrayvec = "0.7.4" ahash = "0.8.11" @@ -106,7 +105,11 @@ ci = "github" # The installers to generate for each app installers = ["shell"] # Target platforms to build apps for (Rust target-triple syntax) -targets = ["aarch64-apple-darwin", "x86_64-apple-darwin", "x86_64-unknown-linux-gnu"] +targets = [ + "aarch64-apple-darwin", + "x86_64-apple-darwin", + "x86_64-unknown-linux-gnu", +] # Which actions to run on pull requests pr-run-mode = "plan" # Path that installers should place binaries in
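
Taken together, patches 04 through 11 replace the serial chunk-reading loop in generate-permit-list with a producer/consumer design: a ParallelRadReader parses the RAD file and fills a shared queue of meta-chunks, while scoped worker threads drain that queue, folding barcode observations into a concurrent DashMap and per-bin hits into a slice of AtomicU64 counters. The following is a minimal, self-contained sketch of that pattern, not code from this repository: it uses the same crates (dashmap, crossbeam-queue), but synthetic u64 "barcodes" and illustrative names stand in for the RAD reader, the record types, and the real queue capacities and thread counts.

use crossbeam_queue::ArrayQueue;
use dashmap::DashMap;
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};

fn main() {
    // shared state updated concurrently by all workers
    let counts: DashMap<u64, u64> = DashMap::new(); // per-barcode counts
    let bins: Vec<AtomicU64> = (0..4).map(|_| AtomicU64::new(0)).collect(); // per-bin hits
    let queue: ArrayQueue<u64> = ArrayQueue::new(1024); // stands in for the meta-chunk queue
    let done = AtomicBool::new(false); // stands in for rad_reader.is_done()

    std::thread::scope(|s| {
        for _ in 0..4 {
            s.spawn(|| {
                // keep draining until the producer has finished AND the queue is empty,
                // so every pushed item is counted exactly once
                while !done.load(Ordering::SeqCst) || !queue.is_empty() {
                    while let Some(bc) = queue.pop() {
                        *counts.entry(bc).or_insert(0) += 1;
                        bins[(bc % 4) as usize].fetch_add(1, Ordering::Relaxed);
                    }
                }
            });
        }
        // the producer runs on the scope's own thread, pushing synthetic "barcodes"
        for i in 0..10_000u64 {
            let bc = i % 97;
            while queue.push(bc).is_err() {} // spin briefly if the queue is full
        }
        done.store(true, Ordering::SeqCst);
    });

    let total: u64 = bins.iter().map(|b| b.load(Ordering::Relaxed)).sum();
    assert_eq!(total, 10_000);
    println!("{} distinct barcodes, {} total hits", counts.len(), total);
}

Because thread::scope only returns once every worker has joined, the DashMap and the atomic bins can be read (or, as in patch 08, unwrapped out of their Arc with into_inner) without further synchronization, which is exactly what lets the series hand a plain map and a plain Vec of bin counts to the downstream, single-threaded processing code.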