From 4fad802c1c6d8a70ce423ea564f4bfc1ceb32d04 Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Sat, 6 Jul 2024 14:00:15 -0400 Subject: [PATCH 1/5] updated grangers --- Cargo.toml | 8 +- roers/src/lib/lib.rs | 782 ------------------------------------------- src/lib/lib.rs | 24 +- 3 files changed, 18 insertions(+), 796 deletions(-) delete mode 100644 roers/src/lib/lib.rs diff --git a/Cargo.toml b/Cargo.toml index b5c2751..f609797 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,14 +40,14 @@ path = "src/lib/lib.rs" [dependencies] anyhow = "1.0.75" clap = { version = "4.4.7", features = ["derive", "wrap_help","cargo"] } -grangers = { git = "https://github.com/COMBINE-lab/grangers.git", branch="dev", version = "0.4.0" } -polars = { version = "0.38.2", features = ["lazy","dataframe_arithmetic", "checked_arithmetic","rows","dtype-struct", "dtype-categorical", "list_eval","concat_str", "strings"]} +grangers = { git = "https://github.com/COMBINE-lab/grangers.git", branch="add_rustdoc", version = "0.4.0" } +polars = { version = "0.41.3", features = ["lazy","dataframe_arithmetic", "checked_arithmetic","rows","dtype-struct", "dtype-categorical", "list_eval","concat_str", "strings"]} peak_alloc = "0.2.0" tracing = "0.1.40" tracing-subscriber = { version = "0.3.17", features = ["env-filter"] } -noodles = { version = "0.65.0", features = ["gtf","gff","fasta", "core"] } +noodles = { version = "0.77.0", features = ["gtf","gff","fasta", "core"] } serde = {version = "1.0.190", features = ["derive"]} serde_json = "1.0.107" -itertools = "0.12.1" +itertools = "0.13.0" oomfi = "0.1.2" xxhash-rust = { version = "0.8.7", features = ["xxh3"] } diff --git a/roers/src/lib/lib.rs b/roers/src/lib/lib.rs deleted file mode 100644 index 84300d9..0000000 --- a/roers/src/lib/lib.rs +++ /dev/null @@ -1,782 +0,0 @@ -use anyhow::Context; -use clap::builder::{PossibleValuesParser, TypedValueParser}; -use grangers::{options, Grangers}; -use itertools::Itertools; -use polars::lazy::dsl::concat_str; -use 
polars::prelude::*; -use serde::Serialize; -use serde_json::json; -use std::collections::{hash_map::Entry, HashMap}; -use std::io::{BufWriter, Write}; -use std::ops::Add; -use std::ops::Not; -use std::path::{Path, PathBuf}; -use std::time::{Duration, Instant}; -use tracing::{debug, info, warn}; -use xxhash_rust::xxh3::xxh3_128_with_seed; - -//#[global_allocator] -//static PEAK_ALLOC: PeakAlloc = PeakAlloc; - -use clap::Args; - -/// The type of sequences we might include in the output reference FASTA file -/// to map against for quantification with -/// alevin-fry. -#[derive(Clone, Debug, Serialize)] -pub enum AugType { - /// The sequence of Spliced transcripts - Transcript, - /// The sequence of introns of transcripts, merged by gene - Intronic, - /// The sequence of gene bodies (from the first to the last base of each gene) - GeneBody, - /// The sequence of transcript bodies (from the first to the last base of each transcript) - TranscriptBody, -} - -impl From<&str> for AugType { - fn from(s: &str) -> Self { - match s { - // "transcript" | "t" => Self::Transcript, - "intronic" | "i" => Self::Intronic, - "gene-body" | "g" => Self::GeneBody, - "transcript-body" | "t" => Self::TranscriptBody, - _ => panic!("Invalid sequence type"), - } - } -} - -impl AsRef for AugType { - fn as_ref(&self) -> &str { - match self { - Self::Transcript => "transcript", - Self::Intronic => "intronic", - Self::GeneBody => "gene-body", - Self::TranscriptBody => "transcript-body", - } - } -} - -#[derive(Args, Clone, Debug, Serialize)] -pub struct AugRefOpts { - /// The path to a genome fasta file. - pub genome: PathBuf, - /// The path to a gene annotation gtf/gff3 file. - pub genes: PathBuf, - /// The path to the output directory (will be created if it doesn't exist). - pub out_dir: PathBuf, - - /// Comma separated types of augmented sequences to include in the output FASTA file on top of spliced transcripts. 
- /// Available options are `intronic` (or `i` for short), `gene-body` (or `g`), and `transcript-body` (or `t`). - #[arg( - short, - long, - display_order = 1, - value_delimiter = ',', - requires = "genome", - value_parser = PossibleValuesParser::new(&["i", "g", "t", "intronic", "gene-body", "transcript-body"]).map(|s| AugType::from(s.as_str())), - hide_possible_values = true, - )] - pub aug_type: Option>, - - /// A flag of not including spliced transcripts in the output FASTA file. (usually there should be a good reason to do so) - #[arg(long, display_order = 3)] - pub no_transcript: bool, - - /// The read length of the single-cell experiment being processed (determines flank size). - #[arg( - short, - long, - help_heading = "Intronic Sequence Options", - display_order = 1, - default_value_t = 91, - requires_if("intronic", "aug_type") - )] - pub read_length: i64, - - /// Determines the length of sequence subtracted from the read length to obtain the flank length. - #[arg( - long, - help_heading = "Intronic Sequence Options", - display_order = 2, - default_value_t = 5, - requires_if("intronic", "aug_type") - )] - pub flank_trim_length: i64, - - /// Indicates whether flank lengths will be considered when merging introns. - #[arg( - long, - help_heading = "Intronic Sequence Options", - display_order = 3, - requires_if("intronic", "aug_type") - )] - pub no_flanking_merge: bool, - - /// The file name prefix of the generated output files. - #[arg(short = 'p', long, default_value = "roers_ref", display_order = 2)] - pub filename_prefix: String, - - /// Indicates whether identical sequences will be deduplicated. - #[arg(long = "dedup", display_order = 1)] - pub dedup_seqs: bool, - - /// The path to an extra spliced sequence fasta file. - #[arg(long, help_heading = "Extra Spliced Sequence File", display_order = 3)] - pub extra_spliced: Option, - - /// The path to an extra unspliced sequence fasta file. 
- #[arg( - // short, - long, - help_heading = "Extra Unspliced Sequence File", - display_order = 3, - )] - pub extra_unspliced: Option, - - /// Denotes that the input annotation is a GFF3 (instead of GTF) file - #[arg(long = "gff3", display_order = 4)] - pub gff3: bool, -} - -type HashType = u128; - -struct SeqDedup { - seq_hs: HashMap, - collisions: Vec<(String, String)>, - num_seen: usize, - num_dup: usize, -} - -impl SeqDedup { - fn new() -> Self { - Self { - seq_hs: HashMap::::new(), - collisions: vec![], - num_seen: 0, - num_dup: 0, - } - } - - fn callback(&mut self, rec: &noodles::fasta::Record) -> bool { - let sequence_rec = rec - .sequence() - .get(..) - .unwrap_or_else(|| panic!("Failed getting sequence for record {}", rec.name())); - let sequence_hash = xxh3_128_with_seed(sequence_rec, 271828); - self.num_seen += 1; - - match self.seq_hs.entry(sequence_hash) { - // if we have already seen this key then add this to the list of collisions - Entry::Occupied(e) => { - self.collisions - .push((e.get().to_owned(), rec.name().to_owned())); - self.num_dup += 1; - false - } - // otherwise, associate this sequence with the given name, and write the - // sequence to file - Entry::Vacant(ve) => { - ve.insert(rec.name().to_owned()); - true - } - } - } - - fn get_duplicate_ids(&self) -> Vec<&str> { - self.collisions.iter().map(|x| x.1.as_ref()).collect() - } - - fn write_duplicate_info>(&mut self, out_dir: P) -> anyhow::Result<()> { - let dup_path = out_dir.as_ref().join("duplicate_entries.tsv"); - - info!( - "Observed {} total sequences during reference generation, of which {} \ - were exact sequence duplicates of some other sequence.", - self.num_seen, self.num_dup - ); - info!( - "Duplicate sequences will not be written to the reference fasta file, \ - or the t2g / t2g_3col file, but they are listed in {:?}.", - dup_path.as_os_str() - ); - - let dupfile = std::fs::OpenOptions::new() - .write(true) - .truncate(true) - .create(true) - .open(dup_path.clone()) - 
.with_context(|| { - format!("Could not open the output file {:?}", dup_path.as_os_str()) - })?; - - let mut dup_writer = BufWriter::new(dupfile); - - // sort so all of the values with the same - // retained key are adjacent - self.collisions.sort(); - - writeln!(dup_writer, "RetainedRef\tDuplicateRef")?; - - for (key, group) in &self.collisions.iter().group_by(|&x| &x.0) { - for d in group { - writeln!(dup_writer, "{}\t{}", key, d.1)?; - } - } - Ok(()) - } -} - -#[allow(clippy::uninlined_format_args)] -pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { - // clean this up - let genome_path: PathBuf = aug_ref_opts.genome; - let gtf_path: PathBuf = aug_ref_opts.genes; - let out_dir: PathBuf = aug_ref_opts.out_dir; - let aug_type: Option> = aug_ref_opts.aug_type; - let no_transcript: bool = aug_ref_opts.no_transcript; - let read_length: i64 = aug_ref_opts.read_length; - let flank_trim_length: i64 = aug_ref_opts.flank_trim_length; - let no_flanking_merge: bool = aug_ref_opts.no_flanking_merge; - let filename_prefix: String = aug_ref_opts.filename_prefix; - let dedup_seqs: bool = aug_ref_opts.dedup_seqs; - let extra_spliced: Option = aug_ref_opts.extra_spliced; - let extra_unspliced: Option = aug_ref_opts.extra_unspliced; - let gff3: bool = aug_ref_opts.gff3; - - // if nothing to build, then exit - if no_transcript & aug_type.is_none() { - anyhow::bail!("Nothing to build: --no-transcript is set and --aug-type is not provided. Cannot proceed"); - } - - // check if extra_spliced and extra_unspliced is valid - if let Some(extra_spliced) = &extra_spliced { - if !extra_spliced.exists() { - anyhow::bail!("The extra spliced sequence file does not exist. Cannot proceed"); - } - } - - if let Some(extra_unspliced) = &extra_unspliced { - if !extra_unspliced.exists() { - anyhow::bail!("The extra unspliced sequence file does not exist. 
Cannot proceed"); - } - } - - // create the folder if it doesn't exist - std::fs::create_dir_all(&out_dir)?; - // specify output file names - let out_gid2name = out_dir.join("gene_id_to_name.tsv"); - let out_t2g_name = out_dir.join(if aug_type.is_some() { - "t2g_3col.tsv" - } else { - "t2g.tsv" - }); - - if flank_trim_length > read_length { - anyhow::bail!( - "The read length: {} must be >= the flank trim length: {}", - read_length, - flank_trim_length - ); - } - let flank_length = read_length - flank_trim_length; - // let filename_prefix = format!("{}_fl{}.fa", filename_prefix, flank_length); - let filename_prefix = format!("{}.fa", filename_prefix); - let out_fa_path = out_dir.join(&filename_prefix); - - // we create the writer - // we will not append because this should be the start of the file - let fa_out_file = std::fs::OpenOptions::new() - .write(true) - .truncate(true) - .create(true) - .open(&out_fa_path) - .with_context(|| { - format!( - "Could not open the output file {:?}", - out_fa_path.as_os_str() - ) - })?; - - let mut sd = SeqDedup::new(); - let mut sd_callback = if dedup_seqs { - Some(|r: &noodles::fasta::Record| -> bool { sd.callback(r) }) - } else { - None - }; - - // 1. we read the gtf/gff3 file as grangers. This will make sure that the eight fields are there. 
- let start = Instant::now(); - let (gr, file_type) = if gff3 { - (Grangers::from_gff(gtf_path.as_path(), true)?, "GFF3") - } else { - (Grangers::from_gtf(gtf_path.as_path(), true)?, "GTF") - }; - - let duration: Duration = start.elapsed(); - debug!("Built the Grangers object in {:?}", duration); - info!( - "Built the Grangers object for {:?} records", - gr.df().height() - ); - - // we get the exon df and validate it - // this will make sure that each exon has a valid transcript ID, and the exon numbers are valid - let mut exon_gr = gr.exons(None, true)?; - - let fc = exon_gr.field_columns(); - let df = exon_gr.df(); - - // we then make sure that the gene_id and gene_name fields are not both missing - if fc.gene_id().is_none() && fc.gene_name().is_none() { - anyhow::bail!( - "The input {} file does not have a valid gene_id or gene_name field. Cannot proceed", - file_type - ); - } else if fc.gene_id().is_none() { - warn!( - "The input {} file does not have a valid gene_id field. Roers will use gene_name as gene_id", - file_type - ); - // we get gene name and rename it to gene_id - let mut gene_id = df.column(fc.gene_name().unwrap())?.clone(); - gene_id.rename("gene_id"); - // we update the field_columns - // fc.update("gene_id", "gene_id")?; - // push to the df - exon_gr.update_column(gene_id, Some("gene_id"))?; - // df.with_column(gene_id)?; - } else if fc.gene_name().is_none() { - warn!( - "The input {} file does not have a valid gene_name field. 
Roers will use gene_id as gene_name.", - file_type - ); - // we get gene id and rename it to gene_name - let mut gene_name = df.column(fc.gene_id().unwrap())?.clone(); - gene_name.rename("gene_name"); - // fc.update("gene_name", "gene_name")?; - // push to the df - exon_gr.update_column(gene_name, Some("gene_name"))?; - // df.with_column(gene_name)?; - } - - // TODO: Getting the string and then making the &str are annoying, maybe we should have a better way to do this - // exon_gr.field_columns = fc; - let gene_id_s = exon_gr.get_column_name("gene_id", false)?; - let gene_id = gene_id_s.as_str(); - let gene_name_s = exon_gr.get_column_name("gene_name", false)?; - let gene_name = gene_name_s.as_str(); - let transcript_id_s = exon_gr.get_column_name("transcript_id", false)?; - let transcript_id = transcript_id_s.as_str(); - - // Next, we fill the missing gene_id and gene_name fields - if exon_gr.any_nulls(&[gene_id, gene_name], false, false)? { - warn!("Found missing gene_id and/or gene_name; Imputing. If both missing, will impute using transcript_id; Otherwise, will impute using the existing one."); - let three_col_df = exon_gr - .df() - .select([transcript_id, gene_id, gene_name])? - .lazy() - .with_columns([ - when(col(gene_id).is_null()) - .then( - when(col(gene_name).is_null()) - .then(col(transcript_id)) - .otherwise(col(gene_name)), - ) - .otherwise(col(gene_id)) - .alias(gene_id), - when(col(gene_name).is_null()) - .then( - when(col(gene_id).is_null()) - .then(col(transcript_id)) - .otherwise(col(gene_id)), - ) - .otherwise(col(gene_name)) - .alias(gene_name), - ]) - .collect()?; - - // we update the corresponding columns - exon_gr.update_column(three_col_df.column(gene_id)?.to_owned(), None)?; - exon_gr.update_column(three_col_df.column(gene_name)?.to_owned(), None)?; - } - - // to this point, we have a valid exon df to work with. 
- info!( - "Proceed {} exon records from {} transcripts", - exon_gr.df().height(), - exon_gr.df().column("transcript_id")?.n_unique()? - ); - - // Next, we get the gene id to name mapping - let mut gene_id_to_name = - exon_gr - .df() - .select([gene_id, gene_name])? - .unique(None, UniqueKeepStrategy::Any, None)?; - - // also, the t2g mapping for spliced transcripts - let mut t2g_map = exon_gr.df().select([transcript_id, gene_id])?.unique( - None, - UniqueKeepStrategy::Any, - None, - )?; - t2g_map.rename(transcript_id, "t2g_tx_id")?; - - // if we have augmented sequences, we need three columns - if aug_type.is_some() { - t2g_map.with_column(Series::new("splice_status", vec!["S"; t2g_map.height()]))?; - } - - // Next, we write the transcript sequences to file if asked - if !no_transcript { - // Next, we write the transcript seuqences - exon_gr.write_transcript_sequences_with_filter( - &genome_path, - &fa_out_file, - None, - true, - &mut sd_callback, - )?; - } - - // Next, we write the augmented sequences - if let Some(aug_type) = &aug_type { - for seq_typ in aug_type { - match seq_typ { - AugType::Intronic => { - // Then, we get the introns - let mut intron_gr = exon_gr.introns(None, None, None, true)?; - - info!("Processing {} intronic records", intron_gr.df().height(),); - - // no_flanking_merge decides when the order of merge and extend - // if no_flanking_merge is true, we merge first, then extend - if no_flanking_merge { - // Then, we merge the overlapping introns - intron_gr = intron_gr.merge( - &[intron_gr.get_column_name(gene_id, false)?], - false, - None, - None, - )?; - } - - // add flanking end to both side of each intron - intron_gr.extend(flank_length, &options::ExtendOption::Both, false)?; - - // if no_flanking_merge is false, we merge after extend - if !no_flanking_merge { - // Then, we merge the overlapping introns - intron_gr = intron_gr.merge( - &[intron_gr.get_column_name(gene_id, false)?], - false, - None, - None, - )?; - } - - // Then, we give a 
unique id to each intron - intron_gr.add_order(Some(&[gene_id]), "intron_number", Some(1), true)?; - intron_gr.df = intron_gr - .df - .lazy() - .with_column( - concat_str([col(gene_id), col("intron_number")], "-I") - .alias("t2g_tx_id"), - ) - .sort(gene_id, Default::default()) - .collect()?; - - intron_gr.write_sequences_with_filter( - &genome_path, - &fa_out_file, - false, - Some("t2g_tx_id"), - options::OOBOption::Truncate, - &mut sd_callback, - )?; - - // we need to update the t2g mapping for introns - let mut intron_t2g = intron_gr.df().select(["t2g_tx_id", gene_id])?.unique( - None, - UniqueKeepStrategy::Any, - None, - )?; - - // if we are here, we need to add a column for splice_status cuz it is an augmented reference - intron_t2g.with_column(Series::new( - "splice_status", - vec!["U"; intron_t2g.height()], - ))?; - - // we extend the t2g mapping for introns - t2g_map.extend(&intron_t2g)?; - } - AugType::GeneBody => { - // first we get the range (gene body) of each gene - let mut gene_gr = exon_gr.genes(None, true)?; - - // we append a -G to each sequence here to mark that they are gene-body seuqences - gene_gr.df = gene_gr - .df - .lazy() - .sort(gene_id, Default::default()) - .with_column(col(gene_id).add(lit("-G")).alias("t2g_tx_id")) - .collect()?; - - // say something - info!("Proceed {} gene body records", gene_gr.df().height(),); - - gene_gr.write_sequences_with_filter( - &genome_path, - &fa_out_file, - false, - Some("t2g_tx_id"), - options::OOBOption::Truncate, - &mut sd_callback, - )?; - - // we need to update the t2g mapping for genes - let mut gene_t2g = gene_gr.df().select(["t2g_tx_id", gene_id])?.unique( - None, - UniqueKeepStrategy::Any, - None, - )?; - - // add the splice status column - gene_t2g - .with_column(Series::new("splice_status", vec!["U"; gene_t2g.height()]))?; - - // extend t2g_map for genes - t2g_map.extend(&gene_t2g)?; - } - AugType::TranscriptBody => { - // we get the range (transcript body) of each transcript - let mut tx_gr = 
exon_gr.transcripts(None, true)?; - - // Then we append a -T to mark the sequence as transcript body - tx_gr.df = tx_gr - .df - .lazy() - .with_column(col(gene_id).add(lit("-T")).alias("t2g_tx_id")) - .collect()?; - - // say something - info!("Proceed {} transcript body records", tx_gr.df().height(),); - - tx_gr.write_sequences_with_filter( - &genome_path, - &fa_out_file, - false, - Some(gene_id), - options::OOBOption::Truncate, - &mut sd_callback, - )?; - - // we need to update the t2g mapping for genes - let mut tx_t2g = tx_gr.df().select(["t2g_tx_id", gene_id])?.unique( - None, - UniqueKeepStrategy::Any, - None, - )?; - tx_t2g.with_column(Series::new("splice_status", vec!["U"; tx_t2g.height()]))?; - t2g_map.extend(&tx_t2g)?; - } - _ => { - // todo, this is to avoid initalization error - // we probably want to set this as an empty grangers - // object here. - anyhow::bail!("Invalid sequence type"); - } - } - } - } - - // if there are extra spliced sequences, we include them in if dedup is set, otherwise we write them to the file - if let Some(path) = &extra_spliced { - // create extra file reader - let mut reader = std::fs::File::open(path) - .map(std::io::BufReader::new) - .map(noodles::fasta::Reader::new)?; - - // we crate the writer, and write if not dedup - let mut writer = noodles::fasta::Writer::new(&fa_out_file); - - // if dedup, we push the records into the seq vector - // otherwise, we write the records to the output file - let mut names = Vec::new(); - for result in reader.records() { - let record = result?; - - let write_record = if let Some(ref mut dup_filt) = sd_callback { - dup_filt(&record) - } else { - true - }; - - // TODO : this will be removed if we are deduplicating - // sequences, but we push it here unconditionally. The - // current behavior is not wrong, but may be unnecessary. 
- names.push(record.name().to_owned().clone()); - - if write_record { - writer.write_record(&record).with_context(|| { - format!( - "Could not write the sequence of extra spliced sequence {} to the output file", - record.name() - ) - })?; - } - } - - // extend t2g_map for the custom spliced targets - let mut extra_t2g = df!( - "t2g_tx_id" => &names, - "gene_id" => &names, - )?; - - if aug_type.is_some() { - // if we are here, we need to add a column for splice_status cuz it is an augmented reference - extra_t2g.with_column(Series::new("splice_status", vec!["S"; extra_t2g.height()]))?; - } - - t2g_map.extend(&extra_t2g)?; - - // extend gene_id_to_name for the custom spliced targets - gene_id_to_name.extend(&df!(gene_id => &names, gene_name => &names)?)?; - }; - - if let Some(path) = &extra_unspliced { - // create extra file reader - let mut reader = std::fs::File::open(path) - .map(std::io::BufReader::new) - .map(noodles::fasta::Reader::new)?; - - let mut writer = noodles::fasta::Writer::new(&fa_out_file); - - let mut names = Vec::new(); - for result in reader.records() { - let record = result?; - - let write_record = if let Some(ref mut dup_filt) = sd_callback { - dup_filt(&record) - } else { - true - }; - - // TODO : this will be removed if we are deduplicating - // sequences, but we push it here unconditionally. The - // current behavior is not wrong, but may be unnecessary. 
- names.push(record.name().to_owned().clone()); - - if write_record { - writer.write_record(&record).with_context(|| { - format!( - "Could not write the sequence of extra spliced sequence {} to the output file", - record.name() - ) - })?; - } - } - - // extend t2g_map for the custom spliced targets - let mut extra_t2g = df!( - "t2g_tx_id" => &names, - "gene_id" => &names, - )?; - - if aug_type.is_some() { - // if we are here, we need to add a column for splice_status cuz it is an augmented reference - extra_t2g.with_column(Series::new("splice_status", vec!["U"; extra_t2g.height()]))?; - } - - t2g_map.extend(&extra_t2g)?; - - // extend gene_id_to_name for the custom spliced targets - gene_id_to_name.extend(&df!(gene_id => &names, gene_name => &names)?)?; - }; - - // at this point, the fasta file should be ready - // if we are doing deduplication, we need to write out - // the duplicate entry information - if dedup_seqs { - sd.write_duplicate_info(&out_dir)?; - } - - // Till this point, we are done with the fasta file - // we need to write the t2g and gene_id_to_name files - - // if we are removing duplicates, remove them from t2g here - if dedup_seqs { - // the column with the keys corresponding to the - // sequences we've written to the output reference - // fasta file. - let column = t2g_map.column("t2g_tx_id")?; - // get the list of duplicated sequences we have - // collected so far - let dups = sd.get_duplicate_ids(); - // mask out the t2g rows that are keyed by the duplicates - // and then *negate* this mask (since we wish to keep everything - // that is *not* a duplicate). - let mask = is_in(&column, &Series::new("values", dups))?; - let mask = column.is_in(&Series::new("values", dups))?; - // replace the t2g_map dataframe with the one that has the - // duplicate mappings removed. 
- t2g_map = t2g_map.filter(&mask.not())?; - } - - let mut file = std::fs::File::create(&out_t2g_name)?; - CsvWriter::new(&mut file) - .has_header(false) - .with_delimiter(b'\t') - .finish(&mut t2g_map)?; - - let mut file = std::fs::File::create(&out_gid2name)?; - CsvWriter::new(&mut file) - .has_header(false) - .with_delimiter(b'\t') - .finish(&mut gene_id_to_name)?; - - let info_file = out_dir.join("roers_make-ref.json"); - - let v = clap::crate_version!(); - - let index_info = json!({ - "command" : "roers makeref", - "roers_version" : v, - "output_fasta": out_fa_path, - "output_t2g": out_t2g_name, - "output_gid2name": out_gid2name, - "args" : { - "genome" : genome_path, - "genes" : gtf_path, - "out_dir" : out_dir, - "aug_type" : if let Some(at) = &aug_type { - at.iter().map(|x| x.as_ref()).collect::>() - } else { - vec![] - }, - "no_transcript" : no_transcript, - "read_length" : read_length, - "flank_trim_length" : flank_trim_length, - "no_flanking_merge" : no_flanking_merge, - "filename_prefix" : filename_prefix, - "dedup_seqs" : dedup_seqs, - "extra_spliced" : extra_spliced, - "extra_unspliced" : extra_unspliced, - "gff3" : gff3, - } - }); - - std::fs::write( - &info_file, - serde_json::to_string_pretty(&index_info).unwrap(), - ) - .with_context(|| format!("could not write {}", info_file.display()))?; - - info!("Done!"); - - Ok(()) -} diff --git a/src/lib/lib.rs b/src/lib/lib.rs index 0d852f4..63f5ce6 100644 --- a/src/lib/lib.rs +++ b/src/lib/lib.rs @@ -160,7 +160,8 @@ impl SeqDedup { } fn callback(&mut self, rec: &noodles::fasta::Record) -> bool { - let record_name = std::str::from_utf8(rec.name()).expect(format!("Failed getting name for record {:?}", rec).as_str()); + let record_name = std::str::from_utf8(rec.name()) + .unwrap_or_else(|_| panic!("Failed getting name for record {:?}", rec)); let sequence_rec = rec .sequence() .get(..) 
@@ -220,7 +221,7 @@ impl SeqDedup { writeln!(dup_writer, "RetainedRef\tDuplicateRef")?; - for (key, group) in &self.collisions.iter().group_by(|&x| &x.0) { + for (key, group) in &self.collisions.iter().chunk_by(|&x| &x.0) { for d in group { writeln!(dup_writer, "{}\t{}", key, d.1)?; } @@ -324,7 +325,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { // we get the exon df and validate it // this will make sure that each exon has a valid transcript ID, and the exon numbers are valid - let mut exon_gr = gr.exons(None, true)?; + let mut exon_gr = gr.exons(None, false)?; let fc = exon_gr.field_columns(); let df = exon_gr.df(); @@ -437,7 +438,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { &genome_path, &fa_out_file, None, - true, + false, &mut sd_callback, )?; } @@ -448,7 +449,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { match seq_typ { AugType::Intronic => { // Then, we get the introns - let mut intron_gr = exon_gr.introns(None, None, None, true)?; + let mut intron_gr = exon_gr.introns(None, None, None, false)?; info!("Processing {} intronic records", intron_gr.df().height(),); @@ -456,11 +457,13 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { // if no_flanking_merge is true, we merge first, then extend if no_flanking_merge { // Then, we merge the overlapping introns + // We use multithreads as built-in. 
Not sure if we want to expose this to the user intron_gr = intron_gr.merge( &[intron_gr.get_column_name(gene_id, false)?], false, None, None, + true, )?; } @@ -475,11 +478,12 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { false, None, None, + true, )?; } // Then, we give a unique id to each intron - intron_gr.add_order(Some(&[gene_id]), "intron_number", Some(1), true)?; + intron_gr.add_order(Some(&[gene_id]), "intron_number", Some(1), false)?; intron_gr.df = intron_gr .df .lazy() @@ -487,7 +491,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { concat_str([col(gene_id), col("intron_number")], "-I", false) .alias("t2g_tx_id"), ) - .sort(gene_id, Default::default()) + .sort([gene_id], Default::default()) .collect()?; intron_gr.write_sequences_with_filter( @@ -517,13 +521,13 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { } AugType::GeneBody => { // first we get the range (gene body) of each gene - let mut gene_gr = exon_gr.genes(None, true)?; + let mut gene_gr = exon_gr.genes(None, false)?; // we append a -G to each sequence here to mark that they are gene-body seuqences gene_gr.df = gene_gr .df .lazy() - .sort(gene_id, Default::default()) + .sort([gene_id], Default::default()) .with_column(col(gene_id).add(lit("-G")).alias("t2g_tx_id")) .collect()?; @@ -555,7 +559,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { } AugType::TranscriptBody => { // we get the range (transcript body) of each transcript - let mut tx_gr = exon_gr.transcripts(None, true)?; + let mut tx_gr = exon_gr.transcripts(None, false)?; // Then we append a -T to mark the sequence as transcript body tx_gr.df = tx_gr From 695e63286ab5d5c65295f9ffafecb4f2bed11c6f Mon Sep 17 00:00:00 2001 From: DongzeHe Date: Sat, 6 Jul 2024 18:12:16 -0400 Subject: [PATCH 2/5] turn off multithreading --- src/lib/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/lib.rs b/src/lib/lib.rs index 
63f5ce6..d957a6d 100644 --- a/src/lib/lib.rs +++ b/src/lib/lib.rs @@ -463,7 +463,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { false, None, None, - true, + false, )?; } @@ -478,7 +478,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { false, None, None, - true, + false, )?; } From fa15b1cfe4dbbd2845bc1ea30420cbc320e24159 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Mon, 13 Jan 2025 01:38:01 +0000 Subject: [PATCH 3/5] update dependencies --- Cargo.toml | 10 ++++++---- src/lib/lib.rs | 1 - 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f609797..a4b952d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,15 +39,17 @@ path = "src/lib/lib.rs" [dependencies] anyhow = "1.0.75" +# https://github.com/pola-rs/polars/issues/19157 +hashbrown = { version = "0.14.5", features = ["raw"] } clap = { version = "4.4.7", features = ["derive", "wrap_help","cargo"] } -grangers = { git = "https://github.com/COMBINE-lab/grangers.git", branch="add_rustdoc", version = "0.4.0" } -polars = { version = "0.41.3", features = ["lazy","dataframe_arithmetic", "checked_arithmetic","rows","dtype-struct", "dtype-categorical", "list_eval","concat_str", "strings"]} +grangers = { git = "https://github.com/an-altosian/grangers.git", branch="add_rustdoc", version = "0.4.0" } +polars = { version = "0.45.1", features = ["lazy","dataframe_arithmetic", "checked_arithmetic","rows","dtype-struct", "dtype-categorical", "list_eval","concat_str", "strings"]} peak_alloc = "0.2.0" tracing = "0.1.40" tracing-subscriber = { version = "0.3.17", features = ["env-filter"] } -noodles = { version = "0.77.0", features = ["gtf","gff","fasta", "core"] } +noodles = { version = "0.87.0", features = ["gtf","gff","fasta", "core"] } serde = {version = "1.0.190", features = ["derive"]} serde_json = "1.0.107" -itertools = "0.13.0" +itertools = "0.14.0" oomfi = "0.1.2" xxhash-rust = { version = "0.8.7", features = ["xxh3"] } diff --git 
a/src/lib/lib.rs b/src/lib/lib.rs index d957a6d..60ce42e 100644 --- a/src/lib/lib.rs +++ b/src/lib/lib.rs @@ -9,7 +9,6 @@ use serde_json::json; use std::collections::{hash_map::Entry, HashMap}; use std::io::{BufWriter, Write}; use std::ops::Add; -use std::ops::Not; use std::path::{Path, PathBuf}; use std::time::{Duration, Instant}; use tracing::{debug, info, warn}; From cb917f0cc0840831315f7b375ea5e8d8d3ea6055 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Mon, 13 Jan 2025 04:18:55 +0000 Subject: [PATCH 4/5] update to polars 0.45.1 --- Cargo.toml | 3 +- README.md | 49 +++++++++++++++++++++++ src/lib/lib.rs | 103 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 122 insertions(+), 33 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a4b952d..f14add3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,7 +42,8 @@ anyhow = "1.0.75" # https://github.com/pola-rs/polars/issues/19157 hashbrown = { version = "0.14.5", features = ["raw"] } clap = { version = "4.4.7", features = ["derive", "wrap_help","cargo"] } -grangers = { git = "https://github.com/an-altosian/grangers.git", branch="add_rustdoc", version = "0.4.0" } +# grangers = { git = "https://github.com/an-altosian/grangers.git", branch="add_rustdoc", version = "0.4.1" } +grangers = { path = "../grangers" } polars = { version = "0.45.1", features = ["lazy","dataframe_arithmetic", "checked_arithmetic","rows","dtype-struct", "dtype-categorical", "list_eval","concat_str", "strings"]} peak_alloc = "0.2.0" tracing = "0.1.40" diff --git a/README.md b/README.md index b77ec16..6c082f7 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,52 @@ # roers A rust library for preparing expanded transcriptome references for quantification with [`alevin-fry`](https://alevin-fry.readthedocs.io/en/latest/). 
+ +To use outside of simpleaf, refer to the following help message: + +```bash +build the (expanded) reference index + +Usage: roers make-ref [OPTIONS] <GENOME> <GENES> <OUT_DIR> + +Arguments: + <GENOME> The path to a genome fasta file + <GENES> The path to a gene annotation gtf/gff3 file + <OUT_DIR> The path to the output directory (will be created if it doesn't exist) + +Options: + -a, --aug-type <AUG_TYPE> + Comma separated types of augmented sequences to include in the output FASTA file on + top of spliced transcripts. Available options are `intronic` (or `i` for short), + `gene-body` (or `g`), and `transcript-body` (or `t`) + --dedup + Indicates whether identical sequences will be deduplicated + -p, --filename-prefix <FILENAME_PREFIX> + The file name prefix of the generated output files [default: roers_ref] + --no-transcript + A flag of not including spliced transcripts in the output FASTA file. (usually there + should be a good reason to do so) + --gff3 + Denotes that the input annotation is a GFF3 (instead of GTF) file + -h, --help + Print help + -V, --version + Print version + +Intronic Sequence Options: + -r, --read-length <READ_LENGTH> + The read length of the single-cell experiment being processed (determines flank size) + [default: 91] + --flank-trim-length <FLANK_TRIM_LENGTH> + Determines the length of sequence subtracted from the read length to obtain the flank + length [default: 5] + --no-flanking-merge + Indicates whether flank lengths will be considered when merging introns + +Extra Spliced Sequence File: + --extra-spliced <EXTRA_SPLICED> The path to an extra spliced sequence fasta file + +Extra Unspliced Sequence File: + --extra-unspliced <EXTRA_UNSPLICED> The path to an extra unspliced sequence fasta file + +``` \ No newline at end of file diff --git a/src/lib/lib.rs b/src/lib/lib.rs index 60ce42e..dd6e883 100644 --- a/src/lib/lib.rs +++ b/src/lib/lib.rs @@ -9,6 +9,7 @@ use serde_json::json; use std::collections::{hash_map::Entry, HashMap}; use std::io::{BufWriter, Write}; use std::ops::Add; +use std::ops::Not; use std::path::{Path, PathBuf}; use std::time::{Duration, Instant}; use 
tracing::{debug, info, warn}; @@ -342,7 +343,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { ); // we get gene name and rename it to gene_id let mut gene_id = df.column(fc.gene_name().unwrap())?.clone(); - gene_id.rename("gene_id"); + gene_id.rename("gene_id".into()); // we update the field_columns // fc.update("gene_id", "gene_id")?; // push to the df @@ -355,7 +356,7 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { ); // we get gene id and rename it to gene_name let mut gene_name = df.column(fc.gene_id().unwrap())?.clone(); - gene_name.rename("gene_name"); + gene_name.rename("gene_name".into()); // fc.update("gene_name", "gene_name")?; // push to the df exon_gr.update_column(gene_name, Some("gene_name"))?; @@ -398,36 +399,49 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { ]) .collect()?; - // we update the corresponding columns - exon_gr.update_column(three_col_df.column(gene_id)?.to_owned(), None)?; - exon_gr.update_column(three_col_df.column(gene_name)?.to_owned(), None)?; + // [transcript_id, gene_id, gene_name] + let mut three_col_vec = three_col_df.take_columns(); + exon_gr.update_column( + three_col_vec + .pop() + .with_context(|| "Could not find the gene_name column")?, + None, + )?; + exon_gr.update_column( + three_col_vec + .pop() + .with_context(|| "Could not find the gene_id column")?, + None, + )?; } // to this point, we have a valid exon df to work with. info!( - "Proceed {} exon records from {} transcripts", + "Found {} exon records from {} transcripts.", exon_gr.df().height(), exon_gr.df().column("transcript_id")?.n_unique()? ); // Next, we get the gene id to name mapping - let mut gene_id_to_name = - exon_gr - .df() - .select([gene_id, gene_name])? 
- .unique(None, UniqueKeepStrategy::Any, None)?; - - // also, the t2g mapping for spliced transcripts - let mut t2g_map = exon_gr.df().select([transcript_id, gene_id])?.unique( + let mut gene_id_to_name = exon_gr.df().select([gene_id, gene_name])?.unique_stable( None, UniqueKeepStrategy::Any, None, )?; - t2g_map.rename(transcript_id, "t2g_tx_id")?; + + // also, the t2g mapping for spliced transcripts + let mut t2g_map = exon_gr + .df() + .select([transcript_id, gene_id])? + .unique_stable(None, UniqueKeepStrategy::Any, None)?; + t2g_map.rename(transcript_id, "t2g_tx_id".into())?; // if we have augmented sequences, we need three columns if aug_type.is_some() { - t2g_map.with_column(Series::new("splice_status", vec!["S"; t2g_map.height()]))?; + t2g_map.with_column(Column::new( + "splice_status".into(), + vec!["S"; t2g_map.height()], + ))?; } // Next, we write the transcript sequences to file if asked @@ -440,6 +454,9 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { false, &mut sd_callback, )?; + + // report that the spliced transcript sequences were written. 
+ info!("Wrote transcript sequences to output file."); } // Next, we write the augmented sequences @@ -447,10 +464,11 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { for seq_typ in aug_type { match seq_typ { AugType::Intronic => { + info!("Processing intronic records."); // Then, we get the introns let mut intron_gr = exon_gr.introns(None, None, None, false)?; - info!("Processing {} intronic records", intron_gr.df().height(),); + info!("Found {} intronic records.", intron_gr.df().height(),); // no_flanking_merge decides when the order of merge and extend // if no_flanking_merge is true, we merge first, then extend @@ -464,11 +482,15 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { None, false, )?; + + info!("Merged overlapping intronic records."); } // add flanking end to both side of each intron intron_gr.extend(flank_length, &options::ExtendOption::Both, false)?; + info!("Added flanking length to intronic records."); + // if no_flanking_merge is false, we merge after extend if !no_flanking_merge { // Then, we merge the overlapping introns @@ -479,6 +501,8 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { None, false, )?; + + info!("Merged overlapping intronic records."); } // Then, we give a unique id to each intron @@ -502,16 +526,17 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { &mut sd_callback, )?; + info!("Wrote intronic sequences to output file."); + // we need to update the t2g mapping for introns - let mut intron_t2g = intron_gr.df().select(["t2g_tx_id", gene_id])?.unique( - None, - UniqueKeepStrategy::Any, - None, - )?; + let mut intron_t2g = intron_gr + .df() + .select(["t2g_tx_id", gene_id])? 
+ .unique_stable(None, UniqueKeepStrategy::Any, None)?; // if we are here, we need to add a column for splice_status cuz it is an augmented reference - intron_t2g.with_column(Series::new( - "splice_status", + intron_t2g.with_column(Column::new( + "splice_status".into(), vec!["U"; intron_t2g.height()], ))?; @@ -543,15 +568,17 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { )?; // we need to update the t2g mapping for genes - let mut gene_t2g = gene_gr.df().select(["t2g_tx_id", gene_id])?.unique( + let mut gene_t2g = gene_gr.df().select(["t2g_tx_id", gene_id])?.unique_stable( None, UniqueKeepStrategy::Any, None, )?; // add the splice status column - gene_t2g - .with_column(Series::new("splice_status", vec!["U"; gene_t2g.height()]))?; + gene_t2g.with_column(Column::new( + "splice_status".into(), + vec!["U"; gene_t2g.height()], + ))?; // extend t2g_map for genes t2g_map.extend(&gene_t2g)?; @@ -580,12 +607,15 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { )?; // we need to update the t2g mapping for genes - let mut tx_t2g = tx_gr.df().select(["t2g_tx_id", gene_id])?.unique( + let mut tx_t2g = tx_gr.df().select(["t2g_tx_id", gene_id])?.unique_stable( None, UniqueKeepStrategy::Any, None, )?; - tx_t2g.with_column(Series::new("splice_status", vec!["U"; tx_t2g.height()]))?; + tx_t2g.with_column(Column::new( + "splice_status".into(), + vec!["U"; tx_t2g.height()], + ))?; t2g_map.extend(&tx_t2g)?; } _ => { @@ -644,7 +674,10 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { if aug_type.is_some() { // if we are here, we need to add a column for splice_status cuz it is an augmented reference - extra_t2g.with_column(Series::new("splice_status", vec!["S"; extra_t2g.height()]))?; + extra_t2g.with_column(Column::new( + "splice_status".into(), + vec!["S"; extra_t2g.height()], + ))?; } t2g_map.extend(&extra_t2g)?; @@ -695,7 +728,10 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { if aug_type.is_some() { // 
if we are here, we need to add a column for splice_status cuz it is an augmented reference - extra_t2g.with_column(Series::new("splice_status", vec!["U"; extra_t2g.height()]))?; + extra_t2g.with_column(Column::new( + "splice_status".into(), + vec!["U"; extra_t2g.height()], + ))?; } t2g_map.extend(&extra_t2g)?; @@ -726,7 +762,10 @@ pub fn make_ref(aug_ref_opts: AugRefOpts) -> anyhow::Result<()> { // mask out the t2g rows that are keyed by the duplicates // and then *negate* this mask (since we wish to keep everything // that is *not* a duplicate). - let mask = is_in(column, &Series::new("values", dups))?; + let mask = is_in( + column.as_materialized_series(), + &Series::new("values".into(), dups), + )?; // replace the t2g_map dataframe with the one that has the // duplicate mappings removed. From 0489f18e3b9fab12503ec7c74dcd1a46eccc76e5 Mon Sep 17 00:00:00 2001 From: an-altosian Date: Mon, 13 Jan 2025 04:21:05 +0000 Subject: [PATCH 5/5] update to polars 0.45.1 --- Cargo.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f14add3..a4b952d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -42,8 +42,7 @@ anyhow = "1.0.75" # https://github.com/pola-rs/polars/issues/19157 hashbrown = { version = "0.14.5", features = ["raw"] } clap = { version = "4.4.7", features = ["derive", "wrap_help","cargo"] } -# grangers = { git = "https://github.com/an-altosian/grangers.git", branch="add_rustdoc", version = "0.4.1" } -grangers = { path = "../grangers" } +grangers = { git = "https://github.com/an-altosian/grangers.git", branch="add_rustdoc", version = "0.4.0" } polars = { version = "0.45.1", features = ["lazy","dataframe_arithmetic", "checked_arithmetic","rows","dtype-struct", "dtype-categorical", "list_eval","concat_str", "strings"]} peak_alloc = "0.2.0" tracing = "0.1.40"