Merge pull request #27 from COMBINE-lab/dev

Dev
COMBINE-lab · Aug 26, 2024 · 4e0ddf3 · 4e0ddf3
2 parents 34e6d79 + 878571d
commit 4e0ddf3
Show file tree

Hide file tree

Showing 11 changed files with 1,864 additions and 419 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "oarfish"
-version = "0.5.1"
+version = "0.6.0"
 edition = "2021"
 authors = [
   "Zahra Zare Jousheghani <[email protected]>",
@@ -28,23 +28,20 @@ categories = ["command-line-utilities", "science"]
 seqcol_rs = { git = "https://github.com/COMBINE-lab/seqcol_rs", branch = "dev", version = "0.1.0" }
 anyhow = "1.0.86"
 bio-types = { version = "1.0.4", features = ["serde"] }
-clap = { version = "4.5.13", features = ["derive"] }
-coitrees = "0.4.0"
+clap = { version = "4.5.16", features = ["derive"] }
 noodles-bam = "0.66.0"
-noodles-gtf = "0.30.0"
 noodles-sam = "0.63.0"
 num-format = "0.4.4"
 tabled = "0.16.0"
 tracing = "0.1.40"
 tracing-subscriber = { version = "0.3.18", features = ["env-filter"] }
-typed-builder = "0.19.1"
+typed-builder = "0.20.0"
 rayon = "1.10"
 statrs = "0.17"
 csv = "1.3"
-ndarray = "0.16"
 serde = { version = "1", features = ["derive"] }
 itertools = "0.13.0"
-serde_json = "1.0.122"
+serde_json = "1.0.127"
 path-tools = "0.1.0"
 atomic_float = "1.0.0"
 rand = "0.8.5"
@@ -56,6 +53,18 @@ arrow2 = { version = "0.18.0", features = [
 ] }
 kders = { git = "https://github.com/COMBINE-lab/kde-rs.git", branch = "dev", version = "0.1.0" }
 noodles-bgzf = { version = "0.32.0" }
+crossbeam = { version = "0.8.4", features = [
+  "crossbeam-queue",
+  "crossbeam-channel",
+] }
+sprs = "0.11.1"
+minimap2-sys = { version = "0.1.19" }
+# rely on minimap2-temp until upstream version is pushed
+minimap2-temp = { version = "0.1.30" }
+#git = "https://github.com/rob-p/minimap2-rs.git", branch = "alignment-score" }
+#git = "https://github.com/jguhlin/minimap2-rs.git", branch = "alignment-score" }
+needletail = "0.5.1"
+indicatif = "0.17.8"
 
 [[bin]]
 name = "oarfish"

diff --git a/docs/index.md b/docs/index.md
@@ -52,26 +52,43 @@ filters:
           only alignments to this strand will be allowed; options are (fw /+, rc/-, or both/.) [default: .]
 ```
 
-The input should be a `bam` format file, with reads aligned using [`minimap2`](https://github.com/lh3/minimap2) against the _transcriptome_. That is, `oarfish` does not currently handle spliced alignment to the genome.  Further, the output alignments should be name sorted (the default order produced by `minimap2` should be fine). _Specifically_, `oarfish` relies on the existence of the `AS` tag in the `bam` records that encodes the alignment score in order to obtain the score for each alignment (which is used in probabilistic read assignment), and the score of the best alignment, overall, for each read.  The parameters above should be explained by their relevant help option, but the 
-`-d`/`--strand-filter` is worth noting explicitly. By default, alignments to both strands of a transcript will be considered valid.  You can use this option to allow only alignments in the specified orientation; for example 
-`-d fw` will allow only alignments in the forward orientation and `-d rc` will allow only alignments in the reverse-complement orientation and `-d both` (the default) will allow both.  The `-d` filter, if explicitly provided, overrides 
-the orientation filter in any provided "filter group" so e.g. passing `--filter-group no-filters -d fw` will disable other filters, but will still only admit alignments in the forward orientation.
 
-### Choosing `minimap2` alignment options
+## Input to `oarfish`
 
-Since the purpose of `oarfish` is to estimate transcript abundance from a collection of alignments to the target transcriptome, it is important that the alignments are generated 
-in a fashion that is compatible with this goal.  Primarily, this means that the aligner should be configured to report as many optimal (and near-optimal) alignments as exist, so that 
-`oarfish` can observe all of this information and determine how to allocate reads to transcripts.  We recommend using the following options with `minimap2` when aligning data for 
-later processing by `oarfish`
+`Oarfish` can accept as input either a `bam` file containing reads aligned to the transcriptome as specified [below](index.md#alignment-based-input), or
+raw sequencing reads themselves (along with a reference transcriptome), which are then mapped to the reference using [minimap2-rs](https://github.com/jguhlin/minimap2-rs)
+and subsequently processed with `oarfish`.  With equivalent alignment options, the results of these input modes should be equivalent, so which to use is therefore
+based on the preference of the user.
 
-  * For ONT data (either dRNA or cDNA): please use the flags `--eqx -N 100 -ax map-ont`
-  * For PacBio data: please use the flags `--eqx -N 100 -ax pacbio`
 
-**Note (1)**: It may be worthwile using an even larger `N` value (e.g. the [TranSigner manuscript](https://www.biorxiv.org/content/10.1101/2024.04.13.589356v1.full) recommends `-N 181`). A larger value should not diminish the 
-accuracy of `oarfish`, but it may make alignment take longer and produce a larger `bam` file.
+### Read-based input
 
-**Note (2)**: For very high quality PacBio data, it may be most appropriate to use the `-ax map-hifi` flag in place of `-ax pacbio`.  We are currently evaluating the effect of this option, and also welcome feedback if you 
-have experiences to share on the use of data aligned with these different flags with `oarfish`.
+The read-based input mode takes as input a reference (specified with the `--reference` argument), which can be either a `FASTA` file containing a transcriptome reference
+or an pre-build `minimap2` index, as well as a set of reads (specified with the `--reads` argument), and a `--seq-tech` argument specifying the sequencing technology 
+type of the reads to be mapped.
+
+The mapping between the potential values that can be passed to `oarfish`'s `--seq-tech` argument and the `minimap2` presets is as follows:
+
+  - `oarfish` seq-tech `ont-cdna` corresponds to `minimap2` preset `map-ont`
+  - `oarfish` seq-tech `ont-drna` corresponds to `minimap2` preset `map-ont`
+  - `oarfish` seq-tech `pac-bio` corresponds to `minimap2` preset `map-pb`
+  - `oarfish` seq-tech `pac-bio-hifi` corresponds to `minimap2` preset `map-hifi`
+
+Given these inputs, `oarfish` will either load the pre-built `minimap2` index, or build one according to the parameter specified by `--seq-tech`, and will then align
+the reads to this index using [`minimap2-rs`](https://github.com/jguhlin/minimap2-rs).  Optionally, the maximum multimapping rate (i.e. the number of secondary alignments 
+corresponding to the `minimap2` parameter `-N`) can be specified with the command line parameter `--best-n`. The default value of this parameter is 100.
+
+### Alignmment-based input
+
+In alignment-based mode, `oarfish` processes pre-computed alignments of hte read to the transcriptome. The input should be a `bam` format file, with reads aligned using [`minimap2`](https://github.com/lh3/minimap2) against the _transcriptome_. That is, `oarfish` does not currently handle spliced alignment to the genome. Further, the output alignments should be name sorted (the default order produced by `minimap2` should be fine). _Specifically_, `oarfish` relies on the existence of the `AS` tag in the `bam` records that encodes the alignment score in order to obtain the score for each alignment (which is used in probabilistic read assignment), and the score of the best alignment, overall, for each read. ### Choosing `minimap2` alignment options Since the purpose of `oarfish` is to estimate transcript abundance from a collection of alignments to the target transcriptome, it is important that the alignments are generated in a fashion that is compatible with this goal.  Primarily, this means that the aligner should be configured to report as many optimal (and near-optimal) alignments as exist, so that `oarfish` can observe all of this information and determine how to allocate reads to transcripts.  We recommend using the following options with `minimap2` when aligning data for later processing by `oarfish` * For ONT data (either dRNA or cDNA): please use the flags `--eqx -N 100 -ax map-ont` For PacBio data: please use the flags `--eqx -N 100 -ax pacbio` **Note (1)**: It may be worthwile using an even larger `N` value (e.g. the [TranSigner manuscript](https://www.biorxiv.org/content/10.1101/2024.04.13.589356v1.full) recommends `-N 181`). A larger value should not diminish the accuracy of `oarfish`, but it may make alignment take longer and produce a larger `bam` file.
+
+**Note (2)**: For very high quality PacBio data, it may be most appropriate to use the `-ax map-hifi` flag in place of `-ax pacbio`.  We are currently evaluating the effect of this option, and also welcome feedback if you have experiences to share on the use of data aligned with these different flags with `oarfish`.
+
+### Other notes on `oarfish` parameters
+
+The parameters above should be explained by their relevant help option, but the `-d`/`--strand-filter` is worth noting explicitly. By default, alignments to both strands of a transcript will be considered valid.  You can use this option to allow only alignments in the specified orientation; for example `-d fw` will allow only alignments in the forward orientation and `-d rc` will allow only alignments in the reverse-complement orientation and `-d both` (the default) will allow both.  The `-d` filter, if explicitly provided, overrides the orientation filter in any provided "filter group" so e.g. passing `--filter-group no-filters -d fw` will disable other filters, but will still only admit alignments in the forward orientation.
+
+**In general**, if you apply a `filter-group`, the group options will be applied first and then any explicitly provided options given will override the corresponding option in the `filter-group`.
 
 ### Inferential Replicates
 

diff --git a/src/alignment_parser.rs b/src/alignment_parser.rs
@@ -1,10 +1,11 @@
 use crate::util::oarfish_types::{InMemoryAlignmentStore, TranscriptInfo};
 use noodles_bam as bam;
-use noodles_sam::Header;
+use noodles_sam::header::record::value::map::tag;
+use noodles_sam::{alignment::RecordBuf, Header};
 use num_format::{Locale, ToFormattedString};
 use std::io;
 use std::path::Path;
-use tracing::{info, trace};
+use tracing::{error, info};
 
 pub fn read_and_verify_header<R: io::BufRead>(
     reader: &mut bam::io::Reader<R>,
@@ -21,6 +22,23 @@ pub fn read_and_verify_header<R: io::BufRead>(
             .to_formatted_string(&Locale::en)
     );
 
+    // sort order tag
+    let so = tag::Other::try_from([b'S', b'O'])?;
+    let so_type = header
+        .header()
+        .expect("has inner header")
+        .other_fields()
+        .get(&so)
+        .expect("BAM file should have @SO field");
+
+    if so_type == "coordinate" {
+        error!("oarfish is not designed to process coordinate sorted BAM files.");
+        anyhow::bail!(
+        "You provided a coordinate-sorted BAM, but oarfish does not support processing these. 
+         You should provide a BAM file collated by record name (which is the \"natural\" minimap2 order)."
+        );
+    }
+
     let mut saw_minimap2 = false;
     let mut progs = vec![];
     // explicitly check that alignment was done with a supported
@@ -42,18 +60,196 @@ pub fn read_and_verify_header<R: io::BufRead>(
     Ok(header)
 }
 
+pub enum NextAction {
+    SkipUnmapped,
+    ProcessSameBarcode,
+    NewBarcode,
+    EndOfFile,
+}
+
+#[inline(always)]
+fn is_same_barcode(rec: &RecordBuf, current_barcode: &[u8]) -> anyhow::Result<bool> {
+    const CB_TAG: [u8; 2] = [b'C', b'B'];
+    let same_barcode = match rec.data().get(&CB_TAG) {
+        None => anyhow::bail!("could not get CB tag value"),
+        Some(v) => match v {
+            noodles_sam::alignment::record_buf::data::field::Value::String(x) => {
+                x.as_slice() == current_barcode
+            }
+            _ => anyhow::bail!("CB tag value had unexpected type!"),
+        },
+    };
+    Ok(same_barcode)
+}
+
+/// Takes a collection of [RecordBuf]s, a mutable reference to an [InMemoryAlignmentStore]
+/// as well as the relevant [TranscriptInfo].
+///
+/// This function first sorts the `records` by the read name (ensuring that any primary alignment
+/// of the read comes first), so that reads are collated by name.  Then, it processes in turn the
+/// alignments for each input read, filtering them according to the filters attached to the
+/// `store`.  Subsequently, the alignments are summarized in the `store`. If all `records` are
+/// processed successfully, [Ok]`()` is returned, otherwise the relevant [anyhow::Error] is
+/// returned.
+pub fn sort_and_parse_barcode_records(
+    records: &mut Vec<RecordBuf>,
+    store: &mut InMemoryAlignmentStore,
+    txps: &mut [TranscriptInfo],
+    records_for_read: &mut Vec<RecordBuf>,
+) -> anyhow::Result<()> {
+    records_for_read.clear();
+    let mut prev_read = String::new();
+
+    // first sort records by read name
+    records.sort_unstable_by(|x, y| match x.name().cmp(&y.name()) {
+        std::cmp::Ordering::Equal => {
+            match (x.flags().is_secondary(), y.flags().is_secondary()) {
+                (false, true) => std::cmp::Ordering::Less,
+                (true, false) => std::cmp::Ordering::Greater,
+                (true, true) => std::cmp::Ordering::Equal,
+                // this one shouldn't happen
+                (false, false) => std::cmp::Ordering::Equal,
+            }
+        }
+        x => x,
+    });
+
+    // Parse the input alignemnt file, gathering the alignments aggregated
+    // by their source read. **Note**: this requires that we have a
+    // name-sorted input bam file (currently, aligned against the transcriptome).
+    //
+    // *NOTE*: this had to be changed from `records` to `record_bufs` or
+    // critical information was missing from the records. This happened when
+    // moving to the new version of noodles. Track `https://github.com/zaeleus/noodles/issues/230`
+    // to see if it's clear why this is the case
+    for record in records {
+        if let Some(rname) = record.name() {
+            let record_copy = record.clone();
+            let rstring: String = String::from_utf8_lossy(rname.as_ref()).into_owned();
+            // if this is an alignment for the same read, then
+            // push it onto our temporary vector.
+            if prev_read == rstring {
+                if let Some(_ref_id) = record.reference_sequence_id() {
+                    records_for_read.push(record_copy);
+                }
+            } else {
+                // otherwise, record the alignment range for the
+                // previous read record.
+                if !prev_read.is_empty() {
+                    store.add_group(txps, records_for_read);
+                    if records_for_read.len() == 1 {
+                        store.inc_unique_alignments();
+                    }
+                    records_for_read.clear();
+                }
+                // the new "prev_read" name is the current read name
+                // so it becomes the first on the new alignment range
+                // vector.
+                prev_read = rstring;
+                if let Some(_ref_id) = record.reference_sequence_id() {
+                    records_for_read.push(record_copy);
+                }
+            }
+        }
+    }
+    // if we end with a non-empty alignment range vector, then
+    // add that group.
+    if !records_for_read.is_empty() {
+        store.add_group(txps, records_for_read);
+        if records_for_read.len() == 1 {
+            store.inc_unique_alignments();
+        }
+        records_for_read.clear();
+    }
+    Ok(())
+}
+
+#[inline(always)]
+pub fn parse_alignments_for_barcode<R: io::BufRead>(
+    iter: &mut core::iter::Peekable<noodles_bam::io::reader::RecordBufs<R>>,
+    current_cb: &[u8],
+) -> anyhow::Result<Vec<noodles_sam::alignment::record_buf::RecordBuf>> {
+    //records_for_read.clear();
+    let mut records_for_barcode =
+        Vec::<noodles_sam::alignment::record_buf::RecordBuf>::with_capacity(2048);
+    let mut _records_processed = 0_usize;
+
+    // Parse the input alignemnt file, gathering the alignments aggregated
+    // by their source read. **Note**: this requires that we have a
+    // name-sorted input bam file (currently, aligned against the transcriptome).
+    //
+    // *NOTE*: this had to be changed from `records` to `record_bufs` or
+    // critical information was missing from the records. This happened when
+    // moving to the new version of noodles. Track `https://github.com/zaeleus/noodles/issues/230`
+    // to see if it's clear why this is the case
+
+    loop {
+        let action = if let Some(result) = iter.peek() {
+            if let Ok(record) = result {
+                // unmapped reads don't contribute to quantification
+                // but we track them.
+                if record.flags().is_unmapped() {
+                    _records_processed += 1;
+                    NextAction::SkipUnmapped
+                } else {
+                    let same_barcode = is_same_barcode(record, current_cb)?;
+                    if !same_barcode {
+                        NextAction::NewBarcode
+                    } else {
+                        NextAction::ProcessSameBarcode
+                    }
+                }
+            } else {
+                anyhow::bail!("error parsing record");
+            }
+        } else {
+            NextAction::EndOfFile
+        };
+
+        match action {
+            NextAction::SkipUnmapped => {
+                iter.next().unwrap()?;
+            }
+            NextAction::ProcessSameBarcode => {
+                let record = iter.next().unwrap()?;
+                records_for_barcode.push(record.clone());
+            }
+            NextAction::NewBarcode | NextAction::EndOfFile => {
+                break;
+            }
+        };
+    }
+    Ok(records_for_barcode)
+}
+
 pub fn parse_alignments<R: io::BufRead>(
     store: &mut InMemoryAlignmentStore,
     header: &Header,
     reader: &mut bam::io::Reader<R>,
     txps: &mut [TranscriptInfo],
+    quiet: bool,
 ) -> anyhow::Result<()> {
     // we'll need these to keep track of which alignments belong
     // to which reads.
     let mut prev_read = String::new();
     let mut num_unmapped = 0_u64;
     let mut records_for_read = vec![];
 
+    let pb = if quiet {
+        indicatif::ProgressBar::hidden()
+    } else {
+        indicatif::ProgressBar::new_spinner().with_message("Number of reads mapped")
+    };
+
+    pb.set_style(
+        indicatif::ProgressStyle::with_template(
+            "[{elapsed_precise}] {spinner:4.green/blue} {msg} {human_pos:>12}",
+        )
+        .unwrap()
+        .tick_chars("⠁⠁⠉⠙⠚⠒⠂⠂⠒⠲⠴⠤⠄⠄⠤⠠⠠⠤⠦⠖⠒⠐⠐⠒⠓⠋⠉⠈⠈"),
+    );
+    pb.set_draw_target(indicatif::ProgressDrawTarget::stderr_with_hz(4));
+
     // Parse the input alignemnt file, gathering the alignments aggregated
     // by their source read. **Note**: this requires that we have a
     // name-sorted input bam file (currently, aligned against the transcriptome).
@@ -62,12 +258,10 @@ pub fn parse_alignments<R: io::BufRead>(
     // critical information was missing from the records. This happened when
     // moving to the new version of noodles. Track `https://github.com/zaeleus/noodles/issues/230`
     // to see if it's clear why this is the case
-    for (i, result) in reader.record_bufs(header).enumerate() {
+    for result in reader.record_bufs(header) {
         let record = result?;
+        pb.inc(1);
 
-        if i % 100_000 == 1 {
-            trace!("processed {i} alignment records");
-        }
         // unmapped reads don't contribute to quantification
         // but we track them.
         if record.flags().is_unmapped() {
@@ -113,6 +307,8 @@ pub fn parse_alignments<R: io::BufRead>(
         records_for_read.clear();
     }
 
+    pb.finish_with_message("Finished processing alignments.");
+
     info!(
         "the alignment file contained {} unmapped read records.",
         num_unmapped.to_formatted_string(&Locale::en)