From 6c146f4513f8732554ffb4db219d60a0f4d90ce8 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sun, 3 Mar 2024 11:44:21 -0500 Subject: [PATCH 1/5] update docs --- docs/index.md | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/docs/index.md b/docs/index.md index da3f0b3..25e81fd 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,22 +6,31 @@ It optionally employs many filters to help discard alignments that may reduce quantification accuracy. Currently, the set of filters applied in `oarfish` are directly derived from the [`NanoCount`](https://github.com/a-slide/NanoCount)[^Gleeson] tool; both the filters that exist, and the way their values are set (with the exception of the `--three-prime-clip` filter, which is not set by default in `oarfish` but is in `NanoCount`). -Additionally, `oarfish` provides options to make use of coverage profiles derived from the aligned reads to improve quantification accuracy. The use of this coverage model is enabled with the `--model-coverage` flag. +Additionally, `oarfish` provides options to make use of coverage profiles derived from the aligned reads to improve quantification accuracy. The use of this coverage model is enabled with the `--model-coverage` flag. You can read more about `oarfish`[^preprint] in the [preprint](https://www.biorxiv.org/content/10.1101/2024.02.28.582591v1). Please cite the preprint if you use `oarfish` in your work or analysis. The usage can be provided by passing `-h` at the command line. ``` -accurate transcript quantification from long-read RNA-seq data +A fast, accurate and versatile tool for long-read transcript quantification. Usage: oarfish [OPTIONS] --alignments --output Options: - --quiet be quiet (i.e. don't output log messages that aren't at least warnings) - --verbose be verbose (i.e. output all non-developer logging messages) - -a, --alignments path to the file containing the input alignments - -o, --output location where output quantification file should be written - -t, --threads maximum number of cores that the oarfish can use to obtain binomial probability [default: 1] - -h, --help Print help - -V, --version Print version + --quiet + be quiet (i.e. don't output log messages that aren't at least warnings) + --verbose + be verbose (i.e. output all non-developer logging messages) + -a, --alignments + path to the file containing the input alignments + -o, --output + location where output quantification file should be written + -j, --threads + maximum number of cores that the oarfish can use to obtain binomial probability [default: 1] + --num-bootstraps + number of bootstrap replicates to produce to assess quantification uncertainty [default: 0] + -h, --help + Print help + -V, --version + Print version filters: --filter-group @@ -54,6 +63,9 @@ EM: The input should be a `bam` format file, with reads aligned using [`minimap2`](https://github.com/lh3/minimap2) against the _transcriptome_. That is, `oarfish` does not currently handle spliced alignment to the genome. Further, the output alignments should be name sorted (the default order produced by `minimap2` should be fine). _Specifically_, `oarfish` relies on the existence of the `AS` tag in the `bam` records that encodes the alignment score in order to obtain the score for each alignment (which is used in probabilistic read assignment), and the score of the best alignment, overall, for each read. 
+### Inferential Replicates + +`oarfish` has the ability to compute [_inferential replicates_](https://academic.oup.com/nar/article/47/18/e105/5542870) of its quantification estimates. This is performed by bootstrap sampling of the original read mappings, and subsequently performing inference under each resampling. These inferential replicates allow assessing the variance of the point estimate of transcript abundance, and can lead to improved differential analysis at the transcript level, if using a differential testing tool that takes advantage of this information. The generation of inferential replicates is controlled by the `--num-bootstraps` argument to `oarfish`. The default value is `0`, meaning that no inferential replicates are generated. If you set this to some value greater than `0`, the requested number of inferential replicates will be generated. It is recommended, if generating inferential replicates, to run `oarfish` with multiple threads, since replicate generation is highly parallelized. Finally, if replicates are generated, they are written to a [`Parquet`](https://parquet.apache.org/) file, starting with the specified output stem and ending with `infreps.pq`. ### Output @@ -61,8 +73,10 @@ The `--output` option passed to `oarfish` corresponds to a path prefix (this pre * `P.meta_info.json` - a JSON format file containing information about relevant parameters with which `oarfish` was run, and other relevant information from the processed sample apart from the actual transcript quantifications. * `P.quant` - a tab separated file listing the quantified targets, as well as information about their length and other metadata. The `num_reads` column provides the estimate of the number of reads originating from each target. - + * `P.infreps.pq` - a [`Parquet`](https://parquet.apache.org/) table where each row is a transcript and each column is an inferential replicate, containing the estimated counts for each transcript under each computed inferential replicate. ### References [^Gleeson]: Josie Gleeson, Adrien Leger, Yair D J Prawer, Tracy A Lane, Paul J Harrison, Wilfried Haerty, Michael B Clark, Accurate expression quantification from nanopore direct RNA sequencing with NanoCount, Nucleic Acids Research, Volume 50, Issue 4, 28 February 2022, Page e19, [https://doi.org/10.1093/nar/gkab1129](https://doi.org/10.1093/nar/gkab1129) + +[^preprint]: Zahra Zare Jousheghani, Rob Patro. Oarfish: Enhanced probabilistic modeling leads to improved accuracy in long read transcriptome quantification, bioRxiv 2024.02.28.582591; doi: [https://doi.org/10.1101/2024.02.28.582591](https://doi.org/10.1101/2024.02.28.582591) From f776972bc46012e0d7ad1df6fd2f93adfc196b5a Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sun, 3 Mar 2024 13:07:16 -0500 Subject: [PATCH 2/5] docs Add more documentation --- src/main.rs | 3 +++ src/util/parquet_utils.rs | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/main.rs b/src/main.rs index 03b0dae..5e8df2a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -170,6 +170,9 @@ fn get_filter_opts(args: &Args) -> AlignmentFilters { } } +/// Produce a [serde_json::Value] that encodes the relevant arguments and +/// parameters of the run that we wish to record to file. Ultimately, this +/// will be written to the corresponding `meta_info.json` file for this run.
fn get_json_info(args: &Args, emi: &EMInfo) -> serde_json::Value { let prob = if args.model_coverage { "binomial" diff --git a/src/util/parquet_utils.rs b/src/util/parquet_utils.rs index 17e6564..e181220 100644 --- a/src/util/parquet_utils.rs +++ b/src/util/parquet_utils.rs @@ -9,6 +9,9 @@ use arrow2::{ }; use std::fs::File; +/// Write a chunk of values `chunk` to a file specified at the +/// provided `path` using `scheme`. This raises an [anyhow::Error] if +/// there is an error in creating the file or writing the contents. pub(crate) fn write_chunk_to_file( path: &str, schema: Schema, From 0f19359d1ef623ee7185f5af9e538b4fd8b3f21d Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sun, 3 Mar 2024 14:42:45 -0500 Subject: [PATCH 3/5] format --- src/main.rs | 4 ++-- src/util/parquet_utils.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 5e8df2a..8b157fb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -170,8 +170,8 @@ fn get_filter_opts(args: &Args) -> AlignmentFilters { } } -/// Produce a [serde_json::Value] that encodes the relevant arguments and -/// parameters of the run that we wish to record to file. Ultimately, this +/// Produce a [serde_json::Value] that encodes the relevant arguments and +/// parameters of the run that we wish to record to file. Ultimately, this /// will be written to the corresponding `meta_info.json` file for this run. fn get_json_info(args: &Args, emi: &EMInfo) -> serde_json::Value { let prob = if args.model_coverage { diff --git a/src/util/parquet_utils.rs b/src/util/parquet_utils.rs index e181220..24bdf71 100644 --- a/src/util/parquet_utils.rs +++ b/src/util/parquet_utils.rs @@ -9,7 +9,7 @@ use arrow2::{ }; use std::fs::File; -/// Write a chunk of values `chunk` to a file specified at the +/// Write a chunk of values `chunk` to a file specified at the /// provided `path` using `scheme`. This raises an [anyhow::Error] if /// there is an error in creating the file or writing the contents. pub(crate) fn write_chunk_to_file( From 741986ce1f67a44d1c7c68a9b27b25eb23d0092d Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 29 Mar 2024 00:30:27 -0400 Subject: [PATCH 4/5] add seqcol digest --- Cargo.toml | 1 + src/main.rs | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0e2cfaa..577dbe5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,7 @@ categories = ["command-line-utilities", "science"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +seqcol_rs = { git = "https://github.com/COMBINE-lab/seqcol_rs", branch = "dev", version = "0.1.0" } anyhow = "1.0.80" bio-types = "1.0.1" clap = { version = "4.5.1", features = ["derive"] } diff --git a/src/main.rs b/src/main.rs index 8b157fb..8f4a287 100644 --- a/src/main.rs +++ b/src/main.rs @@ -173,7 +173,7 @@ fn get_filter_opts(args: &Args) -> AlignmentFilters { /// Produce a [serde_json::Value] that encodes the relevant arguments and /// parameters of the run that we wish to record to file. Ultimately, this /// will be written to the corresponding `meta_info.json` file for this run. 
-fn get_json_info(args: &Args, emi: &EMInfo) -> serde_json::Value { +fn get_json_info(args: &Args, emi: &EMInfo, seqcol_digest: &str) -> serde_json::Value { let prob = if args.model_coverage { "binomial" } else { @@ -194,7 +194,8 @@ fn get_json_info(args: &Args, emi: &EMInfo) -> serde_json::Value { "threads": &args.threads, "filter_group": &args.filter_group, "short_quant": &args.short_quant, - "num_bootstraps": &args.num_bootstraps + "num_bootstraps": &args.num_bootstraps, + "seqcol_digest": seqcol_digest }) } @@ -233,6 +234,15 @@ fn main() -> anyhow::Result<()> { // parse the header, and ensure that the reads were mapped with minimap2 (as far as we // can tell). let header = alignment_parser::read_and_verify_header(&mut reader, &args.alignments)?; + let seqcol_digest = { + let sc = seqcol_rs::SeqCol::from_sam_header( + header + .reference_sequences() + .iter() + .map(|(k, v)| (k.as_slice(), v.length().into())), + ); + sc.digest(seqcol_rs::DigestConfig::default())? + }; let num_ref_seqs = header.reference_sequences().len(); // where we'll write down the per-transcript information we need @@ -306,7 +316,7 @@ fn main() -> anyhow::Result<()> { // prepare the JSON object we'll write // to meta_info.json - let json_info = get_json_info(&args, &emi); + let json_info = get_json_info(&args, &emi, &seqcol_digest); // write the output write_output(&args.output, json_info, &header, &counts)?; From 86b238ebebb70147c5fe73ebeeaff5a1d166433a Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 29 Mar 2024 16:48:25 -0400 Subject: [PATCH 5/5] bump version and deps --- Cargo.toml | 10 +++++----- src/main.rs | 8 +++++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 577dbe5..8d90e45 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "oarfish" -version = "0.3.1" +version = "0.4.0" edition = "2021" authors = [ "Zahra Zare Jousheghani ", @@ -31,9 +31,9 @@ categories = ["command-line-utilities", "science"] [dependencies] seqcol_rs = { git = "https://github.com/COMBINE-lab/seqcol_rs", branch = "dev", version = "0.1.0" } -anyhow = "1.0.80" +anyhow = "1.0.81" bio-types = "1.0.1" -clap = { version = "4.5.1", features = ["derive"] } +clap = { version = "4.5.4", features = ["derive"] } coitrees = "0.4.0" noodles-bam = "0.56.0" noodles-gtf = "0.23.0" @@ -43,13 +43,13 @@ tabled = "0.15.0" tracing = "0.1.40" tracing-subscriber = {version="0.3.18", features=["env-filter"]} typed-builder = "0.18.1" -rayon = "1.9" +rayon = "1.10" statrs = "0.16" csv = "1.3" ndarray = "0.15" serde = { version = "1", features = ["derive"] } itertools = "0.12.1" -serde_json = "1.0.114" +serde_json = "1.0.115" path-tools = "0.1.0" atomic_float = "0.1.0" rand = "0.8.5" diff --git a/src/main.rs b/src/main.rs index 8f4a287..276ac24 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,6 @@ use clap::Parser; +use anyhow::Context; use arrow2::{array::Float64Array, chunk::Chunk, datatypes::Field}; use std::{ @@ -235,13 +236,18 @@ fn main() -> anyhow::Result<()> { // can tell). let header = alignment_parser::read_and_verify_header(&mut reader, &args.alignments)?; let seqcol_digest = { + info!("calculating seqcol digest"); let sc = seqcol_rs::SeqCol::from_sam_header( header .reference_sequences() .iter() .map(|(k, v)| (k.as_slice(), v.length().into())), ); - sc.digest(seqcol_rs::DigestConfig::default())? 
+ let d = sc.digest(seqcol_rs::DigestConfig::default()).context( + "failed to compute the seqcol digest for the information from the alignment header", + )?; + info!("done calculating seqcol digest"); + d }; let num_ref_seqs = header.reference_sequences().len();
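Note on consuming the new metadata field: the `seqcol_digest` recorded by the updated `get_json_info` is written as a plain string entry in the emitted `meta_info.json`, so downstream tooling can compare it across runs to confirm that quantifications were produced against the same reference sequence collection. The following is a minimal sketch, not part of the patches above: it assumes a run invoked with `--output P` (so the metadata file is `P.meta_info.json`), and the helper name and file path are illustrative; it uses only `anyhow` and `serde_json`, which are already dependencies of the crate.

```rust
use anyhow::Context;
use std::fs::File;

/// Read the metadata JSON written alongside the quantification results and
/// return the recorded `seqcol_digest` string (illustrative helper only).
fn read_seqcol_digest(path: &str) -> anyhow::Result<String> {
    let file = File::open(path).with_context(|| format!("failed to open {path}"))?;
    let meta: serde_json::Value =
        serde_json::from_reader(file).with_context(|| format!("failed to parse {path}"))?;
    let digest = meta
        .get("seqcol_digest")
        .and_then(|v| v.as_str())
        .context("no string-valued `seqcol_digest` field in the metadata file")?;
    Ok(digest.to_string())
}

fn main() -> anyhow::Result<()> {
    // Assumes the run used `--output P`, so the metadata lives in `P.meta_info.json`.
    let digest = read_seqcol_digest("P.meta_info.json")?;
    println!("reference sequence collection digest: {digest}");
    Ok(())
}
```

Checking this digest for equality across samples before a joint or differential analysis is a cheap way to catch a mismatched transcriptome.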