From abb51398c1e09c5241897cf2552a8379b14b7b7c Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Wed, 2 Jun 2021 00:30:25 -0400 Subject: [PATCH 1/9] tweaks for sketch --- scripts/fetchPufferfish.sh | 4 +- src/SalmonAlevin.cpp | 87 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 85 insertions(+), 6 deletions(-) diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index b2fde1a17..21a12492b 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -22,8 +22,8 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then rm -fr ${INSTALL_DIR}/src/pufferfish fi -SVER=salmon-v1.5.0 -#SVER=develop +#SVER=salmon-v1.5.0 +SVER=develop #SVER=sketch-mode EXPECTED_SHA256=e72470c58a7f9b1f66dece73ebf27df07b24b1585d4b155466f165dc9dfcf586 diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index 2395d6b9e..88b1d5c2e 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -499,7 +499,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re fw_score += score_inc; ++fw_hits; added = true; - } + } return added; } @@ -555,6 +555,13 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re SimpleHit{false, approx_pos_rc, rc_score, rc_hits, std::numeric_limits::max()}; } + inline std::string to_string() { + std::stringstream ss; + ss << "fw_hits: " << fw_hits << ", fw_score : " << fw_score << ", fw_pos : " << approx_pos_fw + << " || rc_hits: " << rc_hits << ", rc_score: " << rc_score << ", rc_pos: " << approx_pos_rc; + return ss.str(); + } + int32_t last_read_pos_fw{-1}; int32_t last_read_pos_rc{-1}; @@ -648,6 +655,12 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re extraBAMtags.clear(); bool seqOk; + // keep track of the *least* freqeuntly + // occurring hit in this fragment to consider + // alternative processing of fragments where + // all observed hits have high frequency. + uint64_t alt_max_occ = 0; + if (alevinOpts.protocol.end == bcEnd::FIVE || alevinOpts.protocol.end == bcEnd::THREE){ bool extracted_bc = aut::extractBarcode(rp.first.seq, rp.second.seq, alevinOpts.protocol, barcode); @@ -692,14 +705,24 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re decltype(raw_hits[0].first) prev_read_pos = -1; // the maximum span the supporting k-mers of a // mapping position are allowed to have. - int32_t max_stretch = static_cast(readSubSeq->length() * 1.5); + int32_t max_stretch = static_cast(readSubSeq->length() * 1.25); // a raw hit is a pair of read_pos and a projected hit + + // the least frequent hit for this fragment. + uint64_t min_occ = std::numeric_limits::max(); + + // this is false by default and will be set to true + // if *every* collected hit for this fragment occurs + // salmonOpts.maxReadOccs times or more. + bool had_alt_max_occ = false; + for (auto& raw_hit : raw_hits) { auto& read_pos = raw_hit.first; auto& proj_hits = raw_hit.second; auto& refs = proj_hits.refRange; uint64_t num_occ = static_cast(refs.size()); + min_occ = std::min(min_occ, num_occ); // SANITY if (read_pos <= prev_read_pos) { @@ -729,6 +752,61 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re } // DONE : if (static_cast(refs.size()) < salmonOpts.maxReadOccs) } // DONE : for (auto& raw_hit : raw_hits) + // If our default threshold was too stringent, then set a more liberal + // threshold and look up the k-mers that occur the least frequently. + // Specifically, if the min occuring hits have frequency < min_thresh_prime (2500 by default) + // times, then collect the min occuring hits to get the mapping. + // TODO: deal with code duplication below. + size_t max_allowed_occ = 2500; + if ((min_occ >= salmonOpts.maxReadOccs) and (min_occ < max_allowed_occ)) { + prev_read_pos = -1; + max_allowed_occ = min_occ; + for (auto& raw_hit : raw_hits) { + auto& read_pos = raw_hit.first; + auto& proj_hits = raw_hit.second; + auto& refs = proj_hits.refRange; + uint64_t num_occ = static_cast(refs.size()); + min_occ = std::min(min_occ, num_occ); + had_alt_max_occ = true; + + // SANITY + if (read_pos <= prev_read_pos) { + salmonOpts.jointLog->warn( + "read_pos : {}, prev_read_pos : {}", read_pos, + prev_read_pos); + } + + prev_read_pos = read_pos; + if (num_occ <= max_allowed_occ) { + + ++num_valid_hits; + total_occs += num_occ; + largest_occ = + (num_occ > largest_occ) ? num_occ : largest_occ; + float score_inc = 1.0 / num_occ; + perfect_score += score_inc; + + for (auto& pos_it : refs) { + const auto& ref_pos_ori = proj_hits.decodeHit(pos_it); + uint32_t tid = static_cast( + qidx->getRefId(pos_it.transcript_id())); + int32_t pos = static_cast(ref_pos_ori.pos); + bool ori = ref_pos_ori.isFW; + if (ori) { + hit_map[tid].add_fw(pos, + static_cast(read_pos), + max_stretch, score_inc); + } else { + hit_map[tid].add_rc(pos, + static_cast(read_pos), + max_stretch, score_inc); + } + } // DONE: for (auto &pos_it : refs) + } // DONE : if (static_cast(refs.size()) < + // salmonOpts.maxReadOccs) + } // DONE : for (auto& raw_hit : raw_hits) + } + //float perfect_score = static_cast(num_valid_hits) / total_occs; float acceptable_score = (num_valid_hits == 1) ? perfect_score : perfect_score - (1.0f / largest_occ); @@ -760,6 +838,8 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re } } + alt_max_occ = had_alt_max_occ ? accepted_hits.size() : salmonOpts.maxReadOccs; + /* if (accepted_hits.empty() and (num_valid_hits > 1) and (best_alt_hits >= num_valid_hits - 1)) { for (auto& kv : hit_map) { @@ -796,9 +876,8 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re std::max(readLenLeft, readLenRight)); } - // If the read mapped to > maxReadOccs places, discard it - if (accepted_hits.size() > salmonOpts.maxReadOccs) { + if (accepted_hits.size() > alt_max_occ ) { accepted_hits.clear(); } else if (!accepted_hits.empty()) { mapType = salmon::utils::MappingType::SINGLE_MAPPED; From 2a5920c377a561ec1a7c01e761fea16c01563fc7 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 3 Jun 2021 22:13:25 -0400 Subject: [PATCH 2/9] be more stringent! --- src/SalmonAlevin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index 88b1d5c2e..8858f2ca4 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -705,7 +705,7 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re decltype(raw_hits[0].first) prev_read_pos = -1; // the maximum span the supporting k-mers of a // mapping position are allowed to have. - int32_t max_stretch = static_cast(readSubSeq->length() * 1.25); + int32_t max_stretch = static_cast(readSubSeq->length() * 1.1); // a raw hit is a pair of read_pos and a projected hit From abde84a2f345f6d389a326ecc0682c9c77de1135 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Thu, 3 Jun 2021 23:51:08 -0400 Subject: [PATCH 3/9] rc is tricky ... fixed! --- src/SalmonAlevin.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index 8858f2ca4..15b7db7cc 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -515,9 +515,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re // This ensures that we don't double-count a k-mer that // might occur twice on this target. if (ref_pos < last_ref_pos_rc and read_pos > last_read_pos_rc) { - approx_pos_rc = ref_pos - read_pos; + approx_pos_rc = ref_pos; if (last_read_pos_rc == -1) { - approx_end_pos_rc = ref_pos - read_pos; + approx_end_pos_rc = ref_pos + read_pos; } else { if (approx_end_pos_rc - approx_pos_rc > max_stretch) { return false;} } @@ -705,7 +705,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re decltype(raw_hits[0].first) prev_read_pos = -1; // the maximum span the supporting k-mers of a // mapping position are allowed to have. - int32_t max_stretch = static_cast(readSubSeq->length() * 1.1); + // NOTE this is still > read_length b/c the stretch is measured wrt the + // START of the terminal k-mer. + int32_t max_stretch = static_cast(readSubSeq->length() * 1.0); // a raw hit is a pair of read_pos and a projected hit From 9580fd5ecab0226d3ee653c6e6b83c58f4ed6b7c Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Fri, 11 Jun 2021 14:22:24 -0400 Subject: [PATCH 4/9] fix bug where integer width for UMI was tied to barcode --- src/SalmonAlevin.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/SalmonAlevin.cpp b/src/SalmonAlevin.cpp index 15b7db7cc..bd15d8c90 100644 --- a/src/SalmonAlevin.cpp +++ b/src/SalmonAlevin.cpp @@ -149,6 +149,14 @@ namespace alevin{ constexpr uint32_t miniBatchSize{5000}; + // NOTE: Consider the implications of using uint32_t below. + // Currently, this should not limit the barcode length to <= 16 in + // "fry" mode (either sla or sketch) since we never actually put the + // barcode in the corresponding variable of the AlignmentGroup class. + // In traditional alevin, this should be referring to a barcode *index* + // rather than the sequence itself, so it should be OK so long as we have + // < std::numeric_limits::max() distinct barcodes / reads. + // However, that assumption should be tested more thoroughly. using CellBarcodeT = uint32_t; using UMIBarcodeT = uint64_t; @@ -895,9 +903,9 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re // bc // if we can fit the barcode into an integer - if ( alevinOpts.protocol.barcodeLength <= 32 ) { + if ( barcodeLength <= 32 ) { if (barcode_ok) { - if (alevinOpts.protocol.barcodeLength <= 16) { // can use 32-bit int + if ( barcodeLength <= 16 ) { // can use 32-bit int uint32_t shortbck = static_cast(0x00000000FFFFFFFF & bck.word(0)); bw << shortbck; } else { // must use 64-bit int @@ -909,11 +917,11 @@ void process_reads_sc_sketch(paired_parser* parser, ReadExperimentT& readExp, Re } // umi - if ( alevinOpts.protocol.barcodeLength <= 16 ) { // if we can use 32-bit int + if ( umiLength <= 16 ) { // if we can use 32-bit int uint64_t umiint = jointHitGroup.umi(); uint32_t shortumi = static_cast(0x00000000FFFFFFFF & umiint); bw << shortumi; - } else if ( alevinOpts.protocol.barcodeLength <= 32 ) { // if we can use 64-bit int + } else if ( umiLength <= 32 ) { // if we can use 64-bit int uint64_t umiint = jointHitGroup.umi(); bw << umiint; } else { // must use string @@ -1336,11 +1344,11 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea // bc // if we can if the barcode into an integer - if ( alevinOpts.protocol.barcodeLength <= 32 ) { + if ( barcodeLength <= 32 ) { if (barcode_ok) { //alevinOpts.jointLog->info("BARCODE : {} \t ENC : {} ", barcode, bck.word(0)); - if (alevinOpts.protocol.barcodeLength <= 16) { // can use 32-bit int + if ( barcodeLength <= 16 ) { // can use 32-bit int uint32_t shortbck = static_cast(0x00000000FFFFFFFF & bck.word(0)); //alevinOpts.jointLog->info("shortbck : {} ", shortbck); bw << shortbck; @@ -1353,11 +1361,11 @@ void process_reads_sc_align(paired_parser* parser, ReadExperimentT& readExp, Rea } // umi - if ( alevinOpts.protocol.barcodeLength <= 16 ) { // if we can use 32-bit int + if ( umiLength <= 16 ) { // if we can use 32-bit int uint64_t umiint = jointHitGroup.umi(); uint32_t shortumi = static_cast(0x00000000FFFFFFFF & umiint); bw << shortumi; - } else if ( alevinOpts.protocol.barcodeLength <= 32 ) { // if we can use 64-bit int + } else if ( umiLength <= 32 ) { // if we can use 64-bit int uint64_t umiint = jointHitGroup.umi(); bw << umiint; } else { // must use string From a7a6f1319a3fbc19611fd55b76ea8c6be82c5a0e Mon Sep 17 00:00:00 2001 From: Avi Srivastava Date: Sat, 12 Jun 2021 23:38:53 -0400 Subject: [PATCH 5/9] fixing alevin issue --- include/SingleCellProtocols.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index acc242201..9dfc85413 100644 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -103,8 +103,8 @@ namespace alevin{ } // NOTE: these do nothing but satisfy // template requirements right now - void set_umi_geo(TagGeometry& g) { (void)g; }; - void set_bc_geo(TagGeometry& g) { (void)g; }; + void set_umi_geo(TagGeometry& g) { umiLength = g.length1 + g.length2; }; + void set_bc_geo(TagGeometry& g) { barcodeLength = g.length1 + g.length2; }; void set_read_geo(TagGeometry& g) { (void)g; }; uint32_t barcode_length() const { return barcodeLength; } uint32_t umi_length() const { return umiLength; } From c37855c3cf882114d20f7b42d752f50e40b2fc8d Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sun, 13 Jun 2021 01:04:17 -0400 Subject: [PATCH 6/9] minor tweak --- include/SingleCellProtocols.hpp | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index 9dfc85413..a4fc4483d 100644 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -101,10 +101,13 @@ namespace alevin{ maxValue(maxValue_){ alevin::types::AlevinUMIKmer::k(umiLength); } - // NOTE: these do nothing but satisfy - // template requirements right now - void set_umi_geo(TagGeometry& g) { umiLength = g.length1 + g.length2; }; - void set_bc_geo(TagGeometry& g) { barcodeLength = g.length1 + g.length2; }; + // NOTE: these functions are duplicated + // with those in `CustomGeometry` below, and + // due to semantics have slightly different + // implementations. See if the design can be + // unified later. + void set_umi_geo(TagGeometry& g) { umiLength = g.length(); }; + void set_bc_geo(TagGeometry& g) { barcodeLength = g.length(); }; void set_read_geo(TagGeometry& g) { (void)g; }; uint32_t barcode_length() const { return barcodeLength; } uint32_t umi_length() const { return umiLength; } @@ -185,14 +188,15 @@ namespace alevin{ TagGeometry bc_geo; TagGeometry read_geo; - void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length1 + umi_geo.length2; } - void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length1 + bc_geo.length2; } + void set_umi_geo(TagGeometry& g) { umi_geo = g; umiLength = umi_geo.length(); } + void set_bc_geo(TagGeometry& g) { bc_geo = g; barcodeLength = bc_geo.length(); } void set_read_geo(TagGeometry& g) { read_geo = g; } uint32_t barcode_length() const { return barcodeLength; } uint32_t umi_length() const { return umiLength; } - // These values do nothing in this class except - // maintain template compat ... fix this design later. + // These values are set only when `set_umi_geo` and + // `set_bc_geo` are called. See if this design can + // be better integrated with `Rule` later. uint32_t barcodeLength, umiLength, maxValue; BarcodeEnd end; }; From e8c47c57817d4f01eb815927b8908d649e2c68b5 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sun, 13 Jun 2021 01:38:21 -0400 Subject: [PATCH 7/9] version bump to prepare for next release --- current_version.txt | 2 +- doc/source/conf.py | 2 +- docker/Dockerfile | 2 +- docker/build_test.sh | 2 +- include/SalmonConfig.hpp | 4 ++-- include/SingleCellProtocols.hpp | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/current_version.txt b/current_version.txt index bf7d551fa..233e1d268 100644 --- a/current_version.txt +++ b/current_version.txt @@ -1,3 +1,3 @@ VERSION_MAJOR 1 VERSION_MINOR 5 -VERSION_PATCH 0 +VERSION_PATCH 1 diff --git a/doc/source/conf.py b/doc/source/conf.py index 16372a206..270df9eeb 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -57,7 +57,7 @@ # The short X.Y version. version = '1.5' # The full version, including alpha/beta/rc tags. -release = '1.5.0' +release = '1.5.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docker/Dockerfile b/docker/Dockerfile index 464d6c3da..a4f2d2de0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,7 +6,7 @@ MAINTAINER salmon.maintainer@gmail.com ENV PACKAGES git gcc make g++ libboost-all-dev liblzma-dev libbz2-dev \ ca-certificates zlib1g-dev libcurl4-openssl-dev curl unzip autoconf apt-transport-https ca-certificates gnupg software-properties-common wget -ENV SALMON_VERSION 1.5.0 +ENV SALMON_VERSION 1.5.1 # salmon binary will be installed in /home/salmon/bin/salmon diff --git a/docker/build_test.sh b/docker/build_test.sh index d24c156c5..1eac40c90 100644 --- a/docker/build_test.sh +++ b/docker/build_test.sh @@ -1,3 +1,3 @@ #! /bin/bash -SALMON_VERSION=1.5.0 +SALMON_VERSION=1.5.1 docker build --no-cache -t combinelab/salmon:${SALMON_VERSION} -t combinelab/salmon:latest . diff --git a/include/SalmonConfig.hpp b/include/SalmonConfig.hpp index 84a35d922..85160af21 100644 --- a/include/SalmonConfig.hpp +++ b/include/SalmonConfig.hpp @@ -27,8 +27,8 @@ namespace salmon { constexpr char majorVersion[] = "1"; constexpr char minorVersion[] = "5"; -constexpr char patchVersion[] = "0"; -constexpr char version[] = "1.5.0"; +constexpr char patchVersion[] = "1"; +constexpr char version[] = "1.5.1"; constexpr uint32_t indexVersion = 5; constexpr char requiredQuasiIndexVersion[] = "p7"; } // namespace salmon diff --git a/include/SingleCellProtocols.hpp b/include/SingleCellProtocols.hpp index a4fc4483d..252f8d4a6 100644 --- a/include/SingleCellProtocols.hpp +++ b/include/SingleCellProtocols.hpp @@ -104,7 +104,7 @@ namespace alevin{ // NOTE: these functions are duplicated // with those in `CustomGeometry` below, and // due to semantics have slightly different - // implementations. See if the design can be + // implementations. See if the design can be // unified later. void set_umi_geo(TagGeometry& g) { umiLength = g.length(); }; void set_bc_geo(TagGeometry& g) { barcodeLength = g.length(); }; From 80a5e19b4fcc56526f71d98f6d4fc6c932ebc2fc Mon Sep 17 00:00:00 2001 From: Avi Srivastava Date: Sun, 13 Jun 2021 19:36:52 -0400 Subject: [PATCH 8/9] blocking citeseq with custom --- src/Alevin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Alevin.cpp b/src/Alevin.cpp index 1f8ccc218..d7215d61d 100644 --- a/src/Alevin.cpp +++ b/src/Alevin.cpp @@ -1038,7 +1038,7 @@ salmon-based processing of single-cell RNA-seq data. if (celseq) validate_num_protocols += 1; if (celseq2) validate_num_protocols += 1; if (quartzseq2) validate_num_protocols += 1; - if (custom and !noTgMap) validate_num_protocols += 1; + if (custom) validate_num_protocols += 1; if ( validate_num_protocols != 1 ) { fmt::print(stderr, "ERROR: Please specify one and only one scRNA protocol;"); From 2a3765fae8816246da3aa247936bfe65345ed358 Mon Sep 17 00:00:00 2001 From: Rob Patro Date: Sun, 13 Jun 2021 23:12:12 -0400 Subject: [PATCH 9/9] fetch from 1.5.1 tag --- scripts/fetchPufferfish.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/fetchPufferfish.sh b/scripts/fetchPufferfish.sh index 21a12492b..3f17ebeea 100755 --- a/scripts/fetchPufferfish.sh +++ b/scripts/fetchPufferfish.sh @@ -22,11 +22,11 @@ if [ -d ${INSTALL_DIR}/src/pufferfish ] ; then rm -fr ${INSTALL_DIR}/src/pufferfish fi -#SVER=salmon-v1.5.0 -SVER=develop +SVER=salmon-v1.5.1 +#SVER=develop #SVER=sketch-mode -EXPECTED_SHA256=e72470c58a7f9b1f66dece73ebf27df07b24b1585d4b155466f165dc9dfcf586 +EXPECTED_SHA256=468e0c23a32d81524f7acadc8326efb155628970c15fd6cb843d26a61478bfde mkdir -p ${EXTERNAL_DIR} curl -k -L https://github.com/COMBINE-lab/pufferfish/archive/${SVER}.zip -o ${EXTERNAL_DIR}/pufferfish.zip