diff --git a/LICENSE.txt b/LICENSE.txt
index 150bc54c..fa815670 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -159,5 +159,5 @@ ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.
END OF TERMS AND CONDITIONS
-Genozip license version: 15.0.66
+Genozip license version: 15.0.67
diff --git a/RELEASE_NOTES.txt b/RELEASE_NOTES.txt
index 5916074f..6ff742a5 100644
--- a/RELEASE_NOTES.txt
+++ b/RELEASE_NOTES.txt
@@ -3,6 +3,9 @@ Note on versioning:
- Minor version changes with bug fixes and minor feature updates
- Some minor versions are skipped due to failed deployment pipelines
+15.0.67 23/9/2024
+- Improvements in Deep.
+
15.0.66 15/9/2024
- BAM: better compression of PacBio and Nanopore files generated with minimap2, pbmm2, winnowmap
- BAM: further small reduction in RAM consumption when compressing and uncompressing SAM/BAM/CRAM files with many Supplementary and Secondary alignments
diff --git a/genozip.code-workspace b/genozip.code-workspace
index e5392d0d..b150ee7b 100644
--- a/genozip.code-workspace
+++ b/genozip.code-workspace
@@ -66,7 +66,8 @@
"string": "c",
"string_view": "c",
"bzlib.h": "c",
- "memory": "c"
+ "memory": "c",
+ "vector": "c"
}
}
}
\ No newline at end of file
diff --git a/installers/LICENSE.html b/installers/LICENSE.html
index 990cd96c..23b23f51 100644
--- a/installers/LICENSE.html
+++ b/installers/LICENSE.html
@@ -34,4 +34,4 @@
10. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides Genozip on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Genozip and assume any risks associated with Your exercise of permissions under this License.
11. LIMITATION OF LIABILITY. TO THE FULLEST EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, STRICT LIABILITY OR OTHER LEGAL OR EQUITABLE THEORY, SHALL LICENSOR OR DEVELOPER BE LIABLE FOR DAMAGES, INCLUDING ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY CHARACTER ARISING AS A RESULT OF THIS LICENSE OR OUT OF THE USE OR INABILITY TO USE GENOZIP (INCLUDING BUT NOT LIMITED TO DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, FILE CORRUPTION, DATA LOSS, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES), EVEN IF LICENSOR OR DEVELOPER HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL LICENSOR'S OR DEVELOPER'S TOTAL LIABILITY TO LICENSEE FOR ALL DAMAGES (OTHER THAN AS MAY BE REQUIRED BY APPLICABLE LAW IN CASES INVOLVING PERSONAL INJURY) EXCEED THE AMOUNT OF $500 USD. THE FOREGOING LIMITATIONS WILL APPLY EVEN IF THE ABOVE STATED REMEDY FAILS OF ITS ESSENTIAL PURPOSE.
END OF TERMS AND CONDITIONS
-Genozip license version: 15.0.66
+Genozip license version: 15.0.67
diff --git a/installers/genozip-installer.exe b/installers/genozip-installer.exe
index f5223acb..74c9d65b 100644
Binary files a/installers/genozip-installer.exe and b/installers/genozip-installer.exe differ
diff --git a/installers/genozip-linux-x86_64.tar b/installers/genozip-linux-x86_64.tar
index 54d0921d..b7cb328c 100644
Binary files a/installers/genozip-linux-x86_64.tar and b/installers/genozip-linux-x86_64.tar differ
diff --git a/installers/genozip-osx-arm.tar b/installers/genozip-osx-arm.tar
index dca82544..1ab2ef98 100644
Binary files a/installers/genozip-osx-arm.tar and b/installers/genozip-osx-arm.tar differ
diff --git a/installers/genozip-osx-x86.tar b/installers/genozip-osx-x86.tar
index 5f78924e..4cf7c9a5 100644
Binary files a/installers/genozip-osx-x86.tar and b/installers/genozip-osx-x86.tar differ
diff --git a/src/Makefile b/src/Makefile
index 9541568e..5ddadc37 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -141,7 +141,7 @@ LATEST_SRC = ../../genozip-latest/src
SRC_DIRS = secure zlib bzlib lzma bsc libdeflate_1.7 libdeflate_1.19 libdeflate_1.19/x86 libdeflate_1.19/arm \
htscodecs igzip igzip/aarch64 igzip/x86_64 igzip/noarch
-MY_SRCS = genozip.c genols.c context.c container.c strings.c stats.c arch.c tip.c seg_id.c zip_dyn_int.c \
+MY_SRCS = genozip.c genols.c context.c container.c strings.c crc64.c stats.c arch.c tip.c seg_id.c zip_dyn_int.c\
data_types.c bits.c progress.c writer.c zriter.c tar.c chrom.c qname.c tokenizer.c mutex.c threads.c \
zip.c piz.c reconstruct.c recon_history.c recon_peek.c seg.c zfile.c aligner.c flags.c specials.c \
reference.c contigs.c ref_lock.c refhash.c ref_make.c ref_contigs.c ref_iupacs.c ref_cache.c digest.c \
diff --git a/src/bam_seg.c b/src/bam_seg.c
index 56161867..b15475d5 100644
--- a/src/bam_seg.c
+++ b/src/bam_seg.c
@@ -200,7 +200,7 @@ uint32_t bam_split_aux (VBlockSAMP vb, rom alignment, rom aux, rom after_aux, ro
void bam_seg_BIN (VBlockSAMP vb, ZipDataLineSAMP dl, uint16_t bin /* used only in bam */, bool is_bam)
{
PosType32 this_pos = dl->POS;
- PosType32 last_pos = dl->FLAG.unmapped ? this_pos : (this_pos + vb->ref_consumed - 1);
+ PosType32 last_pos = dl->FLAG.unmapped ? this_pos : (this_pos + vb->ref_consumed - 1); // note: it is possible to have RNAME/POS set and FLAG.unmapped. The SAM spec calls this "placed unmapped";
uint16_t reg2bin = bam_reg2bin (this_pos, last_pos); // zero-based, half-closed half-open [start,end)
if (!is_bam || (last_pos <= MAX_POS_SAM && reg2bin == bin))
@@ -549,7 +549,7 @@ rom bam_seg_txt_line (VBlockP vb_, rom alignment /* BAM terminology for one line
// finally we can segment the textual CIGAR now (including if n_cigar_op=0)
sam_seg_CIGAR (vb, dl, vb->textual_cigar.len32, STRb(vb->textual_seq), qual, l_seq,
- ((uint32_t)n_cigar_op * sizeof (uint32_t) /* cigar */ + sizeof (uint16_t) /* n_cigar_op */));
+ ((uint32_t)n_cigar_op * sizeof (uint32_t) /* cigar */ + sizeof (uint16_t) /* n_cigar_op */));
// QUAL. note: can only be called after sam_seg_CIGAR updates SEQ.len
if (!vb->qual_missing) // case we have both SEQ and QUAL
diff --git a/src/codec_domq.c b/src/codec_domq.c
index e1c81596..bef659a4 100644
--- a/src/codec_domq.c
+++ b/src/codec_domq.c
@@ -711,7 +711,7 @@ CODEC_RECONSTRUCT (codec_domq_reconstruct)
START_TIMER;
if (!ctx->is_loaded && !(ctx+1)->is_loaded && !(ctx+2)->is_loaded && !(ctx+3)->is_loaded) return;
-
+
ContextP declare_domq_contexts (ctx);
// case: up to v13, all reads were compressed with the same dom (no multiplexing)
diff --git a/src/conda/meta.template.yaml b/src/conda/meta.template.yaml
index 08dc5e27..2858a0bc 100644
--- a/src/conda/meta.template.yaml
+++ b/src/conda/meta.template.yaml
@@ -27,17 +27,21 @@ build:
requirements:
build:
- {{ posix }}make
+ - {{ posix }}filesystem # [win]
+ - {{ posix }}sed # [win]
+ - {{ posix }}coreutils # [win]
+ - {{ posix }}zip # [win]
- nasm # [not arm64]
- - {{ compiler('c') }} # [not win]
- - {{ compiler('cxx') }} # [not win]
+ - {{ compiler('c') }} # [unix]
+ - {{ compiler('cxx') }} # [unix]
+ - {{ stdlib("c") }} # [unix]
- {{ compiler('m2w64_c') }} # [win]
- - {{ posix }}sed # [win]
- - {{ posix }}coreutils # [win]
-
- c_stdlib: # added per: https://github.com/conda-forge/conda-forge.github.io/issues/2102
- - sysroot # [linux]
- - macosx_deployment_target # [osx]
- # - vs # [win]
+ - {{ compiler('m2w64_cxx') }} # [win]
+ - {{ stdlib("m2w64_c") }} # [win]
+
+ # c_stdlib: # added per: https://github.com/conda-forge/conda-forge.github.io/issues/2102
+ # - sysroot # [linux]
+ # - macosx_deployment_target # [osx]
c_stdlib_version: # [unix]
- 2.12 # [linux and x86_64]
@@ -46,9 +50,12 @@ requirements:
- 11.0 # [osx and arm64]
host:
- - {{ native }}gcc-libs # [win]
+# - {{ native }}gcc-libs # [win]
+ - pthreads-win32 # [win]
+
run:
- - {{ native }}gcc-libs # [win]
+# - {{ native }}gcc-libs # [win]
+ - pthreads-win32 # [win]
- curl
test:
@@ -64,7 +71,7 @@ about:
license_family: OTHER
license_file:
- LICENSE.txt
- summary: Compressor for genomic files (FASTQ, BAM, VCF, FASTA and more), up to 5x better than gzip and faster too
+ summary: Lossless compression of FASTQ, BAM, VCF, FASTA - 2x-10x better than .gz / .cram
description: |
__README_MD__
diff --git a/src/crc64.c b/src/crc64.c
new file mode 100644
index 00000000..4b5ef660
--- /dev/null
+++ b/src/crc64.c
@@ -0,0 +1,121 @@
+
+/* Redis uses the CRC64 variant with "Jones" coefficients and init value of 0.
+ *
+ * Specification of this CRC64 variant follows:
+ * Name: crc-64-jones
+ * Width: 64 bites
+ * Poly: 0xad93d23594c935a9
+ * Reflected In: True
+ * Xor_In: 0xffffffffffffffff
+ * Reflected_Out: True
+ * Xor_Out: 0x0
+ * Check("123456789"): 0xe9c6d914c4b8d9ca
+ *
+ * Copyright (c) 2012, Salvatore Sanfilippo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE. */
+
+// Genozip comment:
+// This file was obtained from https://raw.githubusercontent.com/srned/baselib/refs/heads/master/crc64.c and modified for Genozip purposes. Modifications are:
+// Copyright (C) 2024-2024 Genozip Limited. Patent Pending. Please see terms and conditions in the file LICENSE.txt
+
+#include "genozip.h"
+
+uint64_t crc64 (uint64_t crc, bytes data, uint64_t data_len)
+{
+ static const uint64_t crc64_tab[256] = {
+ 0x0000000000000000, 0x7ad870c830358979, 0xf5b0e190606b12f2, 0x8f689158505e9b8b,
+ 0xc038e5739841b68f, 0xbae095bba8743ff6, 0x358804e3f82aa47d, 0x4f50742bc81f2d04,
+ 0xab28ecb46814fe75, 0xd1f09c7c5821770c, 0x5e980d24087fec87, 0x24407dec384a65fe,
+ 0x6b1009c7f05548fa, 0x11c8790fc060c183, 0x9ea0e857903e5a08, 0xe478989fa00bd371,
+ 0x7d08ff3b88be6f81, 0x07d08ff3b88be6f8, 0x88b81eabe8d57d73, 0xf2606e63d8e0f40a,
+ 0xbd301a4810ffd90e, 0xc7e86a8020ca5077, 0x4880fbd87094cbfc, 0x32588b1040a14285,
+ 0xd620138fe0aa91f4, 0xacf86347d09f188d, 0x2390f21f80c18306, 0x594882d7b0f40a7f,
+ 0x1618f6fc78eb277b, 0x6cc0863448deae02, 0xe3a8176c18803589, 0x997067a428b5bcf0,
+ 0xfa11fe77117cdf02, 0x80c98ebf2149567b, 0x0fa11fe77117cdf0, 0x75796f2f41224489,
+ 0x3a291b04893d698d, 0x40f16bccb908e0f4, 0xcf99fa94e9567b7f, 0xb5418a5cd963f206,
+ 0x513912c379682177, 0x2be1620b495da80e, 0xa489f35319033385, 0xde51839b2936bafc,
+ 0x9101f7b0e12997f8, 0xebd98778d11c1e81, 0x64b116208142850a, 0x1e6966e8b1770c73,
+ 0x8719014c99c2b083, 0xfdc17184a9f739fa, 0x72a9e0dcf9a9a271, 0x08719014c99c2b08,
+ 0x4721e43f0183060c, 0x3df994f731b68f75, 0xb29105af61e814fe, 0xc849756751dd9d87,
+ 0x2c31edf8f1d64ef6, 0x56e99d30c1e3c78f, 0xd9810c6891bd5c04, 0xa3597ca0a188d57d,
+ 0xec09088b6997f879, 0x96d1784359a27100, 0x19b9e91b09fcea8b, 0x636199d339c963f2,
+ 0xdf7adabd7a6e2d6f, 0xa5a2aa754a5ba416, 0x2aca3b2d1a053f9d, 0x50124be52a30b6e4,
+ 0x1f423fcee22f9be0, 0x659a4f06d21a1299, 0xeaf2de5e82448912, 0x902aae96b271006b,
+ 0x74523609127ad31a, 0x0e8a46c1224f5a63, 0x81e2d7997211c1e8, 0xfb3aa75142244891,
+ 0xb46ad37a8a3b6595, 0xceb2a3b2ba0eecec, 0x41da32eaea507767, 0x3b024222da65fe1e,
+ 0xa2722586f2d042ee, 0xd8aa554ec2e5cb97, 0x57c2c41692bb501c, 0x2d1ab4dea28ed965,
+ 0x624ac0f56a91f461, 0x1892b03d5aa47d18, 0x97fa21650afae693, 0xed2251ad3acf6fea,
+ 0x095ac9329ac4bc9b, 0x7382b9faaaf135e2, 0xfcea28a2faafae69, 0x8632586aca9a2710,
+ 0xc9622c4102850a14, 0xb3ba5c8932b0836d, 0x3cd2cdd162ee18e6, 0x460abd1952db919f,
+ 0x256b24ca6b12f26d, 0x5fb354025b277b14, 0xd0dbc55a0b79e09f, 0xaa03b5923b4c69e6,
+ 0xe553c1b9f35344e2, 0x9f8bb171c366cd9b, 0x10e3202993385610, 0x6a3b50e1a30ddf69,
+ 0x8e43c87e03060c18, 0xf49bb8b633338561, 0x7bf329ee636d1eea, 0x012b592653589793,
+ 0x4e7b2d0d9b47ba97, 0x34a35dc5ab7233ee, 0xbbcbcc9dfb2ca865, 0xc113bc55cb19211c,
+ 0x5863dbf1e3ac9dec, 0x22bbab39d3991495, 0xadd33a6183c78f1e, 0xd70b4aa9b3f20667,
+ 0x985b3e827bed2b63, 0xe2834e4a4bd8a21a, 0x6debdf121b863991, 0x1733afda2bb3b0e8,
+ 0xf34b37458bb86399, 0x8993478dbb8deae0, 0x06fbd6d5ebd3716b, 0x7c23a61ddbe6f812,
+ 0x3373d23613f9d516, 0x49aba2fe23cc5c6f, 0xc6c333a67392c7e4, 0xbc1b436e43a74e9d,
+ 0x95ac9329ac4bc9b5, 0xef74e3e19c7e40cc, 0x601c72b9cc20db47, 0x1ac40271fc15523e,
+ 0x5594765a340a7f3a, 0x2f4c0692043ff643, 0xa02497ca54616dc8, 0xdafce7026454e4b1,
+ 0x3e847f9dc45f37c0, 0x445c0f55f46abeb9, 0xcb349e0da4342532, 0xb1eceec59401ac4b,
+ 0xfebc9aee5c1e814f, 0x8464ea266c2b0836, 0x0b0c7b7e3c7593bd, 0x71d40bb60c401ac4,
+ 0xe8a46c1224f5a634, 0x927c1cda14c02f4d, 0x1d148d82449eb4c6, 0x67ccfd4a74ab3dbf,
+ 0x289c8961bcb410bb, 0x5244f9a98c8199c2, 0xdd2c68f1dcdf0249, 0xa7f41839ecea8b30,
+ 0x438c80a64ce15841, 0x3954f06e7cd4d138, 0xb63c61362c8a4ab3, 0xcce411fe1cbfc3ca,
+ 0x83b465d5d4a0eece, 0xf96c151de49567b7, 0x76048445b4cbfc3c, 0x0cdcf48d84fe7545,
+ 0x6fbd6d5ebd3716b7, 0x15651d968d029fce, 0x9a0d8ccedd5c0445, 0xe0d5fc06ed698d3c,
+ 0xaf85882d2576a038, 0xd55df8e515432941, 0x5a3569bd451db2ca, 0x20ed197575283bb3,
+ 0xc49581ead523e8c2, 0xbe4df122e51661bb, 0x3125607ab548fa30, 0x4bfd10b2857d7349,
+ 0x04ad64994d625e4d, 0x7e7514517d57d734, 0xf11d85092d094cbf, 0x8bc5f5c11d3cc5c6,
+ 0x12b5926535897936, 0x686de2ad05bcf04f, 0xe70573f555e26bc4, 0x9ddd033d65d7e2bd,
+ 0xd28d7716adc8cfb9, 0xa85507de9dfd46c0, 0x273d9686cda3dd4b, 0x5de5e64efd965432,
+ 0xb99d7ed15d9d8743, 0xc3450e196da80e3a, 0x4c2d9f413df695b1, 0x36f5ef890dc31cc8,
+ 0x79a59ba2c5dc31cc, 0x037deb6af5e9b8b5, 0x8c157a32a5b7233e, 0xf6cd0afa9582aa47,
+ 0x4ad64994d625e4da, 0x300e395ce6106da3, 0xbf66a804b64ef628, 0xc5bed8cc867b7f51,
+ 0x8aeeace74e645255, 0xf036dc2f7e51db2c, 0x7f5e4d772e0f40a7, 0x05863dbf1e3ac9de,
+ 0xe1fea520be311aaf, 0x9b26d5e88e0493d6, 0x144e44b0de5a085d, 0x6e963478ee6f8124,
+ 0x21c640532670ac20, 0x5b1e309b16452559, 0xd476a1c3461bbed2, 0xaeaed10b762e37ab,
+ 0x37deb6af5e9b8b5b, 0x4d06c6676eae0222, 0xc26e573f3ef099a9, 0xb8b627f70ec510d0,
+ 0xf7e653dcc6da3dd4, 0x8d3e2314f6efb4ad, 0x0256b24ca6b12f26, 0x788ec2849684a65f,
+ 0x9cf65a1b368f752e, 0xe62e2ad306bafc57, 0x6946bb8b56e467dc, 0x139ecb4366d1eea5,
+ 0x5ccebf68aecec3a1, 0x2616cfa09efb4ad8, 0xa97e5ef8cea5d153, 0xd3a62e30fe90582a,
+ 0xb0c7b7e3c7593bd8, 0xca1fc72bf76cb2a1, 0x45775673a732292a, 0x3faf26bb9707a053,
+ 0x70ff52905f188d57, 0x0a2722586f2d042e, 0x854fb3003f739fa5, 0xff97c3c80f4616dc,
+ 0x1bef5b57af4dc5ad, 0x61372b9f9f784cd4, 0xee5fbac7cf26d75f, 0x9487ca0fff135e26,
+ 0xdbd7be24370c7322, 0xa10fceec0739fa5b, 0x2e675fb4576761d0, 0x54bf2f7c6752e8a9,
+ 0xcdcf48d84fe75459, 0xb71738107fd2dd20, 0x387fa9482f8c46ab, 0x42a7d9801fb9cfd2,
+ 0x0df7adabd7a6e2d6, 0x772fdd63e7936baf, 0xf8474c3bb7cdf024, 0x829f3cf387f8795d,
+ 0x66e7a46c27f3aa2c, 0x1c3fd4a417c62355, 0x935745fc4798b8de, 0xe98f353477ad31a7,
+ 0xa6df411fbfb21ca3, 0xdc0731d78f8795da, 0x536fa08fdfd90e51, 0x29b7d047efec8728,
+ };
+
+ for (uint64_t i=0; i < data_len; i++)
+ crc = crc64_tab[(uint8_t)crc ^ data[i]] ^ (crc >> 8);
+
+ return crc;
+}
+
diff --git a/src/deep.c b/src/deep.c
index b490e442..d1f8e051 100644
--- a/src/deep.c
+++ b/src/deep.c
@@ -13,8 +13,6 @@
#define MAX_AUTO_READ_LEN 20000 // not too long, so that the "longer reads" code path also gets some mileage
-rom by_names[2] = { "BY_SEQ", "BY_QNAME" };
-
// hash of a SEQ field in the forward direction
// note: I tested crc32 after converting seq to 2-bit. No advantage - Almost identical linked-list-length histogram.
uint32_t deep_seq_hash (VBlockP vb, STRp(seq), bool is_revcomp)
@@ -90,31 +88,5 @@ uint32_t deep_qual_hash (VBlockP vb, STRp(qual), bool is_revcomp)
if (is_revcomp && qual_len > MAX_AUTO_READ_LEN)
buf_free (vb->scratch);
- // xxx possible better but slower hash for qual
- // char *my_qual;
-
- // // short enough reads - use automatic allocation
- // if (qual_len <= MAX_AUTO_READ_LEN)
- // my_qual = short_read_data;
- // else {
- // ASSERTNOTINUSE (vb->scratch);
- // buf_alloc (vb, &vb->scratch, 0, qual_len, char, 0, "scratch");
- // my_qual = B1STc (vb->scratch);
- // }
-
- // if (is_revcomp)
- // str_reverse (my_qual, STRa(qual));
-
- // for (uint32_t i=0; i < qual_len; i++)
- // my_qual[i] ^= (i & 0xff);
-
- // uint32_t hash = crc32 (0, my_qual, qual_len);
-
- // // note: qual_hash=0 means "QUAL not hashed", so both crc32=0 and crc32=1 get mapped to hash=1
- // if (hash == 0) hash = 1;
-
- // if (qual_len > MAX_AUTO_READ_LEN)
- // buf_free (vb->scratch);
-
return hash;
}
diff --git a/src/deep.h b/src/deep.h
index d71a3893..ae8f50a2 100644
--- a/src/deep.h
+++ b/src/deep.h
@@ -9,20 +9,19 @@
#pragma once
#include "qname.h"
-
-#define num_hash_bits z_file->deep_index_by[BY_SEQ].prm8[0] // ZIP: number of bits of seq_hash used for deep_hash_by_* (i.e. hash table is of size 2^num_hash_bits)
+#define num_hash_bits z_file->deep_index.prm8[0] // ZIP: number of bits of seq_hash used for deep_hash_by_* (i.e. hash table is of size 2^num_hash_bits)
#define deephash_issame(a,b) (!memcmp (&(a), &(b), sizeof (DeepHash)))
#define DEEPHASHf(a) (a).qname, (a).seq, (a).qual // for printf-like arguments
// note: in PIZ, z_file->vb_start_deep_line is indexed by vb_idx - 0-based counter of non-DEPN VBs (unlike in ZIP which is vb_i)
-#define num_deepable_sam_vbs z_file->deep_index.param
+#define num_deepable_sam_vbs z_file->deep_index.prm32[0] // PIZ
// 32 bytes
typedef struct {
- DeepHash hash; // hashes of qname, seq, qual
+ DeepHash hash; // hashes of qname (64b), seq, qual (32b)
#define NO_NEXT 0xffffffff
- uint32_t next[2]; // two linked lists (BY_SEQ, BY_QNAME) of entries with the with the same hash(SEQ)&mask
+ uint32_t next; // linked list of entries with the with the same (hash.qname & mask)
uint32_t seq_len;
union ZipZDeepPlace {
@@ -43,9 +42,9 @@ extern uint32_t deep_seq_hash (VBlockP vb, STRp(seq), bool is_revcomp);
extern uint32_t deep_qual_hash (VBlockP vb, STRp(qual), bool is_revcomp);
// ZIP: hash of canonical qname (note: in FASTQ qname is the part of DESC up to the first whitespace)
-static inline uint32_t deep_qname_hash (QType q, STRp(qname), thool is_last, uint32_t *uncanonical_suffix_len)
-{
- return qname_calc_hash (q, COMP_NONE, STRa(qname), is_last, true, uncanonical_suffix_len);
+static inline uint64_t deep_qname_hash (QType q, STRp(qname), thool is_last, uint32_t *uncanonical_suffix_len)
+{
+ return qname_calc_hash (q, COMP_NONE, STRa(qname), is_last, true, CRC64, uncanonical_suffix_len);
}
//-----------------------------------------------------------------------
@@ -56,7 +55,7 @@ static inline uint32_t deep_qname_hash (QType q, STRp(qname), thool is_last, uin
// 2: is_long_seq_comp
// 3: is_long_qual_comp
//
-// Part B: exists unless segconf.deep_qtype==QNONE or --seq-only or --qual-only
+// Part B: exists unless segconf.deep_qtype==QNONE (supported up to 15.0.66) or --seq-only or --qual-only
// 1 Byte : qname_len (up to 254 by BAM spec)
// qname_len bytes : QNAME (not compressed, not nul-terminated)
//
@@ -97,5 +96,3 @@ typedef struct { // 1 byte
uint8_t is_long_qual_comp : 1; // is qual_comp_len greater than 255
uint8_t unused : 3;
} PizZDeepFlags;
-
-extern rom by_names[2];
diff --git a/src/dict_id_gen.h b/src/dict_id_gen.h
index 3c1606e1..01759ec7 100644
--- a/src/dict_id_gen.h
+++ b/src/dict_id_gen.h
@@ -2356,11 +2356,11 @@ typedef enum { BED_CHROM, BED_START, BED_END, BED_NAME, BED_SCORE, BED_STRAND, B
#define NUM_VCF_SPECIAL 92
-#define SAM_SPECIAL_NAMES { "CIGAR", "TLEN_old", "BDBI", "delta_seq_len", "MD_old", "FLOAT", "BIN", "NM", "MD", "REF_CONSUMED", "PNEXT_IS_PREV_POS_old", "COPY_MATE_FLAG", "COPY_MATE_TLEN_old", "COPY_BUDDY_CIGAR", "FASTQ_CONSUME_AUX", "TLEN", "QUAL", "SAG", "SEQ", "PRIM_QNAME", "SQUANK", "BSSEEKER2_XO", "BSSEEKER2_XG", "BSSEEKER2_XM", "SA_main", "COPY_PRIM", "BWA_XC", "BWA_XT", "BWA_X1", "BWA_XS", "SM", "AM", "PNEXT", "DEMUX_BY_MATE", "DEMUX_BY_MATE_PRIM", "DEMUX_BY_BUDDY", "GEM3_XB", "BSBOLT_YS", "0A_RNAME", "BISMARK_XG", "HI", "DEMUX_BY_BUDDY_MAP", "SEQ_LEN", "FI", "cm", "COPY_BUDDY", "SET_BUDDY", "TX_AN_POS", "COPY_TEXTUAL_CIGAR", "BISMARK_XM", "BSBOLT_XB", "UQ", "iqsqdq", "ULTIMA_tp_old", "ULTIMA_C", "bi", "sd", "AGENT_RX", "AGENT_QX", "qname_rng2seq_len", "DEMUX_BY_XX_0", "DEMUX_BY_AS", "PLUS", "ULTIMA_tp", "ULTIMA_mi", "PACBIO_qe", "DEMUX_sn", "jI", "jM_length", "RG_by_QNAME", "PACBIO_we", "DEMUX_BY_REVCOMP_MATE", "crdna_GP", "DEMUX_MAPQ", "CPU_XL", }
+#define SAM_SPECIAL_NAMES { "CIGAR", "TLEN_old", "BDBI", "delta_seq_len", "MD_old", "FLOAT", "BIN", "NM", "MD", "REF_CONSUMED", "PNEXT_IS_PREV_POS_old", "COPY_MATE_FLAG", "COPY_MATE_TLEN_old", "COPY_BUDDY_CIGAR", "FASTQ_CONSUME_AUX", "TLEN", "QUAL", "SAG", "SEQ", "PRIM_QNAME", "SQUANK", "BSSEEKER2_XO", "BSSEEKER2_XG", "BSSEEKER2_XM", "SA_main", "COPY_PRIM", "BWA_XC", "BWA_XT", "BWA_X1", "BWA_XS", "SM", "AM", "PNEXT", "DEMUX_BY_MATE", "DEMUX_BY_MATE_PRIM", "DEMUX_BY_BUDDY", "GEM3_XB", "BSBOLT_YS", "0A_RNAME", "BISMARK_XG", "HI", "DEMUX_BY_BUDDY_MAP", "SEQ_LEN", "FI", "cm", "COPY_BUDDY", "SET_BUDDY", "TX_AN_POS", "COPY_TEXTUAL_CIGAR", "BISMARK_XM", "BSBOLT_XB", "UQ", "iqsqdq", "ULTIMA_tp_old", "ULTIMA_C", "bi", "sd", "AGENT_RX", "AGENT_QX", "qname_rng2seq_len", "DEMUX_BY_XX_0", "DEMUX_BY_AS", "PLUS", "ULTIMA_tp", "ULTIMA_mi", "PACBIO_qe", "DEMUX_sn", "jI", "jM_length", "RG_by_QNAME", "PACBIO_we", "DEMUX_BY_REVCOMP_MATE", "crdna_GP", "DEMUX_MAPQ", "CPU_XL", "ML_REPEATS", }
-#define SAM_SPECIAL { sam_cigar_special_CIGAR, sam_piz_special_TLEN_old, sam_piz_special_BD_BI, sam_piz_special_delta_seq_len, sam_piz_special_MD_old, bam_piz_special_FLOAT, bam_piz_special_BIN, sam_piz_special_NM, sam_piz_special_MD, sam_piz_special_REF_CONSUMED, sam_piz_special_PNEXT_IS_PREV_POS_old, sam_piz_special_COPY_MATE_FLAG, sam_piz_special_COPY_MATE_TLEN_old, sam_piz_special_COPY_BUDDY_CIGAR, sam_piz_special_FASTQ_CONSUME_AUX, sam_piz_special_TLEN, sam_piz_special_QUAL, sam_piz_special_pull_from_sag, sam_piz_special_SEQ, sam_piz_special_PRIM_QNAME, sam_piz_special_SQUANK, sam_piz_special_BSSEEKER2_XO, sam_piz_special_BSSEEKER2_XG, sam_piz_special_BSSEEKER2_XM, sam_piz_special_SA_main, sam_piz_special_COPY_PRIM, sam_piz_special_BWA_XC, sam_piz_special_BWA_XT, sam_piz_special_BWA_X1, sam_piz_special_BWA_XS, sam_piz_special_SM, sam_piz_special_AM, sam_piz_special_PNEXT, sam_piz_special_DEMUX_BY_MATE, sam_piz_special_DEMUX_BY_MATE_PRIM, sam_piz_special_DEMUX_BY_BUDDY, sam_piz_special_GEM3_XB, sam_piz_special_BSBOLT_YS, sam_piz_special_0A_RNAME, sam_piz_special_BISMARK_XG, sam_piz_special_HI, sam_piz_special_DEMUX_BY_BUDDY_MAP, sam_piz_special_SEQ_LEN, sam_piz_special_FI, sam_piz_special_cm, sam_piz_special_COPY_BUDDY, sam_piz_special_SET_BUDDY, sam_piz_special_TX_AN_POS, sam_piz_special_COPY_TEXTUAL_CIGAR, sam_piz_special_BISMARK_XM, sam_piz_special_BSBOLT_XB, sam_piz_special_UQ, sam_piz_special_iq_sq_dq, sam_piz_special_ULTIMA_tp_old, ultima_c_piz_special_DEMUX_BY_Q4NAME, sam_piz_special_bi, sam_piz_special_sd, agilent_special_AGENT_RX, agilent_special_AGENT_QX, special_qname_rng2seq_len, sam_piz_special_DEMUX_BY_XX_0, sam_piz_special_DEMUX_BY_AS, piz_special_PLUS, sam_piz_special_ULTIMA_tp, sam_piz_special_ULTIMA_MI, sam_piz_special_PACBIO_qe, sam_piz_special_DEMUX_sn, sam_piz_special_jI, sam_piz_special_jM_length, sam_piz_special_RG_by_QNAME, sam_piz_special_PACBIO_we, sam_piz_special_DEMUX_BY_REVCOMP_MATE, sam_piz_special_crdna_GP, sam_piz_special_DEMUX_MAPQ, sam_piz_special_CPU_XL, }
+#define SAM_SPECIAL { sam_cigar_special_CIGAR, sam_piz_special_TLEN_old, sam_piz_special_BD_BI, sam_piz_special_delta_seq_len, sam_piz_special_MD_old, bam_piz_special_FLOAT, bam_piz_special_BIN, sam_piz_special_NM, sam_piz_special_MD, sam_piz_special_REF_CONSUMED, sam_piz_special_PNEXT_IS_PREV_POS_old, sam_piz_special_COPY_MATE_FLAG, sam_piz_special_COPY_MATE_TLEN_old, sam_piz_special_COPY_BUDDY_CIGAR, sam_piz_special_FASTQ_CONSUME_AUX, sam_piz_special_TLEN, sam_piz_special_QUAL, sam_piz_special_pull_from_sag, sam_piz_special_SEQ, sam_piz_special_PRIM_QNAME, sam_piz_special_SQUANK, sam_piz_special_BSSEEKER2_XO, sam_piz_special_BSSEEKER2_XG, sam_piz_special_BSSEEKER2_XM, sam_piz_special_SA_main, sam_piz_special_COPY_PRIM, sam_piz_special_BWA_XC, sam_piz_special_BWA_XT, sam_piz_special_BWA_X1, sam_piz_special_BWA_XS, sam_piz_special_SM, sam_piz_special_AM, sam_piz_special_PNEXT, sam_piz_special_DEMUX_BY_MATE, sam_piz_special_DEMUX_BY_MATE_PRIM, sam_piz_special_DEMUX_BY_BUDDY, sam_piz_special_GEM3_XB, sam_piz_special_BSBOLT_YS, sam_piz_special_0A_RNAME, sam_piz_special_BISMARK_XG, sam_piz_special_HI, sam_piz_special_DEMUX_BY_BUDDY_MAP, sam_piz_special_SEQ_LEN, sam_piz_special_FI, sam_piz_special_cm, sam_piz_special_COPY_BUDDY, sam_piz_special_SET_BUDDY, sam_piz_special_TX_AN_POS, sam_piz_special_COPY_TEXTUAL_CIGAR, sam_piz_special_BISMARK_XM, sam_piz_special_BSBOLT_XB, sam_piz_special_UQ, sam_piz_special_iq_sq_dq, sam_piz_special_ULTIMA_tp_old, ultima_c_piz_special_DEMUX_BY_Q4NAME, sam_piz_special_bi, sam_piz_special_sd, agilent_special_AGENT_RX, agilent_special_AGENT_QX, special_qname_rng2seq_len, sam_piz_special_DEMUX_BY_XX_0, sam_piz_special_DEMUX_BY_AS, piz_special_PLUS, sam_piz_special_ULTIMA_tp, sam_piz_special_ULTIMA_MI, sam_piz_special_PACBIO_qe, sam_piz_special_DEMUX_sn, sam_piz_special_jI, sam_piz_special_jM_length, sam_piz_special_RG_by_QNAME, sam_piz_special_PACBIO_we, sam_piz_special_DEMUX_BY_REVCOMP_MATE, sam_piz_special_crdna_GP, sam_piz_special_DEMUX_MAPQ, sam_piz_special_CPU_XL, sam_piz_special_ML_REPEATS, }
-#define NUM_SAM_SPECIAL 75
+#define NUM_SAM_SPECIAL 76
#define FASTQ_SPECIAL_NAMES { "unaligned_SEQ", "PAIR2_GPOS", "mate_lookup", "set_deep", "deep_copy_QNAME", "deep_copy_SEQ", "deep_copy_QUAL", "backspace", "copy_line1", "monochar_QUAL", "ULTIMA_C", "AGENT_RX", "AGENT_QX", "qname_rng2seq_len", "DEMUX_BY_R", }
diff --git a/src/digest.c b/src/digest.c
index 52d28538..82f9c36b 100644
--- a/src/digest.c
+++ b/src/digest.c
@@ -16,6 +16,7 @@
#include "progress.h"
#include "writer.h"
#include "txtheader.h"
+#include "piz.h"
#define IS_ADLER (IS_ZIP ? !flag.md5 : z_file->z_flags.adler)
#define IS_MD5 (!IS_ADLER)
@@ -211,21 +212,12 @@ static void digest_piz_verify_one_vb (VBlockP vb,
DIGEST_NAME, digest_display (vb->expected_digest).s,
recon_size_warn);
- // gencomp: don't count PRIM VBs in the midst of MAIN VBs, as --biopsy sets --no-gencomp
- VBIType no_gencomp_vb_i = (z_sam_gencomp && vb->comp_i == SAM_COMP_MAIN) ? sections_get_num_vbs_up_to (SAM_COMP_MAIN, vb->vblock_i) : vb->vblock_i;
-
// case: first bad VB: dump bad VB to disk and maybe exit
if (!__atomic_test_and_set (&txt_file->vb_digest_failed, __ATOMIC_RELAXED)) { // not WARN_ONCE because we might be genounzipping multiple files - we want to show this for every failed file (see also note in digest_piz_verify_one_txt_file)
- NOISYWARN ("Bad reconstructed %s has been dumped to: %s. vb_position_txt_file=%"PRIu64", txt_data.len=%u recon_size(expected)=%u\n"
- "To see the same data in the original file:\n"
- "genozip --biopsy %u%s%s%s --no-header %s%s", // note: segconf.vb_size is only available since v14. For older files, look it up with genocat --stats.
+ NOISYWARN ("Bad reconstructed %s has been dumped to: %s. vb_position_txt_file=%"PRIu64", txt_data.len=%u recon_size(expected)=%u\n%s%s",
VB_NAME, txtfile_dump_vb (vb, z_name, txt_data).s, // note: txt_data dumped INCLUDES integrated gencomp lines (if there are any)
- vb->vb_position_txt_file, vb->txt_data.len32, vb->recon_size, no_gencomp_vb_i,
- cond_int (segconf.vb_size/*0 if IS_VB_SIZE_BY_MGZIP*/ && !(segconf.vb_size % (1 MB)), " -B", (unsigned)(segconf.vb_size >> 20)),
- cond_int (segconf.vb_size && (segconf.vb_size % (1 MB)), " -B", (int)segconf.vb_size),
- segconf.vb_size && (segconf.vb_size % (1 MB)) ? "B" : "",
- (txt_file && txt_file->name) ? filename_guess_original (txt_file) : IS_PIZ ? txtheader_get_txt_filename_from_section(vb->comp_i).s : "(uncalculable)",
- report_support_if_unexpected());
+ vb->vb_position_txt_file, vb->txt_data.len32, vb->recon_size,
+ piz_advise_biopsy (vb).s, report_support_if_unexpected());
if (flag.test) exit_on_error (false); // must be inside the atomic test, otherwise another thread will exit before we completed dumping
}
diff --git a/src/fasta.c b/src/fasta.c
index b0c5d7d2..06c4fdbf 100644
--- a/src/fasta.c
+++ b/src/fasta.c
@@ -361,7 +361,7 @@ void fasta_segconf_finalize (VBlockP vb)
ASSINP0 (num_contigs_this_vb, "Invalid FASTA file: no sequence description line");
uint64_t avg_contig_size_this_vb = Ltxt / num_contigs_this_vb;
- uint64_t est_num_contigs_in_file = txtfile_get_seggable_size() / avg_contig_size_this_vb;
+ uint64_t est_num_contigs_in_file = txt_file->est_seggable_size / avg_contig_size_this_vb;
ASSINP0 (!flag.make_reference || segconf.fasta_has_contigs, "Can't use --make-reference on this file, because Genozip can't find the contig names in the FASTA description lines");
diff --git a/src/fastq.c b/src/fastq.c
index 483d43c8..8fc4bbf4 100644
--- a/src/fastq.c
+++ b/src/fastq.c
@@ -711,7 +711,7 @@ void fastq_segconf_finalize (VBlockP vb)
void fastq_zip_after_segconf (void)
{
if (IS_R1) {
- double est_num_vbs = MAX_(1, (double)txtfile_get_seggable_size() / (double)segconf.vb_size * 1.1);
+ double est_num_vbs = MAX_(1, (double)txt_file->est_seggable_size / (double)segconf.vb_size * 1.1);
// allocate memory to store txt_data.len32 of each R1 VB
buf_alloc (evb, &z_file->R1_txt_data_lens, 0, est_num_vbs, uint32_t, 0, "z_file->R1_txt_data_lens");
@@ -1132,13 +1132,15 @@ rom fastq_seg_txt_line (VBlockP vb_, rom line_start, uint32_t remaining, bool *h
dl->monochar = str_is_monochar (STRa(seq)); // set dl->monochar
// case --deep: compare DESC, SEQ and QUAL to the SAM/BAM data
- bool deep_qname=false, deep_seq=false;
+ bool deeped=false;
uint32_t uncanonical_suffix_len = 0;
- if (flag.deep || flag.show_deep == 2)
- fastq_seg_deep (vb, dl, STRa(qname), STRa(qname2), STRa(seq), STRa(qual), &deep_qname, &deep_seq, &dl->deep_qual, &uncanonical_suffix_len);
+ if (flag.deep || flag.show_deep == SHOW_DEEP_ONE_HASH)
+ deeped = fastq_seg_deep (vb, dl, STRa(qname), STRa(qname2), STRa(seq), STRa(qual), &uncanonical_suffix_len);
- fastq_seg_QNAME (vb, STRa(qname), line1_len, segconf.deep_qtype == QNAME1 && deep_qname, uncanonical_suffix_len);
+ dl->deep_qual = deeped && flag.deep && !segconf.deep_no_qual;
+
+ fastq_seg_QNAME (vb, STRa(qname), line1_len, segconf.deep_qtype == QNAME1 && deeped, uncanonical_suffix_len);
// seg SAUX
if (segconf.has_saux)
@@ -1146,12 +1148,12 @@ rom fastq_seg_txt_line (VBlockP vb_, rom line_start, uint32_t remaining, bool *h
// seg DESC (i.e. QNAME2 + EXTRA + AUX), it might have come either from line1 or from line3
else if (segconf.has_desc)
- fastq_seg_DESC (vb, STRa(desc), segconf.deep_qtype == QNAME2 && deep_qname, uncanonical_suffix_len);
+ fastq_seg_DESC (vb, STRa(desc), segconf.deep_qtype == QNAME2 && deeped, uncanonical_suffix_len);
if (!FAF)
fastq_seg_LINE3 (vb, STRa(line3), STRa(qname), STRa(desc)); // before SEQ, in case it as a segconf_seq_len_dict_id field
- fastq_seg_SEQ (vb, dl, STRa(seq), deep_seq);
+ fastq_seg_SEQ (vb, dl, STRa(seq), deeped);
if (!FAF)
fastq_seg_QUAL (vb, dl, STRa(qual));
diff --git a/src/fastq_deep.c b/src/fastq_deep.c
index 8e6d9f30..f9176b53 100644
--- a/src/fastq_deep.c
+++ b/src/fastq_deep.c
@@ -64,6 +64,7 @@ void fastq_deep_zip_finalize (void)
iprintf ("%-11.11s: %"PRIu64" (%.1f%%)\n", (rom[])NO_DEEP_NAMES[i], z_file->deep_stats[i], 100.0 * (double)z_file->deep_stats[i] / (double)total);
}
+ ASSERTISALLOCED (z_file->deep_ents);
ARRAY (ZipZDeep, deep_ents, z_file->deep_ents);
// Count deepable alignments in SAM that were did not appear in FASTQ. This would be an user
@@ -85,7 +86,7 @@ void fastq_deep_zip_finalize (void)
if (count_not_consumed)
for (uint64_t ent_i=0, count=0; ent_i < deep_ents_len && count < NUM_SHOW; ent_i++)
if (!deep_ents[ent_i].consumed && !deep_ents[ent_i].dup) {
- iprintf ("sam_vb=%u line_i=%u hash=%u,%u,%u\n", deep_ents[ent_i].vb_i, deep_ents[ent_i].line_i, DEEPHASHf(deep_ents[ent_i].hash));
+ iprintf ("sam_vb=%u line_i=%u hash=%016"PRIx64",%08x,%08x\n", deep_ents[ent_i].vb_i, deep_ents[ent_i].line_i, DEEPHASHf(deep_ents[ent_i].hash));
count++;
}
}
@@ -113,11 +114,25 @@ void fastq_deep_zip_finalize (void)
}
}
+static void fastq_deep_show_segconf (uint32_t n_lines, uint32_t proof_of_first, uint32_t proof_of_last, uint32_t proof_of_qname1, uint32_t proof_of_qname2, uint32_t proof_of_qual)
+{
+ iprintf ("\nFASTQ Deep segconf stats: n_lines=%u proof_of_first=%u proof_of_last=%u proof_of_qname1=%u proof_of_qname2=%u proof_of_qual=%u\n"
+ "FIRST:{n_full_mch=(%u,%u) n_seq_qname_mch=(%u,%u)} LAST:{n_full_mch=(%u,%u) n_seq_qname_mch=(%u,%u)} NOT_PAIR={n_full_mch=(%u,%u) n_seq_qname_mch=(%u,%u)} n_no_mch=%u trimming=%s n_full_mch_trimmed=%u\n",
+ n_lines, proof_of_first, proof_of_last, proof_of_qname1, proof_of_qname2, proof_of_qual,
+ segconf.n_full_mch[2], segconf.n_full_mch[3], segconf.n_seq_qname_mch[2], segconf.n_seq_qname_mch[3],
+ segconf.n_full_mch[4], segconf.n_full_mch[5], segconf.n_seq_qname_mch[4], segconf.n_seq_qname_mch[5],
+ segconf.n_full_mch[0], segconf.n_full_mch[1], segconf.n_seq_qname_mch[0], segconf.n_seq_qname_mch[1],
+ segconf.n_no_mch, segconf_deep_trimming_name(), segconf.n_full_mch_trimmed);
+
+ iprintf ("\nFASTQ Deep segconf settings: paired_qname=%s is_last=%s qtype=%s has_trimmed=%s has_trimmed_left=%s no_qual=%s\n",
+ TF(segconf.deep_paired_qname), YN(segconf.deep_is_last), qtype_name (segconf.deep_qtype), TF(segconf.deep_has_trimmed), TF(segconf.deep_has_trimmed_left), TF(segconf.deep_no_qual));
+}
+
void fastq_deep_seg_finalize_segconf (uint32_t n_lines)
{
if (zip_is_biopsy) return;
- uint32_t proof_of_first=0, proof_of_last=0;
+ uint32_t proof_of_first=0, proof_of_last=0, proof_of_qname1=0, proof_of_qname2=0, proof_of_qual=0;
if (segconf.deep_paired_qname) {
proof_of_first = segconf.n_seq_qname_mch[2] + segconf.n_seq_qname_mch[3] + segconf.n_full_mch[2] + segconf.n_full_mch[3];
@@ -137,45 +152,46 @@ void fastq_deep_seg_finalize_segconf (uint32_t n_lines)
unsigned qname2_i = qname_i + 1;
// test: at least half of the reads (excluding reads that had no matching SEQ in the SAM) have a matching QNAME
- uint32_t proof_of_qname1 = segconf.n_seq_qname_mch[qname_i] + segconf.n_full_mch[qname_i];
- uint32_t proof_of_qname2 = segconf.n_seq_qname_mch[qname2_i] + segconf.n_full_mch[qname2_i];
-
- segconf.deep_qtype = (!proof_of_qname1 && !proof_of_qname2) ? QNONE
- : (proof_of_qname1 >= proof_of_qname2) ? QNAME1
- : QNAME2;
-
- if (segconf.deep_qtype == QNONE) segconf.deep_has_trimmed = false;
-
- if (!segconf.deep_has_trimmed) buf_destroy (z_file->deep_index_by[BY_QNAME]);
-
- // test: likewise, matching QUAL
- uint32_t proof_of_qual = (segconf.deep_qtype == QNAME1) ? segconf.n_full_mch[qname_i]
- : (segconf.deep_qtype == QNAME2) ? segconf.n_full_mch[qname2_i]
- : /* QNONE */ segconf.n_seq_qual_mch;
-
- if (!proof_of_qual) {
- segconf.deep_no_qual = true;
- segconf.deep_N_fq_score = segconf.deep_N_sam_score = 0;
- }
-
- // we need at least 2 of the 3 (QNANE,SEQ,QUAL) to be comparable between FASTQ and SAM to have a total of 64 bit hash (32 from each)
- bool can_deep = segconf.deep_qtype != QNONE || !segconf.deep_no_qual;
-
- if (flag.show_deep || !can_deep) {
- iprintf ("\nFASTQ Deep segconf stats: n_lines=%u proof_of_first=%u proof_of_last=%u proof_of_qname1=%u proof_of_qname2=%u proof_of_qual=%u\n"
- "FIRST:{n_full_mch=(%u,%u) n_seq_qname_mch=(%u,%u)} LAST:{n_full_mch=(%u,%u) n_seq_qname_mch=(%u,%u)} NOT_PAIR={n_full_mch=(%u,%u) n_seq_qname_mch=(%u,%u)} n_seq_qual_mch=%u n_seq_only=%u n_no_match=%u trimming=%s\n",
- n_lines, proof_of_first, proof_of_last, proof_of_qname1, proof_of_qname2, proof_of_qual,
- segconf.n_full_mch[2], segconf.n_full_mch[3], segconf.n_seq_qname_mch[2], segconf.n_seq_qname_mch[3],
- segconf.n_full_mch[4], segconf.n_full_mch[5], segconf.n_seq_qname_mch[4], segconf.n_seq_qname_mch[5],
- segconf.n_full_mch[0], segconf.n_full_mch[1], segconf.n_seq_qname_mch[0], segconf.n_seq_qname_mch[1],
- segconf.n_seq_qual_mch, segconf.n_seq_mch, segconf.n_no_mch, segconf_deep_trimming_name());
+ proof_of_qname1 = segconf.n_seq_qname_mch[qname_i] + segconf.n_full_mch[qname_i];
+ proof_of_qname2 = segconf.n_seq_qname_mch[qname2_i] + segconf.n_full_mch[qname2_i];
+
+ if (proof_of_qname1 || proof_of_qname2) {
+ segconf.deep_qtype = (proof_of_qname1 >= proof_of_qname2) ? QNAME1 : QNAME2;
+
+ segconf.deep_has_trimmed = (segconf.n_full_mch_trimmed > 0); // true is some reads might be trimmed (shorter in SAM than FASTQ)
- iprintf ("\nFASTQ Deep segconf settings: paired_qname=%s is_last=%s qtype=%s has_trimmed=%s has_trimmed_left=%s no_qual=%s\n",
- TF(segconf.deep_paired_qname), YN(segconf.deep_is_last), qtype_name (segconf.deep_qtype), TF(segconf.deep_has_trimmed), TF(segconf.deep_has_trimmed_left), TF(segconf.deep_no_qual));
+ // test: likewise, matching QUAL
+ proof_of_qual = (segconf.deep_qtype == QNAME1) ? segconf.n_full_mch[qname_i] : segconf.n_full_mch[qname2_i];
+
+ if (!proof_of_qual) {
+ segconf.deep_no_qual = true;
+ segconf.deep_N_fq_score = segconf.deep_N_sam_score = 0;
+ }
}
-
- ASSINP (can_deep, "Error: cannot use --deep with this file: based on testing the first %u reads of %s.\nYou may compress these files without --deep, or override with --force-deep",
- n_lines, txt_name);
+
+ else {
+ // no luck finding matching lines in segconf - but continue if user insists...
+ if (flag.force_deep) {
+ segconf.deep_qtype = QNAME1; // assume match is of QNAME1
+ segconf.deep_no_qual = false; // assume QUAL matches too
+ segconf.deep_has_trimmed = true; // be liberal, allow trimming (since we require QNAME anyway)
+ }
+
+ else {
+ fastq_deep_show_segconf (n_lines, proof_of_first, proof_of_last, 0, 0, 0);
+
+ STR(master_qname);
+ huffman_get_master (SAM_QNAME, pSTRa(master_qname));
+
+ ABORTINP ("Error: cannot use --deep with this file: based on testing the first %u reads of %s.\n"
+ "You may compress these files without --deep, or override with --force-deep\n"
+ "First SAM: %.*s First FASTQ: %s%s",
+ n_lines, txt_name, STRf(master_qname), segconf.deep_1st_desc, report_support_if_unexpected());
+ }
+ }
+
+ if (flag.show_deep)
+ fastq_deep_show_segconf (n_lines, proof_of_first, proof_of_last, proof_of_qname1, proof_of_qname2, proof_of_qual);
}
// find a subset of FASTQ's seq that matches the hash of the SAM's seq.
@@ -248,7 +264,7 @@ static rom fastq_deep_set_N_qual (VBlockFASTQP vb, STRp(seq), STRp(qual))
if (seq[i] == 'N') {
if (qual[i] == segconf.deep_N_fq_score)
new_qual[i] = deep_N_sam_score;
- else
+ else
return NULL; // unexpected inconsistent quality score of an 'N' base - don't Deep this read since in reconstruction quality of 'N' will be set to deep_N_fq_score
}
else
@@ -263,109 +279,99 @@ static void fastq_deep_seg_segconf (VBlockFASTQP vb, STRp(qname), STRp(qname2),
struct { bool cond; QType q; STR(qname); thool is_last; } inst[NUM_INSTS] =
// QNAME1 QNAME2
- { { !segconf.deep_paired_qname, QNAME1, STRa(qname), unknown }, { !segconf.deep_paired_qname && qname2_len > 0, QNAME2, STRa(qname2), unknown }, // not apired
+ { { !segconf.deep_paired_qname, QNAME1, STRa(qname), unknown }, { !segconf.deep_paired_qname && qname2_len > 0, QNAME2, STRa(qname2), unknown }, // not paired
{ segconf.deep_paired_qname, QNAME1, STRa(qname), false }, { segconf.deep_paired_qname && qname2_len > 0, QNAME2, STRa(qname2), false }, // is_first
{ segconf.deep_paired_qname, QNAME1, STRa(qname), true }, { segconf.deep_paired_qname && qname2_len > 0, QNAME2, STRa(qname2), true } }; // is_last
- uint32_t qname_hash[NUM_INSTS];
+ uint64_t qname_hash[NUM_INSTS];
for (int i=0; i < NUM_INSTS; i++)
if (inst[i].cond)
qname_hash[i] = deep_qname_hash (inst[i].q, STRa(inst[i].qname), inst[i].is_last, NULL);
-
- uint32_t seq_hash = deep_seq_hash (VB, STRa(seq), false);
-
+
if (segconf.deep_N_sam_score) {
qual = fastq_deep_set_N_qual (vb, STRa(seq), STRa(qual));
if (!qual) {
segconf.n_no_mch++; // some qual scores of 'N' bases are of an inconsistent value - we don't be able to reconstruct them from segconf.deep_N_fq_score
- return;
+ goto done;
}
}
- uint32_t qual_hash = deep_qual_hash (VB, STRa(qual), false);
+ bool seq_qname_matches[6]={};
- bool seq_qual_matches=false, seq_qname_matches[6]={}, seq_matches=false;
- uint32_t hash = seq_hash & bitmask32 (num_hash_bits);
-
- for (uint32_t ent_i = *B32(z_file->deep_index_by[BY_SEQ], hash); ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next[BY_SEQ]) {
- ZipZDeep *e = &deep_ents[ent_i];
+ for (int i=0; i < NUM_INSTS; i++) {
+ if (!inst[i].cond) continue;
+
+ uint32_t hash = qname_hash[i] & bitmask64 (num_hash_bits);
+
+ for (uint32_t ent_i = *B32(z_file->deep_index, hash); ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next) {
+ ZipZDeep *e = &deep_ents[ent_i];
+
+ if (e->hash.qname != qname_hash[i] || // possible, because hash uses less bits that e->hash.qname
+ e->seq_len > seq_len) continue; // definitely not a match if FASTQ seq_len is shorter than SAM's
- if (e->hash.seq != seq_hash) continue; // possible, because hash uses less bits that e->hash.seq
+ uint32_t sam_seq_offset = 0;
+ if (!fastq_deep_seg_find_subseq (vb, STRa(seq), e->seq_len, e->hash.seq, true, &sam_seq_offset))
+ continue;
- for (int i=0; i < NUM_INSTS; i++)
- if (inst[i].cond && e->hash.qname == qname_hash[i]) {
- if (e->hash.qual == qual_hash) {
- segconf.n_full_mch[i]++;
- return;
- }
- else {
- seq_qname_matches[i] = true;
- goto next_ent;
- }
+ if (sam_seq_offset > 0)
+ segconf.deep_has_trimmed_left = true; // segconf observed trimming left bases
+
+ uint32_t qual_hash = deep_qual_hash (VB, qual + sam_seq_offset, e->seq_len, false); // possibly trimmed QUAL
+ if (qual_hash == e->hash.qual) {
+ segconf.n_full_mch[i]++;
+ if (seq_len != e->seq_len) segconf.n_full_mch_trimmed++;
+ goto done; // QNAME, SEQ and QUAL match (possibly trimmed)
}
-
- if (e->hash.qual == qual_hash)
- seq_qual_matches = true;
- else
- seq_matches = true;
-
- next_ent: continue;
+ else
+ seq_qname_matches[i] = true; // only QNAME and SEQ match - continue searching, perhaps a full match will be found
+ }
}
-
- // each segconf read increments exactly one category
+
for (int i=0; i < NUM_INSTS; i++)
if (seq_qname_matches[i]) {
- segconf.n_seq_qname_mch[i]++; // if one match is seq+qname and another is seq+qual, then count the seq+qname
+ segconf.n_seq_qname_mch[i]++;
goto done;
}
- if (seq_qual_matches) segconf.n_seq_qual_mch++;
- else if (seq_matches) segconf.n_seq_mch++; // no match for qname and qual, only seq alone
-
- // no match based on SEQ index, perhaps the read appears trimmed (beyond cropped) in the SAM data - search based on QNAME index
- else {
- for (int i=0; i < NUM_INSTS; i++) {
- if (!inst[i].cond) continue;
+ segconf.n_no_mch++; // no match - likely bc this read was filtered out in the SAM file
- hash = qname_hash[i] & bitmask32 (num_hash_bits);
-
- for (uint32_t ent_i = *B32(z_file->deep_index_by[BY_QNAME], hash); ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next[BY_QNAME]) {
- ZipZDeep *e = &deep_ents[ent_i];
-
- if (e->hash.qname != qname_hash[i] || // possible, because hash uses less bits that e->hash.qname
- e->seq_len >= seq_len) continue; // definitely not a match if FASTQ seq_len is shorter than SAM's and also not equal seq_len, as that would have been a no-trimming situation already inspected above
-
- uint32_t sam_seq_offset = 0;
- if (!fastq_deep_seg_find_subseq (vb, STRa(seq), e->seq_len, e->hash.seq, true, &sam_seq_offset))
- continue;
+done:
+ if (segconf.deep_N_sam_score)
+ buf_free (vb->scratch);
+}
- segconf.deep_has_trimmed = true;
+extern rom main_input_filename (int file_i);
- if (sam_seq_offset > 0)
- segconf.deep_has_trimmed_left = true; // segconf observed trimming left bases
-
- qual_hash = deep_qual_hash (VB, qual + sam_seq_offset, e->seq_len, false); // trimmed QUAL
- if (qual_hash == e->hash.qual) {
- segconf.n_full_mch[i]++;
- return;
- }
- else
- seq_qname_matches[i] = true;
- }
- }
-
- for (int i=0; i < NUM_INSTS; i++)
- if (seq_qname_matches[i]) {
- segconf.n_seq_qname_mch[i]++;
- goto done;
- }
+static CompIType fastq_seg_deep_get_comp_by_vb (VBIType vb_i) // caller guarantees vb_i with a Deep FASTQ vb
+{
+ for (CompIType comp_i=flag.zip_comp_i; comp_i >= SAM_COMP_FQ00; comp_i--)
+ if (vb_i >= sections_get_first_vb_i (comp_i))
+ return comp_i;
- segconf.n_no_mch++; // no match - likely bc this read was filtered out in the SAM file
+ return COMP_NONE;
+}
+
+static StrTextLong fastq_seg_deep_calculate_biopsy_line (VBIType vb_i, uint32_t line_i)
+{
+ StrTextLong s;
+ CompIType comp_i = fastq_seg_deep_get_comp_by_vb (vb_i);
+
+ if (comp_i >= SAM_COMP_FQ00) {
+ // we know the vb_i when compressed with --deep, calculate it when compressed separately
+ VBIType standlone_vb_i = vb_i - sections_get_first_vb_i (comp_i) + 1;
+
+ snprintf (s.s, sizeof(s), "genozip --biopsy-line %u/%u%.20s%.20s%s %.768s",
+ standlone_vb_i, line_i,
+ cond_int (segconf.vb_size/*0 if IS_VB_SIZE_BY_MGZIP*/ && !(segconf.vb_size % (1 MB)), " -B", (unsigned)(segconf.vb_size >> 20)),
+ cond_int (segconf.vb_size && (segconf.vb_size % (1 MB)), " -B", (int)segconf.vb_size),
+ segconf.vb_size && (segconf.vb_size % (1 MB)) ? "B" : "",
+ main_input_filename (1 + comp_i - SAM_COMP_FQ00));
}
-done:
- if (segconf.deep_N_sam_score)
- buf_free (vb->scratch);
+ else
+ snprintf (s.s, sizeof(s), "(unable to calculate --biopsy-line command for vb_i=%d line_i=%u)", vb_i, line_i);
+
+ return s;
}
static inline uint64_t fastq_seg_deep_consume_unique_matching_ent (VBlockFASTQP vb, ZipZDeep *ent, DeepHash deep_hash,
@@ -416,20 +422,19 @@ static inline uint64_t fastq_seg_deep_consume_unique_matching_ent (VBlockFASTQP
return txt_deep_line_i;
ent_consumed_by_another_thread:
- // case: this ent was already consumed by a previous FASTQ read. This can happen for two reasons:
- // 1) two different (QNAME,SEQ,QUAL) triplets have the same 96-bit deep_hash AND only one of the two
- // corresponding SAM lines actually exists in the SAM file. Therefore, both reads map to the single SAM
- // line, and we don't know which is the true match. We have no choice but to abort.
- // 2) two identical (-,SEQ,QUAL) (with QNONE), and only one SAM line exists. conceivably, this could happen for example
- // due to duplicates that were filtered out of the SAM.
- // TO DO: recompress normally by executing another instance of genozip
- ABORTINP ("Deep: We hit a rare edge case: two FASTQ reads: current line %s and line FQ\?\?/%u/%u (vb_size=%s)\n"
- "map to the same SAM line (deep_hash=%u,%u,%u qtype=%s no_qual=%s).\n"
+ // case: this ent was already consumed by a previous FASTQ read. This can happen if two different FASTQ lines
+ // have the same (QNAME,SEQ,QUAL) or (QNAME,SEQ,0) hashes AND only one of the two corresponding SAM lines
+ // actually exists in the SAM file (if both existed in the SAM they would have been marked as Dup and not deeped).
+ // Therefore, both reads map to the single SAM line, and we don't know which is the true match. We have no choice but to abort.
+ ABORTINP ("Deep: We hit a rare edge case: two FASTQ reads: current line %s and line %s/%u/%u (vb_size=%s)\n"
+ "map to the same SAM line (deep_hash=%016"PRIx64",%08x,%08x qtype=%s no_qual=%s).\n"
"%sWork around: compress without using --deep.\n"
- "This_line:\nQNAME=\"%.*s\"\nSEQ=\"%.*s\"\nQUAL=\"%.*s\"",
- LN_NAME, contention_place.vb_i, contention_place.line_i, str_size (segconf.vb_size).s,
+ "This_line:\nQNAME=\"%.*s\"\nSEQ =\"%.*s\"\nQUAL =\"%.*s\"\n"
+ "To see other line, run: %s",
+ LN_NAME, comp_name (fastq_seg_deep_get_comp_by_vb (contention_place.vb_i)), contention_place.vb_i, contention_place.line_i, str_size (segconf.vb_size).s,
DEEPHASHf(deep_hash), qtype_name(segconf.deep_qtype), TF(segconf.deep_no_qual),
- report_support(), STRf(qname), STRf(seq), STRf(qual));
+ report_support(), STRf(qname), STRf(seq), STRf(qual),
+ fastq_seg_deep_calculate_biopsy_line (contention_place.vb_i, contention_place.line_i).s);
COPY_TIMER (fastq_seg_deep_consume_unique_matching_ent);
return 0;
@@ -451,94 +456,67 @@ static int64_t fastq_get_pair_deep_value (VBlockFASTQP vb, ContextP ctx)
}
// called to mark entries as dup in case multiple SAM alignments have the same hash
-static DeepStatsFastq fastq_seg_deep_dup_detected (VBlockFASTQP vb, int by, ZipZDeep *dup1, ZipZDeep *dup2, uint32_t index_hash)
+static bool fastq_seg_deep_is_dup (VBlockFASTQP vb,
+ ZipZDeep *e) // first entry on link list that matches criteria
{
ARRAY (ZipZDeep, deep_ents, z_file->deep_ents);
-
- // first encounter with this dup - traverse the rest of the linked list looking for dups
- // note: we don't bother about atomicity of visibility of "dup" is just used for optimizing the loop and for stats
- if (!dup1->dup) {
- dup1->dup = dup2->dup = true;
-
- for (uint32_t ent_i = dup2->next[by]; ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next[by])
- if (!memcmp (&deep_ents[ent_i].hash, &dup1->hash, sizeof (dup1->hash)))
- deep_ents[ent_i].dup = true;
- }
-
- if (flag.show_deep)
- iprintf ("%s: Two %s alignments with same hashes (%s), skipping: this=[hash=%u,%u,%u sam_vb_i=%u sam_line_i=%u] prev=[%u,%u,%u sam_vb_i=%u sam_line_i=%u] [deep_no_qual=%s deep_qtype=%s]\n",
- VB_NAME, z_dt_name(), by_names[by], DEEPHASHf(dup2->hash), dup2->vb_i, dup2->line_i,
- DEEPHASHf(dup1->hash), dup1->vb_i, dup1->line_i,
- TF(segconf.deep_no_qual), qtype_name (segconf.deep_qtype));
-
- return by==BY_SEQ ? NDP_MULTI_MATCH : NDP_MULTI_TRIMMED;
+ ZipZDeep *e2 = NULL;
+
+ if (!e->dup) // not already marked as dup by another FASTQ read that matched the same criteria
+ for (uint32_t ent_i = e->next; ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next) {
+ e2 = &deep_ents[ent_i];
+ if ( e->hash.qname == e2->hash.qname &&
+ e->hash.seq == e2->hash.seq &&
+ (e->hash.qual == e2->hash.qual || segconf.deep_no_qual))
+ // first encounter with this dup - traverse the rest of the linked list looking for dups
+ // note: no worries about atomicity of visibility of "dup" because all matching threads
+ // will discover and set this dup for all dup entries, and thereafter the entries will be immutable
+ // (because they will not be consumed and hence e->consumed will not be set)
+ e->dup = e2->dup = true; // possibly setting e->dup multiple times, that's ok
+ }
+
+ if (e->dup && flag.show_deep && (flag.show_deep != SHOW_DEEP_ONE_HASH || deephash_issame (flag.debug_deep_hash, e->hash))) {
+ iprintf ("%s: Two or more %s alignments with same hashes (%s,no_qual=%s), skipping: this=[MAIN/%u/%u hash=%016"PRIx64",%08x,%08x]",
+ LN_NAME, z_dt_name(), qtype_name (segconf.deep_qtype), TF(segconf.deep_no_qual),
+ e ->vb_i, e ->line_i, DEEPHASHf(e ->hash));
+ if (e2) iprintf (" another=[MAIN/%u/%u %016"PRIx64",%08x,%08x]", e2->vb_i, e2->line_i, DEEPHASHf(e2->hash));
+ iprint0("\n");
+ }
+
+ return e->dup;
}
-static DeepStatsFastq fastq_seg_find_deep_by_QNAME (VBlockFASTQP vb, ZipDataLineFASTQ *dl, DeepHash deep_hash, STRp(seq), STRp(qual),
- bool *deep_qname, bool *deep_seq, bool *deep_qual, ZipZDeep **matching_ent) // out
+static DeepStatsFastq fastq_seg_find_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, DeepHash *deep_hash, STRp(seq), STRp(qual),
+ ZipZDeep **matching_ent) // out
{
ARRAY (ZipZDeep, deep_ents, z_file->deep_ents);
- for (uint32_t ent_i = *B32 (z_file->deep_index_by[BY_QNAME], deep_hash.qname & bitmask32 (num_hash_bits));
- ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next[BY_QNAME]) {
+ for (uint32_t ent_i = *B32 (z_file->deep_index, deep_hash->qname & bitmask64 (num_hash_bits));
+ ent_i != NO_NEXT;
+ ent_i = deep_ents[ent_i].next) {
ZipZDeep *e = &deep_ents[ent_i];
- if (e->hash.qname != deep_hash.qname || // possible, because hash uses less bits that e->hash.qname
- e->seq_len >= seq_len) continue; // definitely not a match if FASTQ seq_len is shorter than SAM's and also not equal seq_len, as that would have been a no-trimming situation already inspected above
-
- // case: QNAME matches: see if we can find a subsequence of SEQ that matches
- if (!fastq_deep_seg_find_subseq (vb, STRa(seq), e->seq_len, e->hash.seq, segconf.deep_has_trimmed_left, &dl->sam_seq_offset))
- continue;
- *deep_seq = *deep_qname = true;
+ if (e->hash.qname != deep_hash->qname || // possible, because hash uses less bits that e->hash.qname
+ e->seq_len > seq_len || // definitely not a match if FASTQ seq_len is shorter than SAM's
+ (e->seq_len < seq_len && !segconf.deep_has_trimmed)) // not a match if FASTQ seq_len is too long, and we don't allow trimming
+ continue;
- *deep_qual = !segconf.deep_no_qual && e->hash.qual &&
- (deep_qual_hash (VB, qual + dl->sam_seq_offset, e->seq_len, false) == e->hash.qual); // trimmed QUAL match
-
- if (e->dup)
- return NDP_MULTI_TRIMMED;
+ // case: QNAME matches: see if we can find a subsequence of SEQ that matches, and if needed, also a subsequence of QUAL
+ if (!fastq_deep_seg_find_subseq (vb, STRa(seq), e->seq_len, e->hash.seq, segconf.deep_has_trimmed_left, &dl->sam_seq_offset) ||
+ (!segconf.deep_no_qual && deep_qual_hash (VB, qual + dl->sam_seq_offset, e->seq_len, false) != e->hash.qual))
+ continue;
- if (*matching_ent)
- return fastq_seg_deep_dup_detected (vb, BY_QNAME, *matching_ent, e, deep_hash.qname);
+ // update to subsequence found (for messages)
+ deep_hash->seq = e->hash.seq;
+ deep_hash->qual = e->hash.qual;
+
+ // case: two or more SAM lines entries match this read according to the hashes - we don't know which is the true one, so we seg without deep.
+ if (fastq_seg_deep_is_dup (vb, e))
+ return (e->seq_len == seq_len) ? NDP_SAM_DUP : NDP_SAM_DUP_TRIM;
*matching_ent = e;
- return NDP_DEEPABLE_TRIM;
- }
-
- return NDP_NO_MATCH;
-}
-
-static DeepStatsFastq fastq_seg_find_deep_by_SEQ (VBlockFASTQP vb, ZipDataLineFASTQ *dl, DeepHash deep_hash,
- bool *deep_qname, bool *deep_seq, bool *deep_qual, ZipZDeep **matching_ent) // out
-{
- ARRAY (ZipZDeep, deep_ents, z_file->deep_ents);
-
- for (uint32_t ent_i = *B32 (z_file->deep_index_by[BY_SEQ], deep_hash.seq & bitmask32 (num_hash_bits));
- ent_i != NO_NEXT;
- ent_i = deep_ents[ent_i].next[BY_SEQ]) {
-
- ZipZDeep *e = &deep_ents[ent_i];
-
- // case: SEQ matches, and QUAL and QNAME match if they are required to match
- bool qual_matches=false, qname_matches=false;
-
- if (e->hash.seq == deep_hash.seq && // SEQ matches
- (segconf.deep_no_qual || !e->hash.qual || (qual_matches = (deep_hash.qual == e->hash.qual)) || segconf.deep_qtype != QNONE) && // QUAL matches. It's ok QUAL doesn't match if QNAME matches (which is required if not QNONE)
- (segconf.deep_qtype == QNONE || (qname_matches = (deep_hash.qname == e->hash.qname)))) { // QNAME matches (or not relevant)
-
- if (e->dup)
- return NDP_MULTI_MATCH; // already detected before as a dup entry
-
- // case: two or more SAM lines entries match this read according to the hashes - we don't know which is the true one, so we seg without deep.
- if (*matching_ent)
- return fastq_seg_deep_dup_detected (vb, BY_SEQ, *matching_ent, e, deep_hash.seq);
-
- *matching_ent = e;
- *deep_seq = true;
- *deep_qual = qual_matches;
- *deep_qname = qname_matches;
- return NDP_DEEPABLE;
- }
+ return (e->seq_len == seq_len) ? NDP_DEEPABLE : NDP_DEEPABLE_TRIM;
}
return NDP_NO_MATCH;
@@ -570,12 +548,12 @@ static void fastq_seg_deep_do (VBlockFASTQP vb, uint64_t deep_value)
}
}
-void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qname2), STRp(seq), STRp(qual),
- bool *deep_qname, bool *deep_seq, bool *deep_qual, // out - set to true, or left unchanged
+bool fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qname2), STRp(seq), STRp(qual),
uint32_t *uncanonical_suffix_len) // out - suffix of deeped QNAME beyond canonical, i.e. not copied from Deep
{
START_TIMER;
+ uint64_t deep_value = 0;
DeepStatsFastq reason;
#define NO_MATCH(r) ({ reason=(r); goto no_match; })
@@ -589,17 +567,12 @@ void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qn
if (segconf.running) {
fastq_deep_seg_segconf (vb, STRa(qname), STRa(qname2), STRa(seq), STRa(qual));
- goto reset;
+ goto do_seg;
}
// we don't Deep reads monochar SEQ - too much contention
if (dl->monochar)
NO_MATCH (NDP_MONOSEQ);
-
- // If QNONE (i.e. we're hashing SEQ and QUAL only), we don't Deep reads with monoqual, as it has
- // an elevated chance that two reads will have entirely the same hash (in which case Deep will fail)
- if (segconf.deep_qtype == QNONE && str_is_monochar (STRa(qual)))
- NO_MATCH (NDP_MONOQUAL);
}
if (segconf.deep_N_sam_score && !segconf.deep_no_qual) {
@@ -608,15 +581,16 @@ void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qn
NO_MATCH(NDP_BAD_N_QUAL); // some qual scores of 'N' bases are of an inconsistent value - we don't be able to reconstruct them from segconf.deep_N_fq_score
}
+ bool calc_hash_for_show = (flag.show_deep && !segconf.deep_has_trimmed);
+
DeepHash deep_hash = { .qname = (segconf.deep_qtype == QNAME1) ? deep_qname_hash (QNAME1, STRa(qname), segconf.deep_is_last, uncanonical_suffix_len)
- : (segconf.deep_qtype == QNAME2) ? deep_qname_hash (QNAME2, STRa(qname2), segconf.deep_is_last, uncanonical_suffix_len)
- : 0,
- .seq = deep_seq_hash (VB, STRa(seq), false),
- .qual = segconf.deep_no_qual ? 0 : deep_qual_hash (VB, STRa(qual), false) };
-
- if (flag.show_deep == 2) {
- if (deephash_issame (flag.debug_deep_hash, deep_hash))
- iprintf ("%s Found deep_hash=%u,%u,%u\nQNAME=\"%.*s\"\nSEQ=\"%.*s\"\nQUAL=\"%.*s\"\n",
+ : deep_qname_hash (QNAME2, STRa(qname2), segconf.deep_is_last, uncanonical_suffix_len),
+ .seq = calc_hash_for_show ? deep_seq_hash (VB, STRa(seq), false) : 0,
+ .qual = (calc_hash_for_show && !segconf.deep_no_qual) ? deep_qual_hash (VB, STRa(qual), false) : 0 };
+
+ if (flag.show_deep == SHOW_DEEP_ONE_HASH || flag.show_deep == SHOW_DEEP_ALL) {
+ if (deephash_issame (flag.debug_deep_hash, deep_hash) || flag.show_deep == SHOW_DEEP_ALL)
+ iprintf ("%s Searching for deep_hash=%016"PRIx64",%08x,%08x\nQNAME=\"%.*s\"\nSEQ=\"%.*s\"\nQUAL=\"%.*s\"\n\n",
LN_NAME, DEEPHASHf(deep_hash), STRf(qname), STRf(seq), STRf(qual));
if (!flag.deep) goto done; // only debug
@@ -627,12 +601,11 @@ void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qn
// case: All matching entries are already used by previous FASTQ lines - so we have multiple FASTQ lines claiming the same
// SAM entry - since we can't undo the previous lines at this point - we fail the execution and re-compress without deep
ZipZDeep *matching_ent = NULL;
- reason = segconf.deep_has_trimmed
- ? fastq_seg_find_deep_by_QNAME (vb, dl, deep_hash, STRa(seq), STRa(qual), deep_qname, deep_seq, deep_qual, &matching_ent)
- : fastq_seg_find_deep_by_SEQ (vb, dl, deep_hash, deep_qname, deep_seq, deep_qual, &matching_ent);
+
+ // match by QNAME1 or QNAME2 ; SEQ must match ; QUAL must match unless deep_no_qual ; trimming allow if deep_has_trimmed
+ reason = fastq_seg_find_deep (vb, dl, &deep_hash, STRa(seq), STRa(qual), &matching_ent);
// case: single matching ent
- uint64_t deep_value;
if (matching_ent) {
deep_value = 1 + fastq_seg_deep_consume_unique_matching_ent (vb, matching_ent, deep_hash, STRa(qname), STRa(seq), STRa(qual)); // +1 as 0 means "no match"
vb->deep_stats[reason]++;
@@ -645,21 +618,19 @@ void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qn
no_match:
vb->deep_stats[reason]++;
- if (reason == NDP_NO_MATCH && flag.show_deep) {
+ if (reason == NDP_NO_MATCH && flag.show_deep && (flag.show_deep != SHOW_DEEP_ONE_HASH || deephash_issame (flag.debug_deep_hash, deep_hash))) {
static int count = 0;
- if (__atomic_fetch_add (&count, (int)1, __ATOMIC_RELAXED) < 20) {
+
+ if (flag.show_deep != SHOW_DEEP_SUMMARY || __atomic_fetch_add (&count, (int)1, __ATOMIC_RELAXED) < 20) {
rom once = "";
- DO_ONCE once = "\n\n(Showing first ~20 no-matches)\n";
- iprintf ("%s%s: no matching SAM line (has_match=%s deep_hash=%u,%u,%u is_last=%s)\nQNAME=\"%.*s\"\nSEQ= \"%.*s\"\nQUAL= \"%.*s\"\n",
- once, LN_NAME, TF(!!matching_ent), DEEPHASHf(deep_hash), YN(segconf.deep_is_last), STRf(qname), STRf(seq), STRf(qual));
+ DO_ONCE once = (flag.show_deep == SHOW_DEEP_SUMMARY ? "\n\n(Showing first ~20 no-matches)\n" : "");
+ iprintf ("%s%s: no matching SAM line (deep_hash=%016"PRIx64",%08x,%08x is_last=%s trimming=%s)\nQNAME=\"%.*s\"\nSEQ =\"%.*s\"\nQUAL =\"%.*s\"\n",
+ once, LN_NAME, DEEPHASHf(deep_hash), YN(segconf.deep_is_last), TF(segconf.deep_has_trimmed), STRf(qname), STRf(seq), STRf(qual));
}
}
-
- reset:
- deep_value = 0;
- *deep_seq = *deep_qual = *deep_qname = false; // reset
}
+do_seg:
fastq_seg_deep_do (vb, deep_value);
done:
@@ -667,6 +638,8 @@ void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qn
buf_free (vb->scratch);
COPY_TIMER (fastq_seg_deep);
+
+ return deep_value > 0;
}
void fastq_deep_seg_SEQ (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(seq), ContextP bitmap_ctx, ContextP nonref_ctx)
@@ -846,7 +819,7 @@ SPECIAL_RECONSTRUCTOR_DT (fastq_special_deep_copy_SEQ)
PizZDeepFlags f = *(PizZDeepFlags *)deep_ent;
- deep_ent += (segconf.deep_qtype == QNONE || flag.seq_only || flag.qual_only)
+ deep_ent += (segconf.deep_qtype == QNONE/*up to 15.0.66*/ || flag.seq_only || flag.qual_only)
? 1/*skip flags*/ : (2 + deep_ent[1])/*skip flags, qname_len and qname*/;
uint32_t deep_seq_len = f.is_long_seq ? GET_UINT32 (deep_ent) : *deep_ent;
@@ -944,7 +917,7 @@ SPECIAL_RECONSTRUCTOR_DT (fastq_special_deep_copy_QUAL)
PizZDeepFlags f = *(PizZDeepFlags *)deep_ent;
- deep_ent += (segconf.deep_qtype == QNONE || flag.seq_only || flag.qual_only)
+ deep_ent += (segconf.deep_qtype == QNONE/*up to 15.0.66*/ || flag.seq_only || flag.qual_only)
? 1/*skip flags*/ : (2 + deep_ent[1])/*skip flags, qname_len and qname*/;
uint32_t deep_seq_len = f.is_long_seq ? GET_UINT32 (deep_ent) : *deep_ent;
diff --git a/src/fastq_private.h b/src/fastq_private.h
index 53d81b15..7b8e658e 100644
--- a/src/fastq_private.h
+++ b/src/fastq_private.h
@@ -87,7 +87,7 @@ extern void fastq_seg_QUAL (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qual));
// Deep stuff
extern void fastq_deep_zip_initialize (void);
-extern void fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qname2), STRp(seq), STRp(qual), bool *deep_qname, bool *deep_seq, bool *deep_qual, uint32_t *uncanonical_suffix_len);
+extern bool fastq_seg_deep (VBlockFASTQP vb, ZipDataLineFASTQ *dl, STRp(qname), STRp(qname2), STRp(seq), STRp(qual), uint32_t *uncanonical_suffix_len);
extern void fastq_deep_seg_finalize_segconf (uint32_t n_lines);
extern void fastq_deep_seg_initialize (VBlockFASTQP vb);
extern void fastq_deep_seg_QNAME (VBlockFASTQP vb, Did did_i, STRp(qname), uint32_t uncanonical_suffix_len, unsigned add_bytes);
diff --git a/src/file.c b/src/file.c
index b8527743..102dbe61 100644
--- a/src/file.c
+++ b/src/file.c
@@ -671,8 +671,7 @@ static void file_initialize_z_file_data (FileP file)
Z_INIT (sag_cigars); // union with solo_data
Z_INIT (sag_seq);
Z_INIT (sag_qual);
- Z_INIT (deep_index_by[BY_SEQ]);
- Z_INIT (deep_index_by[BY_QNAME]);
+ Z_INIT (deep_index);
Z_INIT (deep_ents);
Z_INIT (vb_num_deep_lines);
Z_INIT (section_list);
diff --git a/src/file.h b/src/file.h
index ccf7b841..626b93ce 100644
--- a/src/file.h
+++ b/src/file.h
@@ -18,8 +18,8 @@ typedef rom FileMode;
extern FileMode READ, WRITE, WRITEREAD;// this are pointers to static strings - so they can be compared eg "if (mode==READ)"
// number of alignments that are deepable or non-deepable for each of these reasons
-typedef enum { NDP_FQ_READS, NDP_DEEPABLE, NDP_DEEPABLE_TRIM, NDP_NO_ENTS, NDP_MONOSEQ, NDP_MONOQUAL, NDP_BAD_N_QUAL, NDP_MULTI_MATCH, NDP_MULTI_TRIMMED, NDP_NO_MATCH , NUM_DEEP_STATS } DeepStatsFastq;
-#define NO_DEEP_NAMES { "FQ_reads", "Deepable", "Dpable_trim", "No_ents", "Monoseq", "Monoqual", "Bad_N_qual", "Multi_match", "MultM_trim", "No_match" }
+typedef enum { NDP_FQ_READS, NDP_DEEPABLE, NDP_DEEPABLE_TRIM, NDP_NO_ENTS, NDP_MONOSEQ, NDP_BAD_N_QUAL, NDP_SAM_DUP, NDP_SAM_DUP_TRIM, NDP_NO_MATCH , NUM_DEEP_STATS } DeepStatsFastq;
+#define NO_DEEP_NAMES { "FQ_reads", "Deepable", "Dpable_trim", "No_ents", "Monoseq", "Bad_N_qual", "SAM_dup", "SAM_dup_trm", "No_match" }
typedef struct File {
void *file;
@@ -45,7 +45,7 @@ typedef struct File {
int64_t disk_so_far; // ZIP: Z/TXT_FILE: data actually read/write to/from "disk" (using fread/fwrite), (TXT_FILE: possibley bgzf/gz/bz2 compressed ; 0 if external compressor is used for reading).
int64_t disk_gz_uncomp_or_trunc; // ZIP: TXT_FILE: gz-compressed data actually either decompressed or discarded due to truncate
int64_t gz_blocks_so_far; // ZIP: TXT_FILE: number of gz blocks read from disk
- int64_t est_seggable_size; // TXT_FILE ZIP, access via txtfile_get_seggable_size(). Estimated size of txt_data in file, i.e. excluding the header. It is exact for plain files, or based on test_vb if the file has source compression
+ int64_t est_seggable_size; // TXT_FILE ZIP, Estimated size of txt_data in file, i.e. excluding the header. It is exact for plain files, or based on test_vb if the file has source compression
int64_t est_num_lines; // TXT_FILE ZIP, an alternative for progress bar - by lines instead of bytes (used for CRAM)
// this relate to the textual data represented. In case of READ - only data that was picked up from the read buffer.
@@ -170,12 +170,9 @@ typedef struct File {
Buffer deep_ents; // Z_FILE: ZIP: entries of type ZipZDeep
// Z_FILE: PIZ: an array of Buffers, one for each SAM VB, containing QNAME,SEQ,QUAL of all reconstructed lines
- Buffer deep_index; // Z_FILE: PIZ: an array of Buffers, one for each SAM VB, each containing an array of uint32_t - one for each primary line - index into deep_ents[vb_i] of PizZDeep of that line
+ Buffer deep_index; // Z_FILE: ZIP: hash table - indices into deep ents, indexed by a subset of the hash.qname bits
+ // Z_FILE: PIZ: an array of Buffers, one for each SAM VB, each containing an array of uint32_t - one for each primary line - index into deep_ents[vb_i] of PizZDeep of that line
- #define BY_SEQ 0
- #define BY_QNAME 1
- Buffer deep_index_by[2]; // Z_FILE: ZIP: hash table (BY_SEQ,BY_QNAME) - indices into deep ents, indexed by a subset of the hash(SEQ) bits
-
// Reconstruction plan, for reconstructing in sorted order if --sort: [0] is primary coords, [1] is luft coords
Buffer vb_info[3]; // Z_FILE ZIP: SAM: array of SamMainVbInfo for MAIN and SamGcVbInfo for PRIM, DEPN
// Z_FILE PIZ: [0]: used by writer [1]: used to load SAM SA Groups - array of PlsgVbInfo - entry per PRIM vb
diff --git a/src/flags.c b/src/flags.c
index d80f2dce..39672970 100644
--- a/src/flags.c
+++ b/src/flags.c
@@ -282,16 +282,19 @@ static void flag_set_biopsy_line (rom optarg)
static void flag_set_show_deep (rom optarg)
{
- if (optarg) {
- flag.show_deep = 2;
+ if (optarg && !strcmp (optarg, "all"))
+ flag.show_deep = SHOW_DEEP_ALL;
- str_split_ints (optarg, strlen (optarg), 3, ',', hash, true);
- ASSINP0 (n_hashs, "--show-deep takes an optional argument of deep_hash - format is 3 comma-seperator integers: qname_hash,seq_hash,qual_hash");
+ else if (optarg) {
+ flag.show_deep = SHOW_DEEP_ONE_HASH;
+
+ str_split_hexs (optarg, strlen (optarg), 3, ',', hash, true);
+ ASSINP0 (n_hashs, "--show-deep takes an optional argument of deep_hash - format is 3 comma-seperator integers: qname_hash,seq_hash,qual_hash OR \"all\"");
flag.debug_deep_hash = (DeepHash){ hashs[0], hashs[1], hashs[2] };
}
else
- flag.show_deep = 1;
+ flag.show_deep = SHOW_DEEP_SUMMARY;
}
static void flag_set_stats (StatsType stats_type, rom optarg)
@@ -692,7 +695,7 @@ void flags_init_from_command_line (int argc, char **argv)
: !strcmp (optarg, "all") ? COV_ALL
: !strcmp (optarg, "one") ? COV_ONE
: COV_ALL; break;
- case 15 : flag.log_filename = optarg; break;
+ case 15 : flag.log_filename = optarg; break;
case 16 : flag.show_one_counts = dict_id_make (optarg, strlen (optarg), DTYPE_PLAIN); break;
case 131 : flag.show_one_counts = dict_id_make (cSTR("o$TATUS"), DTYPE_PLAIN); break;
case 17 : sam_set_FLAG_filter (optarg) ; break; // filter by SAM FLAG
diff --git a/src/flags.h b/src/flags.h
index 996b4440..feef44db 100644
--- a/src/flags.h
+++ b/src/flags.h
@@ -145,7 +145,7 @@ typedef struct {
show_threads, show_uncompress, biopsy, skip_segconf, show_data_type,
debug_progress, show_hash, debug_memory, debug_threads, debug_stats, debug_generate, debug_recon_size, debug_seg,
debug_LONG, show_qual, debug_qname, debug_read_ctxs, debug_sag, debug_gencomp, debug_lines, debug_latest,
- debug_peek, stats_submit, debug_submit, show_deep, show_segconf_has, debug_huffman, debug_split, debug_upgrade,
+ debug_peek, stats_submit, debug_submit, show_segconf_has, debug_huffman, debug_split, debug_upgrade,
debug_debug, // a flag with no functionality - used for ad-hoc debugging
debug_valgrind, debug_tar, // ad-hoc debug printing in prod
show_compress, show_sec_gencomp, show_scan,
@@ -160,6 +160,7 @@ typedef struct {
show_headers; // (1 + SectionType to display) or 0=flag off or -1=all sections
rom help, dump_section, show_is_set, show_time, show_mutex, show_vblocks, show_header_dict_name;
int64_t dump_section_i;
+ enum { SHOW_DEEP_SUMMARY=1, SHOW_DEEP_ONE_HASH=2, SHOW_DEEP_ALL=3 } show_deep;
CompIType show_time_comp_i; // comp_i for which to show time (possibly COMP_NONE or COMP_ALL)
diff --git a/src/genozip.c b/src/genozip.c
index bfe31c0a..c9620e5a 100644
--- a/src/genozip.c
+++ b/src/genozip.c
@@ -192,6 +192,14 @@ static void main_print_help (bool explicit)
}
}
+rom main_input_filename (int file_i/* 0-based */)
+{
+ if (IN_RANGE (file_i, 0, input_files_buf.len) && *B(rom, input_files_buf, file_i))
+ return *B(rom, input_files_buf, file_i);
+ else
+ return "(unknown filename)";
+}
+
static void main_print_version()
{
iprintf ("version=%s distribution=%s\n", code_version().s, get_distribution());
diff --git a/src/genozip.h b/src/genozip.h
index 9a21416e..ffe63374 100644
--- a/src/genozip.h
+++ b/src/genozip.h
@@ -579,7 +579,7 @@ typedef uint8_t TranslatorId;
extern TRANSLATOR_FUNC(func); \
enum { src_dt##2##dst_dt##_##name = num }; // define constant
-typedef struct { uint32_t qname, seq, qual; } DeepHash;
+typedef struct { uint64_t qname; uint32_t seq, qual; } DeepHash;
typedef enum { QNONE = -6,
QSAM2 = -5, // SAM QNAME2, while segging deep (for example, consensus reads flavor)
diff --git a/src/hash.c b/src/hash.c
index 84f6ac15..ed3201f8 100644
--- a/src/hash.c
+++ b/src/hash.c
@@ -97,7 +97,7 @@ static void hash_alloc_local (VBlockP vb, ContextP vctx)
uint32_t hash_get_estimated_entries (VBlockP vb, ContextP zctx, ConstContextP vctx)
{
double effective_num_vbs = 0;
- double estimated_num_vbs = MAX_(1, (double)txtfile_get_seggable_size() / (double)vb->txt_data.len);
+ double estimated_num_vbs = MAX_(1, (double)txt_file->est_seggable_size / (double)vb->txt_data.len);
double estimated_num_lines = estimated_num_vbs * (double)vb->lines.len;
if (flag.show_hash && vctx->did_i==0)
diff --git a/src/huffman.c b/src/huffman.c
index 09e05724..5ea7fdd9 100644
--- a/src/huffman.c
+++ b/src/huffman.c
@@ -380,6 +380,8 @@ void huffman_compress_section (Did did_i)
{
ContextP zctx = ZCTX(did_i);
+ ASSERT (huffman_exists (did_i), "huffman doesn't exist for %s", zctx->tag_name);
+
zctx->huffman.len = sizeof (HuffmanCodes);
Codec codec = codec_assign_best_codec (evb, NULL, &zctx->huffman, SEC_HUFFMAN);
@@ -395,6 +397,8 @@ void huffman_compress_section (Did did_i)
};
comp_compress (evb, zctx, &evb->z_data, &header, zctx->huffman.data, NO_CALLBACK, "SEC_HUFFMAN");
+
+ zctx->huffman.len = 1; // restore
}
// PIZ: called by the main thread from piz_read_global_area
diff --git a/src/objdir.linux/secure/license.o b/src/objdir.linux/secure/license.o
index 34dbba16..22a77a4f 100644
Binary files a/src/objdir.linux/secure/license.o and b/src/objdir.linux/secure/license.o differ
diff --git a/src/objdir.osx-arm/secure/license.o b/src/objdir.osx-arm/secure/license.o
index c1021517..068465fa 100644
Binary files a/src/objdir.osx-arm/secure/license.o and b/src/objdir.osx-arm/secure/license.o differ
diff --git a/src/objdir.osx-x86/secure/license.o b/src/objdir.osx-x86/secure/license.o
index 6b9308cf..36472141 100644
Binary files a/src/objdir.osx-x86/secure/license.o and b/src/objdir.osx-x86/secure/license.o differ
diff --git a/src/objdir.windows/secure/license.o b/src/objdir.windows/secure/license.o
index 093c73d9..ab17dd8b 100644
Binary files a/src/objdir.windows/secure/license.o and b/src/objdir.windows/secure/license.o differ
diff --git a/src/piz.c b/src/piz.c
index 1a1d1513..3e8938b6 100644
--- a/src/piz.c
+++ b/src/piz.c
@@ -28,6 +28,7 @@
#include "dict_io.h"
#include "user_message.h"
#include "huffman.h"
+#include "filename.h"
TRANSLATOR_FUNC (piz_obsolete_translator)
{
@@ -69,6 +70,24 @@ PizDisQname piz_dis_qname (VBlockP vb)
return out;
}
+StrTextLong piz_advise_biopsy (VBlockP vb)
+{
+ // gencomp: don't count PRIM VBs in the midst of MAIN VBs, as --biopsy sets --no-gencomp
+ VBIType no_gencomp_vb_i = (z_sam_gencomp && vb->comp_i == SAM_COMP_MAIN) ? sections_get_num_vbs_up_to (SAM_COMP_MAIN, vb->vblock_i) : vb->vblock_i;
+
+ StrTextLong s;
+ snprintf (s.s, sizeof(s), "To see the same data in the original file:\n"
+ "genozip --biopsy %u%.20s%.20s%s %.768s (optionally add: --no-header)", // note: segconf.vb_size is only available since v14. For older files, look it up with genocat --stats.
+ no_gencomp_vb_i,
+ // note: segconf.vb_size is only available since v14. For older files, look it up with genocat --stats.
+ cond_int (segconf.vb_size/*0 if IS_VB_SIZE_BY_MGZIP*/ && !(segconf.vb_size % (1 MB)), " -B", (unsigned)(segconf.vb_size >> 20)),
+ cond_int (segconf.vb_size && (segconf.vb_size % (1 MB)), " -B", (int)segconf.vb_size),
+ segconf.vb_size && (segconf.vb_size % (1 MB)) ? "B" : "",
+ (txt_file && txt_file->name) ? filename_guess_original (txt_file) : IS_PIZ ? txtheader_get_txt_filename_from_section(vb->comp_i).s : "(filename uncalculable)");
+
+ return s;
+}
+
void asspiz_text (VBlockP vb, FUNCLINE)
{
StrTextSuperLong s;
diff --git a/src/qname.c b/src/qname.c
index fd555623..cd317dc8 100644
--- a/src/qname.c
+++ b/src/qname.c
@@ -410,9 +410,9 @@ void qname_segconf_finalize (VBlockP vb)
segconf.is_collated || segconf.sam_is_unmapped || segconf.qname_flavor[q]->sam_qname_sorted ||
(!segconf.is_sorted && !segconf.is_paired);
- // if only consensus reads exist, change tech to unknown
- if (segconf.tech == TECH_CONS)
- segconf.tech = TECH_UNKNOWN;
+ // if only consensus reads exist, or is unknown
+ if (segconf.tech == TECH_CONS || segconf.tech == TECH_UNKNOWN)
+ segconf.tech = segconf.tech_by_RG; // usually tech by QNAME is more reliable, but if not available, go by the PL field of @RG in the SAM header (which might also be unavailable)
}
// note: we run this function only in discovery, not in segging, because it is quite expensive - checking all numerics.
@@ -775,7 +775,7 @@ uint32_t qname_hash_change_last (uint32_t hash, bool is_last)
// ZIP: used for syncing DEPN/PRIM in gencomp and SAM/FASTQ in deep, and for scanning a SAM/BAM
// PIZ: used for qname filters.
-uint32_t qname_calc_hash (QType q, CompIType comp_i/*only used in PIZ*/, STRp(qname), thool is_last, bool canonical,
+uint64_t qname_calc_hash (QType q, CompIType comp_i/*only used in PIZ*/, STRp(qname), thool is_last, bool canonical, CrcType type,
uint32_t *uncanonical_suffix_len) // optional out
{
uint32_t save_qame_len = qname_len;
@@ -790,8 +790,8 @@ uint32_t qname_calc_hash (QType q, CompIType comp_i/*only used in PIZ*/, STRp(qn
for (int i=0; i < qname_len; i++)
data[i] = ((((uint8_t *)qname)[i]-33) & 0x3f) | ((((uint8_t *)qname)[qname_len-i-1] & 0x3) << 6);
- uint32_t hash = crc32 (0, data, qname_len);
- if (is_last != unknown) hash = (hash & 0xfffffffe) | is_last;
+ uint64_t hash = (type == CRC64) ? crc64 (0, data, qname_len) : crc32 (0, data, qname_len);
+ if (is_last != unknown) hash = (hash & 0xfffffffffffffffe) | is_last;
return hash;
}
diff --git a/src/qname.h b/src/qname.h
index ea9b50ec..10e50362 100644
--- a/src/qname.h
+++ b/src/qname.h
@@ -22,7 +22,7 @@ typedef packed_enum {
// Illumina-style FASTQ QNAME2 flavors (also appears in Ultima, Singular...)
QF_ILLUM_2bc, QF_ILLUM_1bc, QF_ILLUM_0bc,
// MGI flavors
- QF_MGI_NEW6, QF_MGI_NEW7, QF_MGI_NEW8, QF_MGI_varlen, QF_MGI_r6, QF_MGI_r7, QF_MGI_r8, QF_MGI_ll7, QF_MGI_cl, QF_MGI_rgs8, QF_MGI_rgs8FQ,
+ QF_MGI_NEW6, QF_MGI_NEW7, QF_MGI_NEW8, QF_MGI_varlen, QF_MGI_r6, QF_MGI_die6, QF_MGI_r7, QF_MGI_r8, QF_MGI_ll7, QF_MGI_cl, QF_MGI_rgs8, QF_MGI_rgs8FQ, QF_MGI_coloned,
// PacBio flavors
QF_PACBIO_3, QF_PACBIO_rng, QF_PACBIO_lbl, QF_PACBIO_pln, QF_ONSO,
// Nanopore flavors
@@ -33,7 +33,7 @@ typedef packed_enum {
// Other sequencer flavors
QF_ION_TORR_3, QF_ROCHE_454, QF_HELICOS, QF_SINGULAR, QF_ELEMENT, QF_ELEMENT_bc,
// NCBI flavors
- QF_SRA_L, QF_SRA2, QF_SRA,
+ QF_SRA_L, QF_SRA2, QF_SRA, QF_SRA_sra,
// Consensus alignments flavors
QF_CONSENSUS, QF_CONS,
// Other syntheic (i.e. non-sequencer) flavors
@@ -58,7 +58,8 @@ typedef enum { QTR_SUCCESS, QTR_QNAME_LEN_0, QTR_FIXED_LEN_MISMATCH, QTR_WRONG_
#define QTR_NAME { "SUCCESS", "QNAME_LEN=0", "FIXED_LEN_MISMATCH", "WRONG_Q", "CONTAINER_MISMATCH", "BAD_INTEGER", "BAD_CHARS", "BAD_NUMERIC", "BAD_HEX", "TECH_MISMATCH", "NOT_BARCODE", "NOT_BARCODE2", "NO_MATE", "FAILED_VALIDATE_FUNC"}
extern QnameTestResult qname_test_flavor (STRp(qname), QType q, QnameFlavor qf, bool quiet);
-extern uint32_t qname_calc_hash (QType q, CompIType comp_i, STRp(qname), thool is_last, bool canonical, uint32_t *uncanonical_suffix_len);
+typedef enum { CRC32, CRC64 } CrcType;
+extern uint64_t qname_calc_hash (QType q, CompIType comp_i, STRp(qname), thool is_last, bool canonical, CrcType type, uint32_t *uncanonical_suffix_len);
extern uint32_t qname_hash_change_last (uint32_t hash, bool is_last);
extern void qname_canonize (QType q, rom qname, uint32_t *qname_len, CompIType comp_i);
diff --git a/src/qname_filter.c b/src/qname_filter.c
index 8fa2be66..3d922b46 100644
--- a/src/qname_filter.c
+++ b/src/qname_filter.c
@@ -49,7 +49,7 @@ void qname_filter_initialize_from_file (rom filename, CompIType comp_i)
if (VER(15))
qname_canonize (QNAME1, qSTRa(qname[i].qname), comp_i); // possibly reduces qname_len
- qname[i].hash = qname_calc_hash (QNAME1, comp_i, STRa(qname[i].qname), unknown, false, NULL);
+ qname[i].hash = qname_calc_hash (QNAME1, comp_i, STRa(qname[i].qname), unknown, false, CRC32, NULL);
}
buf_destroy (data); // defined in file_split_lines
@@ -87,7 +87,7 @@ void qname_filter_initialize_from_opt (rom opt, CompIType comp_i)
qname[i].qname_len = str_lens[i];
memcpy (qname[i].qname, strs[i], qname[i].qname_len);
- qname[i].hash = qname_calc_hash (QNAME1, comp_i, STRa(qname[i].qname), unknown, false, NULL);
+ qname[i].hash = qname_calc_hash (QNAME1, comp_i, STRa(qname[i].qname), unknown, false, CRC32, NULL);
}
qsort (STRb(qnames_filter), sizeof(QnameFilterItem), qname_filter_sort_by_hash);
@@ -104,7 +104,7 @@ bool qname_filter_does_line_survive (VBlockP vb, STRp(qname))
if (VER(15))
qname_canonize (QNAME1, qSTRa(qname), vb->comp_i); // possibly reduce qname_len
- uint32_t hash = qname_calc_hash (QNAME1, vb->comp_i, STRa(qname), unknown, false, NULL);
+ uint32_t hash = qname_calc_hash (QNAME1, vb->comp_i, STRa(qname), unknown, false, CRC32, NULL);
QnameFilterItem *ent = binary_search (find_qname_in_filter, QnameFilterItem, qnames_filter, hash);
bool found = false;
diff --git a/src/qname_flavors.h b/src/qname_flavors.h
index e4afcfc1..b784853c 100644
--- a/src/qname_flavors.h
+++ b/src/qname_flavors.h
@@ -373,6 +373,22 @@ CON_MGI_R(8);
#define PX_mgi_R { "", "", "C", "R", "", PX_MATE_FIXED_0_PAD }
+// example: die1_A100004684C001R029011637
+#define CON_MGI_die(n) \
+static SmallContainer con_mgi_die##n = { \
+ .repeats = 1, \
+ .nitems_lo = 6, \
+ .items = { { .dict_id = { _SAM_Q0NAME }, .separator = { CI0_FIXED_0_PAD, 1 } }, /* Die */ \
+ { .dict_id = { _SAM_Q1NAME }, .separator = "C" }, /* Flow cell */ \
+ { .dict_id = { _SAM_Q2NAME }, .separator = { CI0_FIXED_0_PAD, 3 } }, /* Column */ \
+ { .dict_id = { _SAM_Q3NAME }, .separator = { CI0_FIXED_0_PAD, 3 } }, /* Row */ \
+ { .dict_id = { _SAM_Q4NAME }, .separator = { CI0_FIXED_0_PAD, n } }, /* Tile */ \
+ { .dict_id = { _SAM_QmNAME }, I_AM_MATE } }/* Mate */ \
+}
+CON_MGI_die(6);
+
+#define PX_mgi_die { "die", "_", "", "R", "", PX_MATE_FIXED_0_PAD }
+
#define CON_MGI_Rgs(n) /* SAM/BAM only*/ \
static SmallContainer con_mgi_Rgs##n = { \
.repeats = 1, \
@@ -408,6 +424,23 @@ CON_MGI_RgsFQ(8);
#define PX_mgi_RgsFQ { "", "", "", "", "C", "R", "", PX_MATE_FIXED_0_PAD }
+// Example: "DV71-240104001:7:some-string:L02:R014C002:0000:2670"
+static SmallContainer con_mgi_coloned = {
+ .repeats = 1,
+ .nitems_lo = 9,
+ .items = { { .dict_id = { _SAM_Q0NAME }, .separator = ":" }, // ?
+ { .dict_id = { _SAM_Q1NAME }, .separator = ":" }, // ?
+ { .dict_id = { _SAM_Q2NAME }, .separator = ":L" }, // ?
+ { .dict_id = { _SAM_Q3NAME }, .separator = { CI0_FIXED_0_PAD, 2 } }, // Lane
+ { .dict_id = { _SAM_Q4NAME }, .separator = { CI0_FIXED_0_PAD, 3 } }, // Row
+ { .dict_id = { _SAM_Q5NAME }, .separator = { CI0_FIXED_0_PAD, 3 } }, // Column
+ { .dict_id = { _SAM_Q6NAME }, .separator = { CI0_FIXED_0_PAD, 4 } }, // Tile 1st half
+ { .dict_id = { _SAM_Q7NAME }, .separator = { CI0_FIXED_0_PAD, 4 } }, // Tile 2nd half
+ { .dict_id = { _SAM_QmNAME }, I_AM_MATE } } // Mate
+};
+
+#define PX_mgi_coloned { "", "", "", "", ":R", "C", ":", ":", PX_MATE_FIXED_0_PAD }
+
// variant of MGI flavor where Q4NAME is a variable-length integer rather than a fixed-length zero-padded numeric
static SmallContainer con_mgi_varlen = { \
.repeats = 1, \
@@ -930,6 +963,9 @@ static SmallContainer con_ncbi_sra = {
{ .dict_id = { _SAM_Q2NAME } } } // usually sequential number in FASTQ
};
+// used SRA-sra, example: "SRR001666.sra.1"
+#define PX_sra_sra { "", "", "sra." }
+
// example: ERR2708427.1.1
static SmallContainer con_ncbi_sra2 = {
.repeats = 1,
@@ -1097,12 +1133,15 @@ static QnameFlavorStruct qf[] = {
TECH_MGI, TECH_NCBI, QANY, &con_mgi_varlen, no_validate, 0, 3, {4,-1}, {1,2,3,-1}, {2,3,4,-1}, {-1}, 0, 4,3, -1,-1, -1, -1, -1, -1, 0, PX_mgi_varlen },
{}, { QF_MGI_r6, "MGI-R6", { "8A_V100004684L3C001R029011637", "V300014293BL2C001R027005967", "V300003413L4C001R016000000" },
TECH_MGI, TECH_NCBI, QANY, &con_mgi_R6, no_validate, 0, 3, {-1}, {1,2,3,4,-1}, {2,3,4,-1}, {-1}, 0, 4,3, -1,-1, -1, -1, -1, -1, 0, PX_mgi_R },
+ {}, { QF_MGI_die6, "MGI-die6", { "die1_A100004684C001R029011637" }, TECH_MGI, TECH_NCBI, QANY, &con_mgi_die6, no_validate, 0, 6, {-1}, {0,2,3,4,-1}, {2,3,4,-1}, {-1}, 0, 4,3, -1,-1, -1, -1, -1, -1, 0, PX_mgi_die }, // 15.0.67
{}, { QF_MGI_r7, "MGI-R7", { "V300017009_8AL2C001R0030001805", "V300022116L2C001R0010002968", "V300014296L2C001R0013000027", "E100001117L1C001R0030000000", "E1000536L1C002R0020000005" },
TECH_MGI, TECH_NCBI, QANY, &con_mgi_R7, no_validate, 0, 3, {-1}, {1,2,3,4,-1}, {2,3,4,-1}, {-1}, 0, 4,3, -1,-1, -1, -1, -1, -1, 0, PX_mgi_R },
{}, { QF_MGI_rgs8FQ, "MGI-Rgs8FQ", { "CGGTCT-AACCT|ab|E200003777L1C001R00100888074" }, // must be before QF_MGI_r8
TECH_MGI, TECH_NCBI, QNAME1, &con_mgi_RgsFQ8, no_validate, 0, 5, {-1}, {3,4,5,6,-1}, {4,5,6,-1}, {-1}, 0, 6,5, -1,-1, -1, -1, -1, -1, 0, PX_mgi_RgsFQ },
{ QF_MGI_rgs8, "MGI-Rgs8", { "CGGTCT-AACCT|ab|E200003777L1C001R00100888074|2" },
TECH_MGI, TECH_NCBI, QSAM, &con_mgi_Rgs8, no_validate, '|', 6, {-1}, {3,4,5,6,-1}, {4,5,6,-1}, {-1}, 0, 6,5, -1,-1, -1, -1, -1, -1, 0, PX_mgi_Rgs, },
+ {}, { QF_MGI_coloned, "MGI-coloned", { "DV71-240104001:7:some-string:L02:R014C002:0000:2670" },
+ TECH_MGI, TECH_NCBI, QANY, &con_mgi_coloned, no_validate, 0, 9, {1,-1}, {3,4,5,6,7,-1}, {1,3,4,5,6,7,-1}, {-1}, 0, 6,-1, -1,-1, -1, -1, -1, -1, 0, PX_mgi_coloned }, // 15.0.67
{}, { QF_MGI_r8, "MGI-R8", { "V300046476L1C001R00100001719" }, TECH_MGI, TECH_NCBI, QANY, &con_mgi_R8, no_validate, 0, 3, {-1}, {1,2,3,4,-1}, {2,3,4,-1}, {-1}, 0, 4,3, -1,-1, -1, -1, -1, -1, 0, PX_mgi_R },
{}, { QF_MGI_ll7, "MGI-LL7", { "DP8400010271TLL1C005R0511863479" }, TECH_MGI, TECH_NCBI, QANY, &con_mgi_LL7, no_validate, 0, 4, {-1}, {1,2,3,4,-1}, {2,3,4,-1}, {-1}, 0, 4,3, -1,-1, -1, -1, -1, -1, 0, PX_mgi_LL },
{}, { QF_MGI_cl, "MGI-CL", { "CL100025298L1C002R050_244547" }, TECH_MGI, TECH_NCBI, QANY, &con_mgi_CL, no_validate, 0, 6, {4,-1}, {1,2,3,-1}, {2,3,4,-1}, {-1}, 0, 4,3, -1,-1, -1, -1, -1, -1, 0, PX_mgi_CL },
@@ -1130,7 +1169,7 @@ static QnameFlavorStruct qf[] = {
{}, { QF_ILLUM_5rng, "Illum-oldR", { "NOVID_3053_FC625AGAAXX:6:1:1069:11483:0,84" },
TECH_ILLUM, TECH_NCBI, QANY, &con_illumina_5rng, no_validate, ':', 6, {1,2,3,4,5,6,-1}, {-1}, {1,2,3,4,5,6,-1}, {-1}, 0, -1,-1, -1,-1, 6, -1, -1, -1, },
{}, { QF_ILLUM_6, "Illum-old6", { "HWI-ST156_288:4:1:10000:110537:0" }, TECH_ILLUM, TECH_NCBI, QANY, &con_illumina_6, no_validate, 0, 5, {1,2,3,4,5,-1}, {-1}, {1,2,3,4,-1}, {-1}, 0, -1,-1, -1,-1, -1, -1, -1, -1, },
- {}, { QF_ROCHE_454, "Roche-454", { "000050_1712_0767" }, TECH_454, TECH_NCBI, QANY, &con_roche_454, no_validate, 0, 2, {-1}, {0,1,2,-1}, {-1}, {-1}, 0, -1,-1, -1,-1, -1, -1, -1, -1, 16, PX_roche_454 },
+ {}, { QF_ROCHE_454, "Roche-454", { "000050_1712_0767" }, TECH_LS454, TECH_NCBI, QANY, &con_roche_454, no_validate, 0, 2, {-1}, {0,1,2,-1}, {-1}, {-1}, 0, -1,-1, -1,-1, -1, -1, -1, -1, 16, PX_roche_454 },
{}, { QF_HELICOS, "Helicos", { "VHE-242383071011-15-1-0-2" }, TECH_HELICOS, TECH_NCBI, QANY, &con_helicos, no_validate, 0, 5, {2,3,4,5,-1}, {-1}, {-1}, {-1}, 0, -1,-1, -1,-1, -1, -1, -1, -1, },
{}, { QF_PACBIO_3, "PacBio-3", { "0ae26d65_70722_4787" }, TECH_PACBIO, TECH_NCBI, QANY, &con_pacbio_3, no_validate, 0, 2, {1,2,-1}, {0,-1}, {1,2,-1}, {0,-1}, 0, -1,-1, -1,-1, -1, -1, -1, -1, 0, PX_pacbio_3 },
{ QF_PACBIO_rng, "PacBio-Range", { "m130802_221257_00127_c100560082550000001823094812221334_s1_p0/128361/872_4288" },
@@ -1149,6 +1188,7 @@ static QnameFlavorStruct qf[] = {
{ QF_SRA_L, "NCBI_SRA_L", { "SRR11215720.1_1_length=120" }, TECH_NCBI, TECH_NONE, Q1or3, &con_ncbi_sra_L, val_sra, 0, 10, {2,3,4,-1}, {-1}, {2,3,4,-1}, {-1}, 0, 3,-1, -1,-1, 4, -1, -1, -1, 0, PX_sra_len },
{ QF_SRA2, "NCBI-SRA2", { "ERR2708427.1.1" }, TECH_NCBI, TECH_NONE, Q1or3, &con_ncbi_sra2, val_sra, 0, 2, {2,3,-1}, {-1}, {2,3,-1}, {-1}, 0, 2,-1, -1,-1, -1, -1, -1, -1, .is_mated=true },
{ QF_SRA, "NCBI-SRA", { "SRR001666.1" }, TECH_NCBI, TECH_NONE, Q1or3, &con_ncbi_sra, val_sra, 0, 1, {2,-1}, {-1}, {2,-1}, {-1}, 0, 2,-1, -1,-1, -1, -1, -1, -1, },
+ { QF_SRA_sra, "NCBI-SRA-sra", { "SRR001666.sra.1" }, TECH_NCBI, TECH_NONE, Q1or3, &con_ncbi_sra, val_sra, 0, 5, {2,-1}, {-1}, {2,-1}, {-1}, 0, 2,-1, -1,-1, -1, -1, -1, -1, 0, PX_sra_sra }, // 15.0.67
// QNAME2 - Illumina, Singular, Element... - no mate, as mate is part of QNAME in this case
{ QF_ILLUM_2bc, "Illum-2bc", { "2:N:0:CTGAAGCT+ATAGAGGC" }, TECH_NONE, TECH_ANY, QNAME2, &con_qname2_2bc, no_validate, 0, 4, {0,2,-1}, {-1}, {0,-1}, {-1}, 0, -1,-1, -1,-1, -1, 3, -1, -1, 0, PX_illumina_2bc },
diff --git a/src/reconstruct.c b/src/reconstruct.c
index f7b2d47d..de5666c3 100644
--- a/src/reconstruct.c
+++ b/src/reconstruct.c
@@ -478,7 +478,7 @@ void reconstruct_one_snip (VBlockP vb, ContextP snip_ctx,
reconstruct_from_local_text (vb, base_ctx, reconstruct); // this will call us back recursively with the snip retrieved
break;
- case LT_CODEC: // snip can optionally be the length of the sequence to be reconstructed
+ case LT_CODEC: // snip can optionally be the length of the sequence to be reconstructed
codec_args[base_ctx->lcodec].reconstruct (vb, base_ctx->lcodec, base_ctx, (snip_len ? atoi(snip) : vb->seq_len), reconstruct);
break;
diff --git a/src/reconstruct.h b/src/reconstruct.h
index 71ecb87f..b5a625ba 100644
--- a/src/reconstruct.h
+++ b/src/reconstruct.h
@@ -19,12 +19,13 @@ typedef struct { char s[100]; } PizDisQname;
extern PizDisQname piz_dis_qname (VBlockP vb); // for ASSPIZ
extern void asspiz_text (VBlockP vb, FUNCLINE);
+extern StrTextLong piz_advise_biopsy (VBlockP vb);
#define ASSPIZ(condition, format, ...) ({ \
if (__builtin_expect (!(condition), 0)) { \
DO_ONCE { /* first thread to fail at this point prints error and starts exit flow, other threads stall */ \
- asspiz_text ((VBlockP)vb, __FUNCLINE);\
- fprintf (stderr, format "\n", __VA_ARGS__); \
+ asspiz_text (VB, __FUNCLINE);\
+ fprintf (stderr, format "\n%s\n", __VA_ARGS__, piz_advise_biopsy(VB).s);\
exit_on_error(true); \
} \
else stall(); \
diff --git a/src/sam.h b/src/sam.h
index b9369cf0..c54cf371 100644
--- a/src/sam.h
+++ b/src/sam.h
@@ -858,6 +858,7 @@ SPECIAL (SAM, 71, DEMUX_BY_REVCOMP_MATE, sam_piz_special_DEMUX_BY_REVCOMP_MATE);
SPECIAL (SAM, 72, crdna_GP, sam_piz_special_crdna_GP); // introduced 15.0.60
SPECIAL (SAM, 73, DEMUX_MAPQ, sam_piz_special_DEMUX_MAPQ); // introduced 15.0.61
SPECIAL (SAM, 74, CPU_XL, sam_piz_special_CPU_XL); // introduced 15.0.65
+SPECIAL (SAM, 75, ML_REPEATS, sam_piz_special_ML_REPEATS); // introduced 15.0.67
#define SAM_LOCAL_GET_LINE_CALLBACKS(dt) \
{ dt, _OPTION_BD_BI, sam_zip_BD_BI }, \
diff --git a/src/sam_cigar.c b/src/sam_cigar.c
index 84334154..72ec4661 100644
--- a/src/sam_cigar.c
+++ b/src/sam_cigar.c
@@ -38,7 +38,7 @@ static const uint8_t cigar_char_to_op[256] = { [0 ... 255]=BC_INVALID,
// CIGAR snip opcodes - part of the file format
#define COPY_MATE_MC_Z ((char)0x80) // copy from mate's MC:Z
-#define COPY_PRIM_SA_CIGAR ((char)0x81) // v14: copy from prim's SA_CIGAR
+#define COPY_SAGGY_PRIM_SA_CIGAR ((char)0x81) // v14: copy from prim's SA_CIGAR
#define COPY_QNAME_LENGTH ((char)0x82) // v14: derive CIGAR from qname's length= component
#define SQUANK ((char)0x83) // v14
#define COPY_QNAME_LENGTH_NO_CIGAR ((char)0x84) // v15.0.26 - get seq_len from QNAME, and CIGAR is *
@@ -805,8 +805,8 @@ void sam_seg_CIGAR (VBlockSAMP vb, ZipDataLineSAMP dl, uint32_t last_cigar_len,
}
// case: copy from same-vb prim (note: saggy_line_i can only be set in the MAIN component)
- else if (has(SA_Z) && segconf.sam_has_SA_Z && sam_has_prim && sam_cigar_seg_is_predicted_by_saggy_SA (vb, vb->last_cigar, last_cigar_len)) {
- cigar_snip[cigar_snip_len++] = COPY_PRIM_SA_CIGAR; // always at cigar_snip[2]
+ else if (has(SA_Z) && segconf.sam_has_SA_Z && sam_has_prim && sam_line_is_depn (dl) && sam_cigar_seg_is_predicted_by_saggy_SA (vb, vb->last_cigar, last_cigar_len)) {
+ cigar_snip[cigar_snip_len++] = COPY_SAGGY_PRIM_SA_CIGAR; // always at cigar_snip[2]
seg_by_did (VB, STRa(cigar_snip), SAM_CIGAR, add_bytes);
}
@@ -913,8 +913,8 @@ SPECIAL_RECONSTRUCTOR_DT (sam_cigar_special_CIGAR)
ASSPIZ0 (snip_len, "Unable to find buddy MC:Z in history");
break;
- case COPY_PRIM_SA_CIGAR: // copy the predicted alignment in same-vb prim line's SA:Z
- sam_piz_SA_get_prim_item (vb, SA_CIGAR, pSTRa(snip));
+ case COPY_SAGGY_PRIM_SA_CIGAR: // copy the predicted alignment in same-vb prim line's SA:Z
+ sam_piz_SA_get_saggy_prim_item (vb, SA_CIGAR, pSTRa(snip));
stoh = segconf.SA_HtoS ? sam_cigar_S_to_H (vb, (char*)STRa(snip), false) : (StoH){};
break;
diff --git a/src/sam_deep.c b/src/sam_deep.c
index 6aa9fefb..3fd31b8f 100644
--- a/src/sam_deep.c
+++ b/src/sam_deep.c
@@ -18,73 +18,67 @@
static void sam_deep_zip_show_index_stats (void)
{
ARRAY (ZipZDeep, deep_ents, z_file->deep_ents);
+ ARRAY (uint32_t, index, z_file->deep_index);
- for (int i=BY_SEQ; i <= BY_QNAME; i++) {
- ARRAY (uint32_t, index, z_file->deep_index_by[i]);
+ iprintf ("\nz_file.deep_index.len=%"PRIu64" (%s) z_file.deep_ents.len=%"PRIu64" (%s)\n",
+ z_file->deep_index.len, str_size (z_file->deep_index.len * sizeof (uint32_t)).s,
+ z_file->deep_ents.len, str_size (z_file->deep_ents.len * sizeof (ZipZDeep)).s);
- iprintf ("\nz_file.deep_index_by[%s].len=%"PRIu64" (%s) z_file.deep_ents.len=%"PRIu64" (%s)\n",
- by_names[i],
- z_file->deep_index_by[i].len, str_size (z_file->deep_index_by[i].len * sizeof (uint32_t)).s,
- z_file->deep_ents.len, str_size (z_file->deep_ents.len * sizeof (ZipZDeep)).s);
+ #define NUM_LENS 64
+ uint64_t used=0;
+ uint64_t lens[NUM_LENS] = {}, longer=0; // each entry i contains number linked lists which are of length i; last entry i - linked lists of i or longer
+ uint64_t longest_len = 0;
+ uint32_t longest_len_hash;
- #define NUM_LENS 64
- uint64_t used=0;
- uint64_t lens[NUM_LENS] = {}, longer=0; // each entry i contains number linked lists which are of length i; last entry i - linked lists of i or longer
- uint64_t longest_len = 0;
- uint32_t longest_len_hash;
+ uint64_t total_ents_traversed = 0; // total ents traversed on linked lists when segging FASTQ reads
- uint64_t total_ents_traversed = 0; // total ents traversed on linked lists when segging FASTQ reads
-
- for (uint64_t hash=0; hash < index_len; hash++) {
-
- uint64_t this_len=0;
- for (uint32_t ent_i = index[hash]; ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next[i])
- this_len++;
-
- if (this_len) used++;
+ for (uint64_t hash=0; hash < index_len; hash++) {
+ uint64_t this_len=0;
+ for (uint32_t ent_i = index[hash]; ent_i != NO_NEXT; ent_i = deep_ents[ent_i].next)
+ this_len++;
+
+ if (this_len) used++;
- if (this_len < NUM_LENS)
- lens[this_len]++;
+ if (this_len < NUM_LENS)
+ lens[this_len]++;
- else {
- longer++;
- total_ents_traversed += this_len // number of entries that will access this linked list
- * this_len; // length of linked list traversed for each entry accessing this list
+ else {
+ longer++;
+ total_ents_traversed += this_len // number of entries that will access this linked list
+ * this_len; // length of linked list traversed for each entry accessing this list
- if (this_len > longest_len) {
- longest_len = this_len;
- longest_len_hash = hash;
- }
+ if (this_len > longest_len) {
+ longest_len = this_len;
+ longest_len_hash = hash;
}
}
+ }
- if (longest_len > 16384)
- iprintf ("\nFYI: %s index entry %u has %"PRIu64 " deep entries on its linked list. This is excessively large and might cause slowness.%s",
- by_names[i], longest_len_hash, longest_len, report_support_if_unexpected()); // note: error only shows with --show-deep... not very useful
+ if (longest_len > 16384)
+ iprintf ("\nFYI: deep_index entry %u has %"PRIu64 " deep entries on its linked list. This is excessively large and might cause slowness.%s",
+ longest_len_hash, longest_len, report_support_if_unexpected()); // note: error only shows with --show-deep... not very useful
- iprintf ("\nDeep index[%s] entries used: %"PRIu64" / %"PRIu64" (%.1f%%)\n",
- by_names[i], used, index_len, 100.0 * (double)used / (double)index_len);
+ iprintf ("\ndeep_index entries used: %"PRIu64" / %"PRIu64" (%.1f%%)\n",
+ used, index_len, 100.0 * (double)used / (double)index_len);
- iprintf ("Deep index[%s] linked list length histogram:\n", by_names[i]);
- for (int this_len=0; this_len < NUM_LENS; this_len++) {
- if (!lens[this_len]) continue;
+ iprint0 ("\ndeep_index linked list length histogram:\n");
+ for (int this_len=0; this_len < NUM_LENS; this_len++) {
+ if (!lens[this_len]) continue;
- iprintf ("%3u: %"PRIu64"\n", this_len, lens[this_len]);
+ iprintf ("%3u: %"PRIu64"\n", this_len, lens[this_len]);
- // our algorithm requires traversal of entire linked list for each entry. calculate the length of the list
- // traversed for the average read (simplifying assumption: there are the same )
- total_ents_traversed += (uint64_t)this_len // number of entries that will access this linked list
- * (uint64_t)this_len // length of linked list traversed for each entry accessing this list
- * (uint64_t)lens[this_len]; // number of linked lists of this length
- }
+ // our algorithm requires traversal of entire linked list for each entry. calculate the length of the list
+ // traversed for the average read (simplifying assumption: there are the same)
+ total_ents_traversed += (uint64_t)this_len // number of entries that will access this linked list
+ * (uint64_t)this_len // length of linked list traversed for each entry accessing this list
+ * (uint64_t)lens[this_len]; // number of linked lists of this length
+ }
- if (longer) iprintf ("longer: %"PRIu64"\n", longer);
+ if (longer) iprintf ("longer: %"PRIu64"\n", longer);
- // this is approximate: assuming # of ents hashed is equal to the number of reads (only for BY_SEQ - we traverse BY_QNAME only for trimmed reads)
- if (i==BY_SEQ)
- iprintf ("\nFASTQ SEG: BY_SEQ index linked list length traversed on average: %.2f (total_ents_traversed=%"PRIu64")\n",
- (double)total_ents_traversed / (double)deep_ents_len, total_ents_traversed);
- }
+ // this is approximate: assuming # of ents hashed is equal to the number of reads
+ iprintf ("\nFASTQ SEG: deep_index linked list length traversed on average: %.2f (total_ents_traversed=%"PRIu64")\n",
+ (double)total_ents_traversed / (double)deep_ents_len, total_ents_traversed);
}
static void sam_deep_zip_display_reasons (void)
@@ -181,12 +175,13 @@ void sam_deep_set_QUAL_hash (VBlockSAMP vb, ZipDataLineSAMP dl, STRp(qual))
dl->deep_hash.qual = deep_qual_hash (VB, STRa(qual), dl->FLAG.rev_comp);
- if (flag.show_deep == 2 && deephash_issame (dl->deep_hash, flag.debug_deep_hash))
- iprintf ("%s Found deep_hash=%u,%u,%u\nQNAME=\"%.*s\"\nSEQ=\"%.*s\"\nQUAL=\"%.*s\"\n",
+ if (flag.show_deep == SHOW_DEEP_ALL ||
+ (flag.show_deep == SHOW_DEEP_ONE_HASH && deephash_issame (dl->deep_hash, flag.debug_deep_hash)))
+ iprintf ("%s Recording deep_hash=%016"PRIx64",%08x,%08x\nQNAME=\"%.*s\"\nSEQ=\"%.*s\"\nQUAL=\"%.*s\"\n\n",
LN_NAME, DEEPHASHf(dl->deep_hash), STRfw(dl->QNAME), dl->SEQ.len, vb->textual_seq_str, STRfw(dl->QUAL));
}
-// ZIP compute thread: mutex-protected callback from ctx_merge_in_vb_ctx during merge: add VB's deep_hash to z_file->deep_index_by_*/deep_ents
+// ZIP compute thread: mutex-protected callback from ctx_merge_in_vb_ctx during merge: add VB's deep_hash to z_file->deep_index/deep_ents
void sam_deep_zip_merge (VBlockP vb_)
{
if (!flag.deep || zip_is_biopsy) return;
@@ -195,28 +190,26 @@ void sam_deep_zip_merge (VBlockP vb_)
START_TIMER;
// initialize - first VB merging
- if (!z_file->deep_index_by[BY_SEQ].len) {
- double est_num_vbs = MAX_(1, (double)txtfile_get_seggable_size() / (double)Ltxt * 1.05/* additonal PRIM VBs */);
- uint64_t est_num_lines = MIN_(MAX_ENTRIES, (uint64_t)(est_num_vbs * (double)vb->lines.len * 1.15)); // inflate the estimate by 15% - to reduce hash contention, and to reduce realloc chances for z_file->deep_ents
+ if (!z_file->deep_index.len) {
+ double est_num_vbs = MAX_(1, (double)txt_file->est_seggable_size / (double)Ltxt * 1.05/* additonal PRIM VBs */);
+ uint64_t est_num_lines = MIN_(MAX_ENTRIES, (uint64_t)(est_num_vbs * (double)(vb->lines.len32 + vb->num_gc_lines) * 1.15)); // inflate the estimate by 15% - to reduce hash contention, and to reduce realloc chances for z_file->deep_ents
// number of bits of hash table size (maximum 31)
int clzll = (int)__builtin_clzll (est_num_lines * 2);
num_hash_bits = MIN_(31, 64 - clzll); // num hash bits
- for (int by=BY_SEQ; by <= BY_QNAME; by++) {
- z_file->deep_index_by[by].can_be_big = true;
+ z_file->deep_index.can_be_big = true;
- // pointer into deep_ents - initialize to NO_NEXT
- buf_alloc_exact_255 (evb, z_file->deep_index_by[by], ((uint64_t)1 << num_hash_bits), uint32_t, "z_file->deep_index_by");
- }
+ // pointers into deep_ents - initialize to NO_NEXT
+ buf_alloc_exact_255 (evb, z_file->deep_index, ((uint64_t)1 << num_hash_bits), uint32_t, "z_file->deep_index");
// note: no initialization of deep_ents - not needed and it takes too long (many seconds to get pages from kernel) while all threads are waiting for vb=1
z_file->deep_ents.can_be_big = true;
buf_alloc_exact (evb, z_file->deep_ents, MAX_(est_num_lines, vb->lines.count), ZipZDeep, "z_file->deep_ents");
if (flag.show_deep)
- printf ("num_hash_bits=%u est_num_lines*1.15=%"PRIu64" clzll=%d deep_index_by[BY_SEQ].len=%"PRIu64" deep_index_by[BY_QNAME].len=%"PRIu64" deep_ents.len=%"PRIu64"\n",
- num_hash_bits, est_num_lines, clzll, z_file->deep_index_by[BY_SEQ].len, z_file->deep_index_by[BY_QNAME].len, z_file->deep_ents.len);
+ printf ("num_hash_bits=%u est_num_lines*1.15=%"PRIu64" clzll=%d deep_index.len=%"PRIu64" deep_ents.len=%"PRIu64"\n",
+ num_hash_bits, est_num_lines, clzll, z_file->deep_index.len, z_file->deep_ents.len);
z_file->deep_ents.len = 0;
}
@@ -231,10 +224,10 @@ void sam_deep_zip_merge (VBlockP vb_)
uint32_t ent_i = z_file->deep_ents.len32;
- uint32_t *deep_index_by[2] = { B1ST32 (z_file->deep_index_by[BY_SEQ]), B1ST32 (z_file->deep_index_by[BY_QNAME]) };
+ uint32_t *deep_index = B1ST32 (z_file->deep_index);
ARRAY (ZipZDeep, deep_ents, z_file->deep_ents);
- uint32_t mask = bitmask32 (num_hash_bits); // num_hash_bits is calculated upon first merge
+ uint64_t mask = bitmask64 (num_hash_bits); // num_hash_bits is calculated upon first merge
uint32_t deep_line_i = 0; // line_i within the VB, but counting only deepable lines that have SEQ.len > 0 and are not monochar
for_buf2 (ZipDataLineSAM, dl, line_i, vb->lines) {
if (!dl->is_deepable) continue; // case: non-deepable: secondary or supplementary line, no sequence or mono-base
@@ -247,15 +240,12 @@ void sam_deep_zip_merge (VBlockP vb_)
}
// add to indices (by SEQ and by QNAME)
- uint32_t prev_head[2];
- for (int by=BY_SEQ; by <= BY_QNAME; by++) {
- uint32_t hash = (by==BY_SEQ ? dl->deep_hash.seq : dl->deep_hash.qname) & mask;
- prev_head[by] = deep_index_by[by][hash];
- deep_index_by[by][hash] = ent_i; // new head of linked list
- }
+ uint32_t hash = dl->deep_hash.qname & mask;
+ uint32_t prev_head = deep_index[hash];
+ deep_index[hash] = ent_i; // new head of linked list
- // create new entry - which is now the head of both linked links (BY_SEQ, BY_QNAME)
- deep_ents[ent_i++] = (ZipZDeep){ .next = { prev_head[0], prev_head[1] }, // previous head, if there is one, will now be the 2nd element
+ // create new entry - which is now the head of linked list
+ deep_ents[ent_i++] = (ZipZDeep){ .next = prev_head, // this will now be the 2nd element
.seq_len = dl->SEQ.len,
.hash = dl->deep_hash,
.vb_i = vb->vblock_i,
@@ -316,7 +306,8 @@ static void sam_piz_deep_add_qname (VBlockSAMP vb)
BNXT32(vb->deep_index) = vb->deep_ents.len32; // index in deep_ents of data of current deepable line
BNXT(PizZDeepFlags, vb->deep_ents) = (PizZDeepFlags){}; // initialize flags
- if (segconf.deep_qtype == QNONE || flag.seq_only || flag.qual_only)
+ if (segconf.deep_qtype == QNONE || // back comp - QNONE was possible up to 15.0.66 (match by non-trimmed SEQ and QUAL)
+ flag.seq_only || flag.qual_only)
goto done;
ASSPIZ (qname_len <= SAM_MAX_QNAME_LEN, "QNAME.len=%u, but per BAM specfication it is limited to %u characters. QNAME=\"%.*s\"", SAM_MAX_QNAME_LEN, qname_len, STRf(qname));
@@ -545,11 +536,11 @@ static void sam_piz_deep_finalize_ents (void)
for (uint32_t vb_idx=0; vb_idx < z_file->deep_index.len32; vb_idx++) {
BufferP deep_index = B(Buffer, z_file->deep_index, vb_idx);
- // too much output, uncomment when needed
- // BufferP deep_ents = B(Buffer, z_file->deep_ents, vb_idx);
- // if (flag.show_deep)
- // iprintf ("vb_idx=%u vb=%s/%u start_deepable_line=%"PRIu64" num_deepable_lines=%u ents=(%p, %u)\n",
- // vb_idx, comp_name(deep_ents->prm32[1]), deep_ents->prm32[0], next_deepable_line, deep_index->len32, deep_ents->data, deep_ents->len32);
+ if (flag.show_deep == SHOW_DEEP_ALL) {
+ BufferP deep_ents = B(Buffer, z_file->deep_ents, vb_idx);
+ iprintf ("vb_idx=%u vb=%s/%u start_deepable_line=%"PRIu64" num_deepable_lines=%u ents=(%p, %u)\n",
+ vb_idx, comp_name(deep_ents->prm32[1]), deep_ents->prm32[0], next_deepable_line, deep_index->len32, deep_ents->data, deep_ents->len32);
+ }
BNXT64(z_file->vb_start_deep_line) = next_deepable_line;
next_deepable_line += deep_index->len;
@@ -577,8 +568,6 @@ void sam_piz_deep_grab_deep_ents (VBlockSAMP vb)
{
START_TIMER;
- #define num_deepable_sam_vbs z_file->deep_index.param
-
// two cases: 1. in FASTQ-only reconstruction (eg --R1), DEPN items have need_recon=false, and hence will not recontructed and
// this function will not be called for them. When unbinding (SAM+FASTQ), it will be called for DEPN VBs, and return here.
if (IS_DEPN(vb)) return; // DEPN VBs have only SEC/SUPP alignments, and hence don't contribute to Deep data
diff --git a/src/sam_fields.c b/src/sam_fields.c
index b62a38f7..df1e48b8 100644
--- a/src/sam_fields.c
+++ b/src/sam_fields.c
@@ -347,7 +347,11 @@ static bool sam_seg_MM_Z_item (VBlockP vb, ContextP ctx,
rom arr = comma + 1;
uint32_t arr_len = mm_item_len + mm_item - arr;
- seg_array (vb, ctx_get_ctx (vb, segconf.MM_con.items[1].dict_id), ctx->st_did_i, STRa(arr), ',', 0, false, true, sub_dict_id (_OPTION_MM_Z, 'A'), arr_len);
+
+ // note: we seg MNM:Z repeats as "special" and copy them from ML:Z if confirmed to be the same
+ seg_array_(vb, ctx_get_ctx (vb, segconf.MM_con.items[1].dict_id), ctx->st_did_i, STRa(arr), ',', 0, false, true, sub_dict_id (_OPTION_MM_Z, 'A'),
+ SAM_SPECIAL_ML_REPEATS, ctx_encountered_in_line (VB, OPTION_ML_B_C) ? CTX(OPTION_ML_B_C)->last_value.i/*ML's n_repeats*/ : -1, // note: will only use special if MM:Z has a single item (otherwise ML:B repeats are distributed between all MM:Z items)
+ arr_len);
}
// trivial but legal MM:Z field: "MM:Z:C+m?;"
@@ -366,6 +370,13 @@ static void sam_seg_MM_Z (VBlockSAMP vb, STRp(mm), unsigned add_bytes)
seg_array_by_callback (VB, CTX(OPTION_MM_Z), STRa(mm), ';', sam_seg_MM_Z_item, 0, 0, add_bytes);
}
+// used by container_reconstruct to retrieve the number of repeats of ML:B, and use that for MM:Z subcontext MNM:Z
+SPECIAL_RECONSTRUCTOR (sam_piz_special_ML_REPEATS)
+{
+ new_value->i = CTX(OPTION_ML_B_C)->last_value.i; // number of repeats because ML.store=STORE_INDEX
+ return HAS_NEW_VALUE;
+}
+
// ----------------------------------------------------------------------------------------------------------
// SM:i: Template-independent mapping quality
// ----------------------------------------------------------------------------------------------------------
@@ -1292,6 +1303,9 @@ void sam_seg_array_one_ctx (VBlockSAMP vb, ZipDataLineSAMP dl, DictId dict_id, u
ctx_set_last_value (VB, con_ctx, sum);
}
+ if (con_ctx->flags.store == STORE_INDEX)
+ ctx_set_last_value (VB, con_ctx, (int64_t)repeats);
+
if (callback)
callback (vb, elem_ctx, cb_param, local_start, repeats);
@@ -1693,6 +1707,8 @@ DictId sam_seg_aux_field (VBlockSAMP vb, ZipDataLineSAMP dl, bool is_bam,
case _OPTION_zm_i: COND(segconf.sam_has_zm_by_Q1NAME, sam_seg_pacbio_zm (vb, numeric.i, add_bytes));
+ // case _OPTION_ql_Z: _OPTION_qt_Z: // better off as snips that QUAL-like context
+
// Illumina DRAGEN fields
case _OPTION_sd_f: COND (MP(DRAGEN), sam_dragen_seg_sd_f (vb, dl, STRa(value), numeric, add_bytes));
diff --git a/src/sam_header.c b/src/sam_header.c
index f9f524a6..fa9c19db 100644
--- a/src/sam_header.c
+++ b/src/sam_header.c
@@ -392,17 +392,46 @@ static void sam_header_zip_inspect_RG_lines (BufferP txt_header)
// advance to point to first @RG line
#define IS_RG(s) (s[1] == 'R' && s[2] == 'G' && s[3] == '\t')
#define IS_ID(s) (s[0] == 'I' && s[1] == 'D' && s[2] == ':')
+ #define IS_PL(s) (s[0] == 'P' && s[1] == 'L' && s[2] == ':')
while ((hdr = strchr (hdr, '@')))
if (IS_RG(hdr)) {
str_split_by_tab (hdr, after - hdr, 32, NULL, false, false); // also advances hdr to after the newline
-
- if (n_flds)
- for (int i=1; i < n_flds; i++)
- if (IS_ID (flds[i])) {
- ctx_populate_zf_ctx (OPTION_RG_Z, flds[i]+3, fld_lens[i]-3, WORD_INDEX_NONE);
- break;
- }
+
+ for (int i=1; i < n_flds; i++)
+ if (IS_ID (flds[i]))
+ ctx_populate_zf_ctx (OPTION_RG_Z, flds[i]+3, fld_lens[i]-3, WORD_INDEX_NONE);
+
+ else if (!segconf.tech_by_RG && IS_PL (flds[i])) {
+ SAFE_NUL(&flds[i][fld_lens[i]]);
+
+ static struct { rom name/*case insensitive*/; SeqTech tech; } PL_to_tech[] = {
+ // standard // observed non-standard
+ { "Illumina", TECH_ILLUM },
+ { "DNBSEQ", TECH_MGI }, { "mgi", TECH_MGI },
+ { "PacBio", TECH_PACBIO },
+ { "Ultima", TECH_ULTIMA },
+ { "ONT", TECH_NANOPORE },
+ { "IonTorrent", TECH_IONTORR },
+ { "Element", TECH_ELEMENT },
+ { "Singular", TECH_SINGLR },
+ { "Capillary", TECH_CAPILLARY },
+ { "Helicos", TECH_HELICOS },
+ { "LS454", TECH_LS454 },
+ { "SOLiD", TECH_SOLID },
+ };
+
+ for (int pl=0; pl < ARRAY_LEN(PL_to_tech); pl++)
+ if (str_case_compare (flds[i]+3, PL_to_tech[pl].name, NULL)) {
+ segconf.tech_by_RG = PL_to_tech[pl].tech; // note: this is very unreliable, our segconf.tech is a much better predictor of the true sequencing technology. In many BAM files, this is missing, contains non-standard values, and is sometimes simply false (e.g. old Ultima files have LS454)
+ break;
+ }
+
+ if (!segconf.tech_by_RG)
+ strncpy (segconf.tech_by_RG_unidentified, flds[i]+3, sizeof(segconf.tech_by_RG_unidentified)-1);
+
+ SAFE_RESTORE;
+ }
}
else hdr++;
diff --git a/src/sam_piz.c b/src/sam_piz.c
index 6b74a8b9..83b19e7c 100644
--- a/src/sam_piz.c
+++ b/src/sam_piz.c
@@ -58,7 +58,7 @@ void sam_piz_genozip_header (ConstSectionHeaderGenozipHeaderP header)
if (VER(15)) {
segconf.deep_qtype = header->sam.segconf_deep_qname1 ? QNAME1
: header->sam.segconf_deep_qname2 ? QNAME2
- : QNONE;
+ : QNONE; // no deep, or up to 15.0.66 possibly Deep by full SEQ+QUAL match (without QNAME matching)
segconf.deep_no_qual = header->sam.segconf_deep_no_qual;
segconf.deep_N_fq_score = header->sam.segconf_deep_N_fq_score;
segconf.use_insertion_ctxs = header->sam.segconf_use_ins_ctxs;
diff --git a/src/sam_private.h b/src/sam_private.h
index 24ff4f0b..10994c8f 100644
--- a/src/sam_private.h
+++ b/src/sam_private.h
@@ -219,7 +219,7 @@ typedef struct {
typedef struct VBlockSAM {
VBLOCK_COMMON_FIELDS
- // --------- current line before every line by sam_reset_line) ----------
+ // --------- current line - reset before every line by sam_reset_line() ----------
Buffer textual_seq; // ZIP/PIZ: BAM: contains the textual SEQ (PIZ: used unless flags.no_textual_seq)
Buffer textual_cigar; // ZIP: Seg of BAM, PIZ: store CIGAR in sam_cigar_analyze
Buffer binary_cigar; // ZIP/PIZ: BAM-style CIGAR representation, generated in sam_cigar_analyze. binary_cigar.next is used by sam_seg_SEQ
@@ -243,10 +243,9 @@ typedef struct VBlockSAM {
#define first_sam_piz_vb_ff_per_line mate_line_i
LineIType mate_line_i; // Seg/PIZ: the mate of this line.
LineIType saggy_line_i; // Seg/PIZ: without gencomp: a previous line with the same sag as the current line.
-
#define after_sam_vb_ff_per_line seq_is_monochar
- #define first_sam_vb_zero_per_line seq_is_monochar
+ #define first_sam_vb_zero_per_line seq_is_monochar
bool seq_is_monochar; // PIZ only
bool line_not_deepable; // PIZ only
bool cigar_missing; // ZIP/PIZ: current line: SAM "*" BAM: n cigar op=0
@@ -303,6 +302,7 @@ typedef struct VBlockSAM {
// gencomp stuff
uint32_t main_vb_info_i; // ZIP SAM MAIN: index of entry in z_file->vb_info[0] for this VB
+ uint32_t num_gc_lines; // ZIP: number of lines removed from this VB and sent to gencomp (0 if not MAIN VB)
// sag stuff
uint32_t plsg_i; // PIZ: prim_vb: index of this VB in the plsg array
@@ -448,7 +448,6 @@ typedef struct CigarAnalItem {
} CigarAnalItem;
#define ALN_NUM_ALNS_BITS 12 // determines max number of alignments (including primary) in a SAG (note: sam_load_groups_add_SA_alns assumes it's at most 16 bits)
-#define PRIM_ALN_I_BITS 7 // determines the max index of a primary alignment in a full SA:Z alignment list
#define GRP_SEQ_LEN_BITS 26 // determines the maximum seq_len of any primary alignment
#define GRP_QUAL_COMP_LEN_BITS 22 // determines max length of compressed QUAL of any primary alignment in the file
#define GRP_AS_BITS 16
@@ -489,7 +488,7 @@ typedef struct Sag { // 32 bytes (= 4 x uint64_t)
uint64_t first_grp_in_vb : 1; // This group is the first group in its PRIM vb
uint64_t no_qual : 2; // (QualMissingType) this SA has no quality, all dependends are expected to not have quality either
uint64_t num_alns : ALN_NUM_ALNS_BITS; // number of alignments in this SA group (including primary alignment)
- uint64_t prim_aln_i : PRIM_ALN_I_BITS; // (15.0.66) the index of the primary alignment in the full SA:Z (including all group alignments)
+ uint64_t unused : 7;
} Sag; // Originally named "SA Group" representing the group of alignments in an SA:Z field, but now extended to other types of SAGs
#define ZGRP_I(g) ((SAGroup)BNUM (z_file->sag_grps, (g))) // group pointer to grp_i
@@ -497,7 +496,6 @@ typedef struct Sag { // 32 bytes (= 4 x uint64_t)
#define GRP_QNAME(g) B8(z_file->sag_qnames, (g)->qname)
#define MAX_SA_NUM_ALNS MAXB(ALN_NUM_ALNS_BITS) // our limit (number of alignments per group)
-#define MAX_PRIM_ALN_I (MAXB(PRIM_ALN_I_BITS)-1)// our limit of prim_aln_i (127 means "not set")
#define MAX_SA_POS MAXB(31) // BAM limit
#define MAX_SA_NM MAXB(ALN_NM_BITS) // our limit
#define MAX_SA_MAPQ MAXB(8) // BAM limit
@@ -696,7 +694,7 @@ extern void sam_MD_Z_verify_due_to_seq (VBlockSAMP vb, STRp(seq), PosType32 pos,
extern void sam_seg_SA_Z (VBlockSAMP vb, ZipDataLineSAMP dl, STRp(sa), unsigned add_bytes);
extern bool sam_seg_0A_mapq_cb (VBlockP vb, ContextP ctx, STRp (mapq_str), uint32_t repeat);
extern bool sam_seg_SA_get_prim_item (VBlockSAMP vb, int sa_item, pSTRp(out));
-extern void sam_piz_SA_get_prim_item (VBlockSAMP vb, int sa_item, pSTRp(out));
+extern void sam_piz_SA_get_saggy_prim_item (VBlockSAMP vb, int sa_item, pSTRp(out));
extern bool sam_seg_is_item_predicted_by_prim_SA (VBlockSAMP vb, int sa_item_i, int64_t value);
extern bool sam_zip_is_valid_SA (rom sa, uint32_t char_limit, bool is_bam);
extern bool sam_test_SA_CIGAR_abbreviated (STRp(sa));
diff --git a/src/sam_qual.c b/src/sam_qual.c
index 0662fa39..5ad3f769 100644
--- a/src/sam_qual.c
+++ b/src/sam_qual.c
@@ -100,7 +100,7 @@ COMPRESSOR_CALLBACK (sam_zip_##tag) \
{ \
ZipDataLineSAMP dl = DATA_LINE (vb_line_i); \
*line_data_len = dl->dont_compress_##tag ? 0 : MIN_(maximum_size, dl->f.len); /* note: maximum_len might be shorter than the data available if we're just sampling data in codec_assign_best_codec */ \
- if (!line_data || ! *line_data_len) return; /* no dat, or only lengths were requested */ \
+ if (!line_data || ! *line_data_len) return; /* no data, or only lengths were requested */ \
*line_data = Btxt (dl->f.index); \
if (is_rev) *is_rev = may_be_revcomped ? dl->FLAG.rev_comp : false;\
}
@@ -271,7 +271,7 @@ void sam_seg_QUAL (VBlockSAMP vb, ZipDataLineSAMP dl, STRp(qual)/*always textual
if (!vb->qual_missing) vb->has_qual = true;
- if ((flag.deep || flag.show_deep == 2) && !segconf.running)
+ if ((flag.deep || flag.show_deep == SHOW_DEEP_ONE_HASH) && !segconf.running)
sam_deep_set_QUAL_hash (vb, dl, STRa(qual));
// case: monochar
diff --git a/src/sam_sa.c b/src/sam_sa.c
index 3873b25f..52f19d39 100644
--- a/src/sam_sa.c
+++ b/src/sam_sa.c
@@ -444,7 +444,7 @@ SPECIAL_RECONSTRUCTOR_DT (sam_piz_special_SA_main)
return NO_NEW_VALUE;
}
-void sam_piz_SA_get_prim_item (VBlockSAMP vb, int sa_item, pSTRp(out))
+void sam_piz_SA_get_saggy_prim_item (VBlockSAMP vb, int sa_item, pSTRp(out))
{
STR(SA);
sam_reconstruct_from_buddy_get_textual_snip (vb, CTX (OPTION_SA_Z), BUDDY_SAGGY, pSTRa(SA));
@@ -468,7 +468,7 @@ SPECIAL_RECONSTRUCTOR_DT (sam_piz_special_COPY_PRIM)
int item_i = snip[0]-'0';
STR(item);
- sam_piz_SA_get_prim_item (vb, item_i, pSTRa(item));
+ sam_piz_SA_get_saggy_prim_item (vb, item_i, pSTRa(item));
if (reconstruct) RECONSTRUCT_str (item);
if (item_i == SA_POS || item_i == SA_MAPQ)
diff --git a/src/sam_sag_ingest.c b/src/sam_sag_ingest.c
index 012c1c2a..b7777c8a 100644
--- a/src/sam_sag_ingest.c
+++ b/src/sam_sag_ingest.c
@@ -171,7 +171,7 @@ static void sam_zip_prim_ingest_vb_compress_qnames (VBlockSAMP vb, Sag *vb_grps,
huffman_compress (SAM_QNAME, STRa(qname), BAFT8(*comp_qname_buf), &comp_len);
comp_qname_buf->len += comp_len;
- qname_hashes[vb_grp_i] = qname_calc_hash (QNAME1, COMP_NONE, STRa(qname), g->is_last, false, NULL);
+ qname_hashes[vb_grp_i] = qname_calc_hash (QNAME1, COMP_NONE, STRa(qname), g->is_last, false, CRC32, NULL);
}
}
diff --git a/src/sam_sag_load.c b/src/sam_sag_load.c
index a0f209ad..c5128c64 100644
--- a/src/sam_sag_load.c
+++ b/src/sam_sag_load.c
@@ -105,7 +105,7 @@ void sam_show_sag_one_grp (SAGroup grp_i)
iprintf ("grp_i=%u: qname(i=%"PRIu64",l=%u,hash=%u)=%.*s seq=(i=%"PRIu64",l=%u) qual=(i=%"PRIu64",comp_l=%u)[%u]=\"%s\" AS=%u strand=%c mul/fst/lst=%u,%u,%u %s=%u%s%s\n",
grp_i, (uint64_t)g->qname, g->qname_len,
- qname_calc_hash (QNAME1, COMP_NONE, (rom)GRP_QNAME(g), g->qname_len, g->is_last, false, NULL), // the hash is part of the index in ZIP, and not part of the SAGroup struct. Its provided here for its usefulness in debugging
+ (uint32_t)qname_calc_hash (QNAME1, COMP_NONE, (rom)GRP_QNAME(g), g->qname_len, g->is_last, false, CRC32, NULL), // the hash is part of the index in ZIP, and not part of the SAGroup struct. Its provided here for its usefulness in debugging
g->qname_len, GRP_QNAME(g), (uint64_t)g->seq, g->seq_len,
(uint64_t)g->qual, g->qual_comp_len, SA_QUAL_DISPLAY_LEN, sam_display_qual_from_SA_Group (g),
(int)g->as, "+-"[g->revcomp], g->multi_segs, g->is_first, g->is_last,
diff --git a/src/sam_sag_scan.c b/src/sam_sag_scan.c
index 37d374f0..cf12fdf4 100644
--- a/src/sam_sag_scan.c
+++ b/src/sam_sag_scan.c
@@ -64,7 +64,7 @@ static rom scan_index_one_line (VBlockSAMP vb, rom alignment, uint32_t remaining
ASSERT (str_get_int_range16 (flag_str, tab - flag_str, 0, SAM_MAX_FLAG, &flag.value), "%s: invalid FLAG=%.*s", VB_NAME, (int)(tab - flag_str), flag_str);
}
- uint32_t hash = qname_calc_hash (QNAME1, COMP_NONE, qname, qname_len, flag.is_last, false, NULL);
+ uint32_t hash = qname_calc_hash (QNAME1, COMP_NONE, qname, qname_len, flag.is_last, false, CRC32, NULL);
if (vb->preprocessing) {
// depn index
@@ -351,7 +351,7 @@ bool sam_might_have_saggies_in_other_VBs (VBlockSAMP vb, ZipDataLineSAMP dl, int
{
if (!vb->qname_count.len32) return true; // we didn't count qnames, so we don't have proof that there aren't any saggies in other VBs
- uint32_t qname_hash = qname_calc_hash (QNAME1, COMP_NONE, Btxt (dl->QNAME.index), dl->QNAME.len, dl->FLAG.is_last, false, NULL);
+ uint32_t qname_hash = qname_calc_hash (QNAME1, COMP_NONE, Btxt (dl->QNAME.index), dl->QNAME.len, dl->FLAG.is_last, false, CRC32, NULL);
if (IS_SAG_CC)
return true; // we don't do qname accounting for SAG_BY_CC
diff --git a/src/sam_sag_zip.c b/src/sam_sag_zip.c
index b5168a86..b83281a9 100644
--- a/src/sam_sag_zip.c
+++ b/src/sam_sag_zip.c
@@ -440,6 +440,7 @@ bool sam_seg_is_gc_line (VBlockSAMP vb, ZipDataLineSAMP dl, STRp(alignment), boo
vb->line_i--;
vb->recon_size -= alignment_len;
vb->txt_size -= alignment_len;
+ vb->num_gc_lines++;
if (comp_i == SAM_COMP_PRIM) vb->seg_found_prim_line = true;
else if (comp_i == SAM_COMP_DEPN) vb->seg_found_depn_line = true;
@@ -483,7 +484,7 @@ static Sag *sam_sa_get_first_group_by_qname_hash (VBlockSAMP vb, STRp(this_qname
uint32_t *this_qname_hash) // out
{
// search for a group with qname in z_file->sa_qname
- *this_qname_hash = qname_calc_hash (QNAME1, COMP_NONE, this_qname, this_qname_len, is_last, false, NULL);
+ *this_qname_hash = qname_calc_hash (QNAME1, COMP_NONE, this_qname, this_qname_len, is_last, false, CRC32, NULL);
// search for index entry by qname_hash
SAGroupIndexEntry *index_entry = binary_search (sam_sa_binary_search_for_qname_hash, SAGroupIndexEntry, z_file->sag_grps_index, *this_qname_hash);
diff --git a/src/sam_seg.c b/src/sam_seg.c
index 95ff0d16..14f07f30 100644
--- a/src/sam_seg.c
+++ b/src/sam_seg.c
@@ -336,6 +336,8 @@ void sam_seg_initialize (VBlockP vb_)
DID_EOL);
ctx_set_store (VB, STORE_INDEX, SAM_RNAME, SAM_RNEXT, // when reconstructing BAM, we output the word_index instead of the string
+ OPTION_XA_Z, // for containers this stores repeats - used by sam_piz_special_X1->container_peek_repeats
+ OPTION_ML_B_C, // used to predict repeats in MM:Z subcontext MNM:Z
DID_EOL);
// when reconstructing these contexts against another context (DELTA_OTHER or XOR_DIFF) the other may be before or after
@@ -463,8 +465,6 @@ void sam_seg_initialize (VBlockP vb_)
if (MP(STAR)) sam_star_seg_initialize (vb);
if (TECH(PACBIO)) sam_pacbio_seg_initialize (vb);
- ctx_set_store (VB, STORE_INDEX, OPTION_XA_Z, DID_EOL); // for containers this stores repeats - used by sam_piz_special_X1->container_peek_repeats
-
if (segconf.sam_has_BWA_XS_i) // XS:i is as defined some aligners
seg_mux_init (vb, OPTION_XS_i, SAM_SPECIAL_BWA_XS, false, XS);
@@ -1410,7 +1410,7 @@ static inline BuddyType sam_seg_mate (VBlockSAMP vb, SamFlags f, STRp (qname), u
{
if (sam_is_depn (f) || !segconf.is_paired) return BUDDY_NONE;
- uint32_t mate_hash = qname_calc_hash (QNAME1, COMP_NONE, STRa(qname), !f.is_last, true, NULL) & MAXB(CTX(SAM_QNAME)->qname_hash.prm8[0]);
+ uint32_t mate_hash = qname_calc_hash (QNAME1, COMP_NONE, STRa(qname), !f.is_last, true, CRC32, NULL) & MAXB(CTX(SAM_QNAME)->qname_hash.prm8[0]);
LineIType candidate = LINE_BY_HASH(mate_hash);
SamFlags *mate_f = &DATA_LINE(candidate)->FLAG; // invalid pointer if no mate
@@ -1445,8 +1445,17 @@ static inline BuddyType sam_seg_saggy (VBlockSAMP vb, SamFlags f, STRp (qname),
// case: we found another member of the same sag (i.e. same qname, same is_last)
if (has_same_qname (vb, STRa (qname), candidate, false) && saggy_f->is_last == f.is_last) { // the "prim" line against which we are segging cannot have hard clips
- vb->saggy_line_i = candidate;
+
vb->saggy_is_prim = !sam_is_depn (*saggy_f);
+
+ // case: two alignments with the same QNAME, FLAG.is_first, and both are primary: usually a signof defective BAM
+ if (vb->saggy_is_prim && !sam_is_depn (f)) {
+ WARN_ONCE ("WARNING: Non-compliant %s file: Found two alignments with the same QNAME, FLAG.IS_FIRST, and both are non-secondary, non-supplementary: QNAME=\"%.*s\"",
+ dt_name (vb->data_type), STRf(qname));
+ goto no_saggy;
+ }
+
+ vb->saggy_line_i = candidate;
vb->saggy_near_count++;
// replace value in hash table with current line, so future lines from the same seg have a smaller buddy delta,
@@ -1459,7 +1468,7 @@ static inline BuddyType sam_seg_saggy (VBlockSAMP vb, SamFlags f, STRp (qname),
}
// case: we didn't find another member of our sag in this VB
- else {
+ else no_saggy: {
*insert_to_hash = true;
return BUDDY_NONE;
}
@@ -1502,11 +1511,11 @@ void sam_seg_QNAME (VBlockSAMP vb, ZipDataLineSAMP dl, STRp (qname), unsigned ad
QType q = qname_sam_get_qtype (STRa(qname)); // QNAME2 if we have QNAME2 and qname matches, else QNAME1
- uint32_t qname_hash = qname_calc_hash (q, COMP_NONE, STRa(qname), dl->FLAG.is_last, true, NULL); // note: canonical=true as we use the same hash for find a mate and a saggy
+ uint32_t qname_hash = qname_calc_hash (q, COMP_NONE, STRa(qname), dl->FLAG.is_last, true, CRC32, NULL); // note: canonical=true as we use the same hash for find a mate and a saggy
uint32_t my_hash = qname_hash & MAXB(CTX(SAM_QNAME)->qname_hash.prm8[0]);
bool insert_to_hash = false;
- if (flag.deep || flag.show_deep == 2)
+ if (flag.deep || flag.show_deep == SHOW_DEEP_ONE_HASH)
sam_deep_set_QNAME_hash (vb, dl, q, STRa(qname));
BuddyType bt = sam_seg_mate (vb, dl->FLAG, STRa (qname), my_hash, &insert_to_hash) | // bitwise or
diff --git a/src/sam_seq.c b/src/sam_seq.c
index 26542e6e..0d3ff92e 100644
--- a/src/sam_seq.c
+++ b/src/sam_seq.c
@@ -310,7 +310,7 @@ void sam_zip_report_monochar_inserts (void)
return;
}
- iprintf ("\nSequencing technology: %s\nAligner: %s\n", segconf_tech_name(), segconf_sam_mapper_name());
+ iprintf ("\nSequencing technology: %s\nAligner: %s\n", tech_name(segconf.tech), segconf_sam_mapper_name());
double total_nonref = count_monochar_ins + count_non_monochar_ins + count_S;
iprint0 ("\nBreakdown of NONREF data:\n");
@@ -674,7 +674,7 @@ void sam_seg_SEQ (VBlockSAMP vb, ZipDataLineSAMP dl, STRp(textual_seq), unsigned
if (textual_seq[0] == '*')
vb->seq_missing = dl->no_seq = true;
- if (flag.deep || flag.show_deep == 2)
+ if (flag.deep || flag.show_deep == SHOW_DEEP_ONE_HASH)
sam_deep_set_SEQ_hash (vb, dl, STRa(textual_seq));
bool aligner_ok = flag.aligner_available && !segconf.running;
diff --git a/src/sections.c b/src/sections.c
index 6c4b5c25..7b27b92f 100644
--- a/src/sections.c
+++ b/src/sections.c
@@ -345,8 +345,7 @@ void sections_create_index (void)
case SEC_VB_HEADER : {
VBIType vb_i = sec->vblock_i;
- ASSERT (vb_i>=1 && vb_i < z_file->vb_sections_index.len32, "Bad vb_i=%u, z_file->vb_sections_index.len=%u", vb_i, z_file->vb_sections_index.len32);
-
+ ASSERTINRANGE (vb_i, 1, z_file->vb_sections_index.len32);
vbinx = &vb_index[vb_i];
*vbinx = (SectionsVbIndexEnt){ .vb_header_sec_i = sec_i };
break;
@@ -850,6 +849,10 @@ VBIType sections_get_num_vbs_(CompIType first_comp_i, CompIType last_comp_i)
VBIType sections_get_first_vb_i (CompIType comp_i)
{
+ // ZIP: works for current comp_i in certain cases, most importantly for a FQ component in deep
+ if (comp_i > 0 && comp_i == z_file->comp_sections_index.len32 && IS_ZIP && flag.zip_comp_i == comp_i)
+ return Bcompindex(comp_i-1)->first_vb_i + Bcompindex(comp_i-1)->num_vbs;
+
VBIType first_vb_i = Bcompindex(comp_i)->first_vb_i;
ASSERT (first_vb_i, "comp_i=%s has no VBs", comp_name (comp_i));
return first_vb_i;
@@ -858,7 +861,6 @@ VBIType sections_get_first_vb_i (CompIType comp_i)
// get number of VBs in comp_i, for which vblock_i is at most max_vb_i
VBIType sections_get_num_vbs_up_to (CompIType comp_i, VBIType max_vb_i)
{
-
VBIType num_vbs = 0;
for (VBIType vb_i = Bcompindex(comp_i)->first_vb_i;
diff --git a/src/segconf.c b/src/segconf.c
index 61234d00..98441054 100644
--- a/src/segconf.c
+++ b/src/segconf.c
@@ -191,7 +191,7 @@ static void segconf_set_vb_size (VBlockP vb, uint64_t curr_vb_size)
segconf.vb_size = MIN_(segconf.vb_size * 1.5, ABSOLUTE_MAX_VBLOCK_MEMORY);
}
- int64_t est_seggable_size = txtfile_get_seggable_size();
+ int64_t est_seggable_size = txt_file->est_seggable_size;
// for small files - reduce VB size, to take advantage of all cores (subject to a minimum VB size)
if (!flag.best && est_seggable_size && global_max_threads > 1) {
@@ -311,11 +311,11 @@ void segconf_zip_initialize (void)
.CB_con_snip_len = sizeof (segconf.CB_con_snip),
// FASTQ stuff
- .deep_qtype = QNONE,
+ .deep_qtype = QNONE, // no deep
.deep_is_last = unknown,
// FASTA stuff
- .fasta_has_contigs = true, // initialize optimistically
+ .fasta_has_contigs = true, // initialize optimistically
};
// case: 1st FASTQ in Deep. Note: we re-segconf with this file, but don't remove the SAM segconf data
@@ -531,13 +531,13 @@ rom segconf_sam_mapper_name (void)
return (segconf.sam_mapper >= 0 && segconf.sam_mapper < NUM_MAPPERS) ? mapper_name[segconf.sam_mapper] : "INVALID_MAPPER";
}
-rom segconf_tech_name (void)
+rom tech_name (SeqTech tech)
{
- rom tech_name[] = TECH_NAME;
- ASSERT0 (ARRAY_LEN(tech_name) == NUM_TECHS, "Invalid TECH_NAME array length - perhaps missing commas between strings?");
+ rom names[] = TECH_NAME;
+ ASSERT0 (ARRAY_LEN(names) == NUM_TECHS, "Invalid TECH_NAME array length - perhaps missing commas between strings?");
- switch (segconf.tech) {
- case 0 ... NUM_TECHS-1 : return tech_name[segconf.tech];
+ switch (tech) {
+ case 0 ... NUM_TECHS-1 : return names[tech];
case TECH_NONE : return "None";
case TECH_ANY : return "Any";
case TECH_CONS : return "Consensus";
diff --git a/src/segconf.h b/src/segconf.h
index 81548844..390a3062 100644
--- a/src/segconf.h
+++ b/src/segconf.h
@@ -24,8 +24,8 @@
#define MAX_SEGCONF_LINES 1000 // max lines tested in segconf (even if VB is large e.g. due to reading full MGZIP block)
-typedef packed_enum { TECH_NONE=-1, TECH_ANY=-2, TECH_CONS=-3, TECH_UNKNOWN=0, TECH_ILLUM, TECH_PACBIO, TECH_NANOPORE, TECH_454, TECH_MGI, TECH_IONTORR, TECH_HELICOS, TECH_NCBI, TECH_ULTIMA, TECH_SINGLR, TECH_ELEMENT, TECH_ONSO, NUM_TECHS } SeqTech;
-#define TECH_NAME { "Unknown_tech", "Illumina", "PacBio", "Oxford_Nanopore", "454", "MGI_Tech", "IonTorrent", "Helicos", "NCBI", "Ultima", "Singular", "Element", "Onso" }
+typedef packed_enum { TECH_NONE=-1, TECH_ANY=-2, TECH_CONS=-3, TECH_UNKNOWN=0, TECH_ILLUM, TECH_PACBIO, TECH_NANOPORE, TECH_LS454, TECH_MGI, TECH_IONTORR, TECH_HELICOS, TECH_NCBI, TECH_ULTIMA, TECH_SINGLR, TECH_ELEMENT, TECH_ONSO, TECH_CAPILLARY, TECH_SOLID, NUM_TECHS } SeqTech;
+#define TECH_NAME { "Unknown_tech", "Illumina", "PacBio", "Oxford_Nanopore", "LS454", "MGI_Tech", "IonTorrent", "Helicos", "NCBI", "Ultima", "Singular", "Element", "Onso", "Capillary", "SOLiD", }
#define TECH(x) (segconf.tech == TECH_##x)
typedef packed_enum { SQT_UNKNOWN, SQT_NUKE, SQT_AMINO, SQT_NUKE_OR_AMINO } SeqType;
@@ -102,8 +102,10 @@ typedef struct {
bool sorted_by_qname[NUM_QTYPES]; // qnames appear in the file in a sorted order
bool qname_flavor_rediscovered[NUM_QTYPES]; // true if flavor has been modified (used only during segconf.running)
QnameStr qname_line0[NUM_QTYPES]; // qname of line_i=0 (by which flavor is determined) (nul-terminated)
- SeqTech tech;
-
+ SeqTech tech; // tech by QNAME flavor (more reliable)
+ SeqTech tech_by_RG; // tech by first @RG PL field
+ char tech_by_RG_unidentified[32]; // tech by first @RG PL field - if failed to identify as a known tech
+
// SAM/BAM and FASTQ
uint32_t longest_seq_len; // length of the longest seq_len in the segconf data
DictId seq_len_dict_id; // dict_id of one of the QNAME/QNAME2/LINE3/FASTQ_AUX contexts, which is expected to hold the seq_len for this read. 0 if there is no such item.
@@ -301,7 +303,7 @@ typedef struct {
thool is_interleaved; // whether FASTQ file is identified as interleaved
char interleaved_r1; // valid if is_interleaved: character representing R1: usually '0' or '1'
char interleaved_r2; // valid if is_interleaved: character representing R2: usually '1' or '2'
- QType deep_qtype; // Deep: QNAME1 or QNAME2 if for most segconf lines for which have Deep, SAM qname matches FASTQ's QNAME1 or QNAME2, or QNONE if not
+ QType deep_qtype; // Deep ZIP/PIZ: QNAME1 or QNAME2 if for most segconf lines for which have Deep, SAM qname matches FASTQ's QNAME1 or QNAME2. QNONE means no deep, or in PIZ of Deep files up to 15.0.66, it means matching was by SEQ and QUAL only (not QNAME)
bool deep_paired_qname; // Deep: QNAME hash includes deep_is_last
thool deep_is_last; // Deep: whether this FASTQ file corresponds to is_first or is_last alignments in BAM, or unknown if !segconf.deep_paired_qname
bool deep_no_qual; // Deep: true if for most segconf lines which have Deep, qual doesn't match (eg, bc of undocumented BQSR)
@@ -312,9 +314,8 @@ typedef struct {
char deep_1st_desc[256]; // Deep: DESC line of first FASTQ read of first FASTQ file
#define NUM_INSTS 6
unsigned n_full_mch[NUM_INSTS]; // Deep: count segconf lines where hash matches with at least one SAM line - (QNAME1 or QNAME2), SEQ, QUAL
+ unsigned n_full_mch_trimmed;// Deep: the size of the subset of n_full_mch which are trimmed
unsigned n_seq_qname_mch[NUM_INSTS]; // Deep: count segconf lines where FASTQ (QNAME1 or QNAME2) hash matches with at least one SAM line - SEQ and QNAME
- unsigned n_seq_qual_mch; // Deep: count segconf lines where hash matches with at least one SAM line - SEQ and QUAL
- unsigned n_seq_mch; // Deep: count segconf lines where hash matches with at least one SAM line - SEQ
unsigned n_no_mch; // Deep: count segconf lines that don't match any SAM line (perhaps because SAM is filtered)
StrText optimized_qname; // --optimize: prefix of optimized qname
uint32_t optimized_qname_len;
@@ -349,7 +350,7 @@ extern void segconf_set_width (FieldWidth *w, int bits);
extern bool segconf_is_long_reads(void);
extern void segconf_mark_as_used (VBlockP vb, unsigned num_ctxs, ...);
extern rom segconf_sam_mapper_name (void);
-extern rom segconf_tech_name (void);
+extern rom tech_name (SeqTech tech);
extern rom segconf_deep_trimming_name (void);
extern rom VCF_QUAL_method_name (VcfQualMethod method);
extern rom VCF_INFO_method_name (VcfInfoMethod method);
diff --git a/src/stats.c b/src/stats.c
index b94b0809..fd2211f9 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -136,15 +136,6 @@ static void stats_calc_hash_occ (StatsByLine *sbl, unsigned num_stats)
bufprintf (evb, &exceptions, "%%2C%s", url_esc_non_valid_charsS (str_replace_letter (segconf.unk_ids[id_i][i], strlen(segconf.unk_ids[id_i][i]), ',', -127)).s);
}
- // if Deep with QNONE, we send the first QNAME of the SAM file and the first read name of the FASTQ
- if (flag.deep && segconf.deep_qtype == QNONE) {
- STR(master_qname);
- huffman_get_master (SAM_QNAME, pSTRa(master_qname));
- bufprintf (evb, &exceptions, "%sQNONE_REASON%%2C%%2C%%2C", need_sep++ ? "%3B" : "");
- bufprintf (evb, &exceptions, "%%2C%s", url_esc_non_valid_charsS (str_replace_letter ((char *)STRa(master_qname), ',', -127)).s);
- bufprintf (evb, &exceptions, "%%2C%s", url_esc_non_valid_charsS (str_replace_letter (segconf.deep_1st_desc, strlen(segconf.deep_1st_desc), ',', -127)).s);
- }
-
if (segconf.sam_malformed_XA[0])
bufprintf (evb, &exceptions, "%sBAD_XA%%2C%s", (need_sep++ ? "%3B" : ""), url_esc_non_valid_charsS (str_replace_letter (segconf.sam_malformed_XA, strlen(segconf.sam_malformed_XA), ',', -127)).s);
}
@@ -408,6 +399,15 @@ static void stats_output_file_metadata (void)
REPORT_QNAME;
}
+
+ bufprintf (evb, &features, "tech=%s;", tech_name (segconf.tech));
+ if (segconf.tech != segconf.tech_by_RG) {
+ if (segconf.tech_by_RG)
+ bufprintf (evb, &features, "tech_by_RG=%s;", tech_name (segconf.tech_by_RG));
+ else if (segconf.tech_by_RG_unidentified[0])
+ bufprintf (evb, &features, "@RG_PL=%s;", segconf.tech_by_RG_unidentified);
+ }
+
break;
}
@@ -456,7 +456,7 @@ static void stats_output_file_metadata (void)
case DT_FASTQ:
REPORT_VBs;
REPORT_QNAME;
- FEATURE (segconf.optimize[FASTQ_QNAME] && z_file->num_lines, "Sequencer: %s", "Sequencer=%s", segconf_tech_name());\
+ FEATURE (segconf.optimize[FASTQ_QNAME] && z_file->num_lines, "Sequencer: %s", "Sequencer=%s", tech_name(segconf.tech));\
FEATURE0 (FAF, "FASTA-as-FASTQ", "FASTA-as-FASTQ=True");
FEATURE0 (segconf.multiseq, "Multiseq", "multiseq=True");
FEATURE0 (segconf.is_interleaved, "Interleaved", "interleaved=True");
diff --git a/src/strings.c b/src/strings.c
index 2b5f7663..89724cba 100644
--- a/src/strings.c
+++ b/src/strings.c
@@ -968,7 +968,7 @@ void str_trim (qSTRp(str))
// splits a string with up to (max_items-1) separators (doesn't need to be nul-terminated) to up to or exactly max_items integers
// returns the actual number of items, or 0 is unsuccessful
-uint32_t str_split_ints_do (STRp(str), uint32_t max_items, char sep, bool exactly,
+uint32_t str_split_ints_do (STRp(str), uint32_t max_items, char sep, bool exactly, int base,
int64_t *items) // out - array of integers
{
@@ -977,7 +977,8 @@ uint32_t str_split_ints_do (STRp(str), uint32_t max_items, char sep, bool exactl
uint32_t item_i;
for (item_i=0; item_i < max_items && str < after; item_i++, str++) {
- items[item_i] = strtoll (str, (char **)&str, 10);
+ items[item_i] = (base==16) ? strtoull (str, (char **)&str, base) // allows parsing 64bit (unsigned) hexes
+ : strtoll (str, (char **)&str, base);
if (item_i < max_items-1 && *str != sep && *str != 0) {
item_i=0;
diff --git a/src/strings.h b/src/strings.h
index 2f5f724a..fd0ad3d0 100644
--- a/src/strings.h
+++ b/src/strings.h
@@ -312,12 +312,17 @@ extern uint32_t str_split_by_lines_do (STRp(str), uint32_t max_lines, rom *lines
#define str_split_by_lines(str,str_len,max_lines) \
STR_ARRAY (line, (max_lines)) = str_split_by_lines_do ((str), (str_len), max_lines, lines, line_lens)
-extern uint32_t str_split_ints_do (STRp(str), uint32_t max_items, char sep, bool exactly, int64_t *items);
+extern uint32_t str_split_ints_do (STRp(str), uint32_t max_items, char sep, bool exactly, int base, int64_t *items);
#define str_split_ints(str,str_len,max_items,sep,name,exactly) \
uint32_t n_##name##s = (max_items) ? (max_items) : str_count_char ((str), (str_len), (sep)) + 1; \
int64_t name##s[n_##name##s]; \
- n_##name##s = str_split_ints_do ((str), (str_len), n_##name##s, (sep), (exactly), name##s);
+ n_##name##s = str_split_ints_do ((str), (str_len), n_##name##s, (sep), (exactly), 10, name##s);
+
+#define str_split_hexs(str,str_len,max_items,sep,name,exactly) \
+ uint32_t n_##name##s = (max_items) ? (max_items) : str_count_char ((str), (str_len), (sep)) + 1; \
+ int64_t name##s[n_##name##s]; \
+ n_##name##s = str_split_ints_do ((str), (str_len), n_##name##s, (sep), (exactly), 16, name##s);
extern uint32_t str_split_floats_do (STRp(str), uint32_t max_items, char sep, bool exactly, char this_char_is_NAN, double *items);
#define str_split_floats(str,str_len,max_items,sep,name,exactly,this_char_is_NAN) \
@@ -384,3 +389,5 @@ static inline char base32(uint32_t n) { return (n) < 26 ? ('a' + (n)) // 97-
: '@'; } // 64. 5bits: 0
extern uint64_t p10[]; // powers of 10
+
+extern uint64_t crc64 (uint64_t crc, bytes data, uint64_t data_len); // implementation is in crc64.c
diff --git a/src/test.sh b/src/test.sh
index aa70150d..070f127c 100644
--- a/src/test.sh
+++ b/src/test.sh
@@ -518,6 +518,10 @@ batch_special_algs()
$genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.plain.fq || exit 1
$genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.bgzf.fq.gz || exit 1
$genozip -tf ${TESTDIR}/special.force-2nd-segconf-vb_size.gz.fq.gz || exit 1
+
+ # bug was: bad SAM/BAM file with two prim alignments with the same QNAME and is_first compressed successfully but failed uncompress
+ $genozip -tf ${TESTDIR}/regression.defect-2024-09-19.duplicate-QNAMEs.sam --no-gencomp || exit 1
+ $genozip -tf ${TESTDIR}/regression.defect-2024-09-19.duplicate-QNAMEs.sam --force-gencomp || exit 1
}
batch_qual_codecs()
@@ -1394,9 +1398,9 @@ batch_real_world_backcomp()
make -C $TESTDIR $1.version # generate files listed in "files"
local files=( $TESTDIR/$1/*.genozip )
-
- # remove reference file from list
- files=( ${files[@]/"$TESTDIR/$1/hs37d5.ref.genozip"} )
+
+ # remove reference files from list
+ files=( ${files[@]//*.ref.genozip/} )
# bug 1099 (only applicable to 11.0.11 with debug)
if [[ "$1" == "11.0.11" && "$is_debug" != "" ]]; then
@@ -1404,17 +1408,24 @@ batch_real_world_backcomp()
fi
# remove 0-size files - test/Makefile sets file size to 0, if old Genozip version failed on this (in compression or test)
- for f in ${files[@]}; do
+ for f in ${files[*]}; do
if [ ! -s $f ]; then
- files=( ${files[@]/"$f"} )
+ files=( ${files[@]//$f/} )
fi
done
+ if [[ "$1" == "latest" ]]; then
+ export GENOZIP_REFERENCE=$TESTDIR/$1 # deep files (available since v15) use various references
+ # local ref_option="--no-cache"
+ else
+ local ref_option="-e $TESTDIR/$1/hs37d5.ref.genozip"
+ fi
+
local i=0
for f in ${files[@]}; do
i=$(( i + 1 ))
test_header "$f - backcomp with $1 ($i/${#files[@]} batch_id=${GENOZIP_TEST})"
- $genounzip -t $f -e $TESTDIR/$1/hs37d5.ref.genozip || exit 1
+ $genounzip -t $ref_option $f || exit 1
done
test_header "backcomp $1 genounzip --bgzf=exact"
@@ -2266,17 +2277,14 @@ batch_deep() # note: use --debug-deep for detailed tracking
test_count_genocat_lines "" "--sam --head --no-header" 10
test_count_genocat_lines "" "--sam --tail=11 --no-header" 11
test_count_genocat_lines "" "--sam --header-only" 3376
-
+
+ #genozip="$genozip --show-deep"
+
test_header deep.human2-38
local T=$TESTDIR/deep.human2-38
$genozip $T.sam $T.R1.fq.gz $T.R2.fq.gz -fe $GRCh38 -o $output -3t --best --not-paired || exit 1
$genozip $T.sam $T.R1.fq.gz $T.R2.fq.gz -fe $GRCh38 -o $output -3t --no-gencomp --not-paired || exit 1
- # bismark (bisulfite), SRA2, non-matching FASTQ filenames
- test_header deep.bismark.sra2
- local T=$TESTDIR/deep.bismark.sra2
- $genozip $T.two.fq.gz $T.bam $T.one.fq.gz -fE $GRCh38 -o $output -3t || exit 1
-
# gem3 (bisulfite), multiple (>2) FASTQ
test_header deep.gem3.multi-fastq
local T=$TESTDIR/deep.gem3.multi-fastq
@@ -2285,6 +2293,11 @@ batch_deep() # note: use --debug-deep for detailed tracking
test_count_genocat_lines "" "--R 2" 20
test_count_genocat_lines "" "--R 3" 20
+ # bismark (bisulfite), SRA2, non-matching FASTQ filenames
+ test_header deep.bismark.sra2
+ local T=$TESTDIR/deep.bismark.sra2
+ $genozip $T.two.fq.gz $T.bam $T.one.fq.gz -fE $GRCh38 -o $output -3t || exit 1
+
# SAM has cropped one base at the end of every read (101 bases in FQ vs 100 in SAM)
test_header deep.crop-100
local T=$TESTDIR/deep.crop-100
diff --git a/src/txtfile.c b/src/txtfile.c
index 6e471c73..36362d49 100644
--- a/src/txtfile.c
+++ b/src/txtfile.c
@@ -1011,9 +1011,10 @@ static bool seggable_size_is_modifiable (void)
{
return !is_read_via_ext_decompressor (txt_file) && !TXT_IS_PLAIN;
}
+
// estimate the size of the txt_data of the file - i.e. the uncompressed data excluding the header -
// based on the observed or assumed compression ratio of the source compression so far
-void txtfile_set_seggable_size (void)
+static void txtfile_set_seggable_size (void)
{
uint64_t disk_size = txt_file->disk_size ? txt_file->disk_size
: flag.stdin_size ? flag.stdin_size // user-provided size
@@ -1056,13 +1057,6 @@ void txtfile_set_seggable_size (void)
txt_file->txt_data_so_far_single = txt_file->header_size; // roll back as we will re-account for this data in VB=1
}
-int64_t txtfile_get_seggable_size (void)
-{
- // note: this changes each time a VB is read by the main thread, but since its a single 64B word,
- // a compute thread reading this value will always get a value that makes sense (not using atomic to avoid unnecessary overhead)
- return txt_file->est_seggable_size;
-}
-
uint32_t txt_data_alloc_size (uint32_t vb_size)
{
return TXT_IS_MGSP ? MAX_(24, txt_file->max_mgsp_blocks_in_vb) * (txt_file->max_mgzip_isize + 1/*1 + for last gz block extra length*/) + TXTFILE_READ_VB_PADDING
diff --git a/src/txtfile.h b/src/txtfile.h
index 4500a111..5c39c5e3 100644
--- a/src/txtfile.h
+++ b/src/txtfile.h
@@ -23,8 +23,6 @@ extern void txtfile_zip_finalize_codecs (void);
extern void txtfile_read_header (bool is_first_txt);
extern uint32_t txt_data_alloc_size (uint32_t vb_size) ;
extern void txtfile_read_vblock (VBlockP vb);
-extern void txtfile_set_seggable_size (void);
-extern int64_t txtfile_get_seggable_size (void);
extern bool txtfile_is_gzip (FileP file);
extern void txtfile_discover_specific_gz (FileP file);
extern rom isal_error (int ret);
diff --git a/src/version.h b/src/version.h
index 17d9d404..00c851c4 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1,4 +1,4 @@
-#define GENOZIP_CODE_VERSION "15.0.66"
+#define GENOZIP_CODE_VERSION "15.0.67"
extern int code_version_major (void);
extern int code_version_minor (void);
diff --git a/src/zip.c b/src/zip.c
index 55d7b340..bcf58c8b 100644
--- a/src/zip.c
+++ b/src/zip.c
@@ -351,7 +351,7 @@ static void zip_update_txt_counters (VBlockP vb)
// counters of data AS IT APPEARS IN THE TXT FILE
z_file->num_lines += vb->lines.len; // lines in all bound files in this z_file
- z_file->comp_num_lines[vb->comp_i] += vb->lines.len; // set also for DVCF rejects
+ z_file->comp_num_lines[vb->comp_i] += vb->lines.len;
z_file->txt_data_so_far_single_0 += (int64_t)vb->txt_size; // length of data before any modifications
z_file->txt_data_so_far_bind_0 += (int64_t)vb->txt_size;
@@ -395,7 +395,6 @@ static void zip_free_undeeded_zctx_bufs_after_seg (void)
buf_destroy (z_file->sag_seq);
buf_destroy (z_file->sag_qual);
buf_destroy (z_file->vb_start_deep_line);
- buf_destroy (z_file->deep_ents);
buf_low_level_release_memory_back_to_kernel();
@@ -681,7 +680,7 @@ uint64_t zip_get_target_progress (void)
(flag.deep && flag.zip_comp_i <= SAM_COMP_FQ00) ||
(!flag.deep && !Z_DT(FASTQ))) {
- int64_t progress_unit = txt_file->est_num_lines ? txt_file->est_num_lines : txtfile_get_seggable_size();
+ int64_t progress_unit = txt_file->est_num_lines ? txt_file->est_num_lines : txt_file->est_seggable_size;
target_progress = progress_unit * (3 + segconf.zip_txt_modified) // read, (modify), seg, compress
+ (!flag.make_reference && !flag.seg_only && !flag.biopsy) * progress_unit; // write
@@ -800,7 +799,7 @@ void zip_one_file (rom txt_basename,
z_file->txt_file_disk_sizes_sum += z_file->txt_file_disk_sizes[flag.zip_comp_i];
// (re-)index sections after adding this txt_file
- if (!flag.make_reference)
+ if (!flag.make_reference && !flag.zip_no_z_file)
sections_create_index();
if (z_file->z_closes_after_me && !flag.seg_only) {