From c8a8470492769c69e56a348e5142734cab19664e Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Thu, 23 Jan 2025 04:21:36 -0800
Subject: [PATCH] s2: Improve small block compression speed w/o asm (#1048)

* s2: Improve small block compression speed w/o asm

Test with `-tags=noasm`

Uses a uint16 hash table, which greatly reduces the cost of zeroing
the table.

```
benchmark                                                              old MB/s   new MB/s   speedup
BenchmarkEncodeSnappyBlockParallel/12-txt1_128b/s2-snappy-16           2518.04    4430.70    1.76x
BenchmarkEncodeSnappyBlockParallel/12-txt1_128b/s2-snappy-better-16    4.59       1852.29    403.55x
BenchmarkEncodeSnappyBlockParallel/12-txt1_128b/snappy-noasm-16        3362.43    3561.17    1.06x
BenchmarkEncodeSnappyBlockParallel/13-txt1_1000b/s2-snappy-16          8053.34    9026.64    1.12x
BenchmarkEncodeSnappyBlockParallel/13-txt1_1000b/s2-snappy-better-16   34.60      3804.57    109.96x
BenchmarkEncodeSnappyBlockParallel/13-txt1_1000b/snappy-noasm-16       4901.85    4904.47    1.00x
BenchmarkEncodeSnappyBlockParallel/14-txt1_10000b/s2-snappy-16         9349.04    9544.38    1.02x
BenchmarkEncodeSnappyBlockParallel/14-txt1_10000b/s2-snappy-better-16  359.32     3795.91    10.56x
BenchmarkEncodeSnappyBlockParallel/14-txt1_10000b/snappy-noasm-16      5361.99    5535.82    1.03x
BenchmarkEncodeSnappyBlockParallel/15-txt1_20000b/s2-snappy-16         8852.18    9300.86    1.05x
BenchmarkEncodeSnappyBlockParallel/15-txt1_20000b/s2-snappy-better-16  594.96     3226.26    5.42x
BenchmarkEncodeSnappyBlockParallel/15-txt1_20000b/snappy-noasm-16      3418.03    3435.88    1.01x
```
---
 s2/encode_all.go    | 416 +++++++++++++++++++++++++++++++++++++++++++-
 s2/encode_better.go | 416 +++++++++++++++++++++++++++++++++++++++++++-
 s2/encode_go.go     |  12 ++
 s2/fuzz_test.go     |   6 +
 4 files changed, 841 insertions(+), 9 deletions(-)

diff --git a/s2/encode_all.go b/s2/encode_all.go
index c56ce52e7d..a473b64529 100644
--- a/s2/encode_all.go
+++ b/s2/encode_all.go
@@ -46,7 +46,12 @@ func encodeGo(dst, src []byte) []byte {
 		d += emitLiteral(dst[d:], src)
 		return dst[:d]
 	}
-	n := encodeBlockGo(dst[d:], src)
+	var n int
+	if len(src) < 64<<10 {
+		n = encodeBlockGo64K(dst[d:], src)
+	} else {
+		n = encodeBlockGo(dst[d:], src)
+	}
 	if n > 0 {
 		d += n
 		return dst[:d]
@@ -72,7 +77,6 @@ func encodeBlockGo(dst, src []byte) (d int) {
 
 		debug = false
 	)
-
 	var table [maxTableSize]uint32
 
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
@@ -279,13 +283,228 @@ emitRemainder:
 	return d
 }
 
-func encodeBlockSnappyGo(dst, src []byte) (d int) {
+// encodeBlockGo64K is a specialized version for compressing blocks <= 64KB
+func encodeBlockGo64K(dst, src []byte) (d int) {
 	// Initialize the hash table.
 	const (
 		tableBits    = 14
 		maxTableSize = 1 << tableBits
+
+		debug = false
 	)
 
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
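+	// Storing positions as uint16 is safe because this function only
+	// sees blocks of at most 64KB, and the halved table size is what
+	// makes the per-block zeroing cheap.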
+ s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>5 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint16(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint16(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. 
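+				// Re-derive the match from (repeat, s-base) and check that it
+				// reproduces the bytes the emitted copy refers to.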
+ if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint16(s - 2) + table[currHash] = uint16(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeBlockSnappyGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) var table [maxTableSize]uint32 // sLimit is when to stop looking for offset/length copies. The inputMargin @@ -469,6 +688,197 @@ emitRemainder: return d } +// encodeBlockSnappyGo64K is a special version of encodeBlockSnappyGo for sizes <64KB +func encodeBlockSnappyGo64K(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>5 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
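+				// (dstLimit is len(src) - len(src)/32 - 5, so the block must
+				// shrink by roughly 3% or we return 0 and the caller emits
+				// the input as uncompressed literals.)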
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint16(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint16(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint16(s - 2) + table[currHash] = uint16(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + // encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. 
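Not part of the patch, but a quick way for reviewers to sanity-check the new small-block path: with a pure-Go build (`-tags=noasm`), any block at or below 64KB is routed to the `*64K` encoders, and the exported API round-trips unchanged. The sketch below only assumes the existing public `s2.EncodeSnappy` and `s2.Decode` functions:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/s2"
)

func main() {
	// ~10KB of mildly repetitive input; small enough to take the
	// uint16-table path in a noasm build.
	src := bytes.Repeat([]byte("the quick brown fox jumps over the lazy dog "), 228)

	enc := s2.EncodeSnappy(nil, src) // Snappy-compatible block output
	dec, err := s2.Decode(nil, enc)  // the S2 decoder accepts Snappy blocks
	if err != nil || !bytes.Equal(dec, src) {
		panic("roundtrip mismatch")
	}
	fmt.Printf("compressed %d -> %d bytes\n", len(src), len(enc))
}
```

The table above should be reproducible with something like `go test -tags=noasm -bench=EncodeSnappyBlockParallel` in the `s2` directory.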
diff --git a/s2/encode_better.go b/s2/encode_better.go index 544cb1e17b..90ebf89c20 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -348,12 +348,7 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { nextS := 0 for { // Next src position to check - nextS = (s-nextEmit)>>7 + 1 - if nextS > maxSkip { - nextS = s + maxSkip - } else { - nextS += s - } + nextS = min(s+(s-nextEmit)>>7+1, s+maxSkip) if nextS > sLimit { goto emitRemainder @@ -483,6 +478,415 @@ emitRemainder: return d } +func encodeBlockBetterGo64K(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + // Initialize the hash tables. + // Use smaller tables for smaller blocks + const ( + // Long hash matches. + lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 13 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>6 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint16(s) + sTable[hashS] = uint16(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. 
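+				// (emitRepeat only needs to encode the match length; the
+				// decoder reuses the offset of the previous copy.)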
+ d += emitRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint16(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + // lTable could be postponed, but very minor difference. + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappyGo64K(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. 
+ sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + // Use smaller tables for smaller blocks + const ( + // Long hash matches. + lTableBits = 15 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 13 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + const maxSkip = 100 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = min(s+(s-nextEmit)>>6+1, s+maxSkip) + + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint16(s) + sTable[hashS] = uint16(s) + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint16(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, s-base) + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + // encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. 
It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encode_go.go b/s2/encode_go.go index dd1c973ca5..e25b78445d 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -21,6 +21,9 @@ func encodeBlock(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { return 0 } + if len(src) <= 64<<10 { + return encodeBlockGo64K(dst, src) + } return encodeBlockGo(dst, src) } @@ -32,6 +35,9 @@ func encodeBlock(dst, src []byte) (d int) { // // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetter(dst, src []byte) (d int) { + if len(src) <= 64<<10 { + return encodeBlockBetterGo64K(dst, src) + } return encodeBlockBetterGo(dst, src) } @@ -43,6 +49,9 @@ func encodeBlockBetter(dst, src []byte) (d int) { // // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetterSnappy(dst, src []byte) (d int) { + if len(src) <= 64<<10 { + return encodeBlockBetterSnappyGo64K(dst, src) + } return encodeBlockBetterSnappyGo(dst, src) } @@ -57,6 +66,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { return 0 } + if len(src) <= 64<<10 { + return encodeBlockSnappyGo64K(dst, src) + } return encodeBlockSnappyGo(dst, src) } diff --git a/s2/fuzz_test.go b/s2/fuzz_test.go index 02aa7f4ec7..e25af84f83 100644 --- a/s2/fuzz_test.go +++ b/s2/fuzz_test.go @@ -143,6 +143,12 @@ func FuzzEncodingBlocks(f *testing.F) { encodeBlockBetterGo(encoded, data) encodeBlockSnappyGo(encoded, data) encodeBlockBetterSnappyGo(encoded, data) + if len(data) <= 64<<10 { + encodeBlockGo64K(encoded, data) + encodeBlockSnappyGo64K(encoded, data) + encodeBlockBetterGo64K(encoded, data) + encodeBlockBetterSnappyGo64K(encoded, data) + } dst := encodeGo(encoded, data) if dst == nil { return