From c8a8470492769c69e56a348e5142734cab19664e Mon Sep 17 00:00:00 2001
From: Klaus Post
Date: Thu, 23 Jan 2025 04:21:36 -0800
Subject: [PATCH] s2: Improve small block compression speed w/o asm (#1048)

* s2: Improve small block compression speed w/o asm

Test with `-tags=noasm`

Uses a uint16 hash table, which greatly reduces the cost of zeroing
the table.

```
benchmark                                                              old MB/s   new MB/s   speedup
BenchmarkEncodeSnappyBlockParallel/12-txt1_128b/s2-snappy-16           2518.04    4430.70    1.76x
BenchmarkEncodeSnappyBlockParallel/12-txt1_128b/s2-snappy-better-16    4.59       1852.29    403.55x
BenchmarkEncodeSnappyBlockParallel/12-txt1_128b/snappy-noasm-16        3362.43    3561.17    1.06x
BenchmarkEncodeSnappyBlockParallel/13-txt1_1000b/s2-snappy-16          8053.34    9026.64    1.12x
BenchmarkEncodeSnappyBlockParallel/13-txt1_1000b/s2-snappy-better-16   34.60      3804.57    109.96x
BenchmarkEncodeSnappyBlockParallel/13-txt1_1000b/snappy-noasm-16       4901.85    4904.47    1.00x
BenchmarkEncodeSnappyBlockParallel/14-txt1_10000b/s2-snappy-16         9349.04    9544.38    1.02x
BenchmarkEncodeSnappyBlockParallel/14-txt1_10000b/s2-snappy-better-16  359.32     3795.91    10.56x
BenchmarkEncodeSnappyBlockParallel/14-txt1_10000b/snappy-noasm-16      5361.99    5535.82    1.03x
BenchmarkEncodeSnappyBlockParallel/15-txt1_20000b/s2-snappy-16         8852.18    9300.86    1.05x
BenchmarkEncodeSnappyBlockParallel/15-txt1_20000b/s2-snappy-better-16  594.96     3226.26    5.42x
BenchmarkEncodeSnappyBlockParallel/15-txt1_20000b/snappy-noasm-16      3418.03    3435.88    1.01x
```
---
 s2/encode_all.go    | 416 +++++++++++++++++++++++++++++++++++++++++++-
 s2/encode_better.go | 416 +++++++++++++++++++++++++++++++++++++++++++-
 s2/encode_go.go     |  12 ++
 s2/fuzz_test.go     |   6 +
 4 files changed, 841 insertions(+), 9 deletions(-)

diff --git a/s2/encode_all.go b/s2/encode_all.go
index c56ce52e7d..a473b64529 100644
--- a/s2/encode_all.go
+++ b/s2/encode_all.go
@@ -46,7 +46,12 @@ func encodeGo(dst, src []byte) []byte {
 		d += emitLiteral(dst[d:], src)
 		return dst[:d]
 	}
-	n := encodeBlockGo(dst[d:], src)
+	var n int
+	if len(src) < 64<<10 {
+		n = encodeBlockGo64K(dst[d:], src)
+	} else {
+		n = encodeBlockGo(dst[d:], src)
+	}
 	if n > 0 {
 		d += n
 		return dst[:d]
@@ -72,7 +77,6 @@ func encodeBlockGo(dst, src []byte) (d int) {
 
 		debug = false
 	)
-
 	var table [maxTableSize]uint32
 
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
@@ -279,13 +283,228 @@ emitRemainder:
 	return d
 }
 
-func encodeBlockSnappyGo(dst, src []byte) (d int) {
+// encodeBlockGo64K is a specialized version for compressing blocks <= 64KB
+func encodeBlockGo64K(dst, src []byte) (d int) {
 	// Initialize the hash table.
 	const (
 		tableBits    = 14
 		maxTableSize = 1 << tableBits
+
+		debug = false
 	)
 
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// Bail if we can't compress to at least this.
+	dstLimit := len(src) - len(src)>>5 - 5
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
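+	// Storing positions as uint16 is safe because this function only
+	// sees blocks of at most 64KB, and the halved table size is what
+	// makes the per-block zeroing cheap.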
+ s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>5 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + + // Bail if we exceed the maximum size. + if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + if debug { + // Validate match. + if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + if nextEmit > 0 { + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. + d += emitRepeat(dst[d:], repeat, s-base) + } else { + // First match, cannot be repeat. + d += emitCopy(dst[d:], repeat, s-base) + } + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint16(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint16(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards. + // The top bytes will be rechecked to get the full match. + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopy(dst[d:], repeat, s-base) + if debug { + // Validate match. 
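+				// Re-derive the match from (repeat, s-base) and check that it
+				// reproduces the bytes the emitted copy refers to.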
+ if s <= candidate { + panic("s <= candidate") + } + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint16(s - 2) + table[currHash] = uint16(s) + if debug && s == candidate { + panic("s == candidate") + } + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +func encodeBlockSnappyGo(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) var table [maxTableSize]uint32 // sLimit is when to stop looking for offset/length copies. The inputMargin @@ -469,6 +688,197 @@ emitRemainder: return d } +// encodeBlockSnappyGo64K is a special version of encodeBlockSnappyGo for sizes <64KB +func encodeBlockSnappyGo64K(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint16 + + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>5 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint16(s) + table[hash1] = uint16(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. + const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + // Bail if we exceed the maximum size. 
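+				// (dstLimit is len(src) - len(src)/32 - 5, so the block must
+				// shrink by roughly 3% or we return 0 and the caller emits
+				// the input as uncompressed literals.)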
+ if d+(base-nextEmit) > dstLimit { + return 0 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint16(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint16(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. + for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint16(s - 2) + table[currHash] = uint16(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + // encodeBlockGo encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. 
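Not part of the patch, but a quick way for reviewers to sanity-check the new small-block path: with a pure-Go build (`-tags=noasm`), any block at or below 64KB is routed to the `*64K` encoders, and the exported API round-trips unchanged. The sketch below only assumes the existing public `s2.EncodeSnappy` and `s2.Decode` functions:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/s2"
)

func main() {
	// ~10KB of mildly repetitive input; small enough to take the
	// uint16-table path in a noasm build.
	src := bytes.Repeat([]byte("the quick brown fox jumps over the lazy dog "), 228)

	enc := s2.EncodeSnappy(nil, src) // Snappy-compatible block output
	dec, err := s2.Decode(nil, enc)  // the S2 decoder accepts Snappy blocks
	if err != nil || !bytes.Equal(dec, src) {
		panic("roundtrip mismatch")
	}
	fmt.Printf("compressed %d -> %d bytes\n", len(src), len(enc))
}
```

The table above should be reproducible with something like `go test -tags=noasm -bench=EncodeSnappyBlockParallel` in the `s2` directory.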
diff --git a/s2/encode_better.go b/s2/encode_better.go index 544cb1e17b..90ebf89c20 100644 --- a/s2/encode_better.go +++ b/s2/encode_better.go @@ -348,12 +348,7 @@ func encodeBlockBetterSnappyGo(dst, src []byte) (d int) { nextS := 0 for { // Next src position to check - nextS = (s-nextEmit)>>7 + 1 - if nextS > maxSkip { - nextS = s + maxSkip - } else { - nextS += s - } + nextS = min(s+(s-nextEmit)>>7+1, s+maxSkip) if nextS > sLimit { goto emitRemainder @@ -483,6 +478,415 @@ emitRemainder: return d } +func encodeBlockBetterGo64K(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + // Initialize the hash tables. + // Use smaller tables for smaller blocks + const ( + // Long hash matches. + lTableBits = 16 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 13 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We initialize repeat to 0, so we never match on first attempt + repeat := 0 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = s + (s-nextEmit)>>6 + 1 + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint16(s) + sTable[hashS] = uint16(s) + + valLong := load64(src, candidateL) + valShort := load64(src, candidateS) + + // If long matches at least 8 bytes, use that. + if cv == valLong { + break + } + if cv == valShort { + candidateL = candidateS + break + } + + // Check repeat at offset checkRep. + const checkRep = 1 + // Minimum length of a repeat. Tested with various values. + // While 4-5 offers improvements in some, 6 reduces + // regressions significantly. + const wantRepeatBytes = 6 + const repeatMask = ((1 << (wantRepeatBytes * 8)) - 1) << (8 * checkRep) + if false && repeat > 0 && cv&repeatMask == load64(src, s-repeat)&repeatMask { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + wantRepeatBytes + checkRep + s += wantRepeatBytes + checkRep + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidate] { + s++ + candidate++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + // same as `add := emitCopy(dst[d:], repeat, s-base)` but skips storing offset. 
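+				// (emitRepeat only needs to encode the match length; the
+				// decoder reuses the offset of the previous copy.)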
+ d += emitRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + // Index in-between + index0 := base + 1 + index1 := s - 2 + + for index0 < index1 { + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 2 + index1 -= 2 + } + + cv = load64(src, s) + continue + } + + // Long likely matches 7, so take that. + if uint32(cv) == uint32(valLong) { + break + } + + // Check our short candidate + if uint32(cv) == uint32(valShort) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint16(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + if repeat == offset { + d += emitRepeat(dst[d:], offset, s-base) + } else { + d += emitCopy(dst[d:], offset, s-base) + repeat = offset + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + // lTable could be postponed, but very minor difference. + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + +// encodeBlockBetterSnappyGo encodes a non-empty src to a guaranteed-large-enough dst. It +// assumes that the varint-encoded length of the decompressed bytes has already +// been written. +// +// It also assumes that: +// +// len(dst) >= MaxEncodedLen(len(src)) && +// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize +func encodeBlockBetterSnappyGo64K(dst, src []byte) (d int) { + // sLimit is when to stop looking for offset/length copies. The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. 
+ sLimit := len(src) - inputMargin + if len(src) < minNonLiteralBlockSize { + return 0 + } + + // Initialize the hash tables. + // Use smaller tables for smaller blocks + const ( + // Long hash matches. + lTableBits = 15 + maxLTableSize = 1 << lTableBits + + // Short hash matches. + sTableBits = 13 + maxSTableSize = 1 << sTableBits + ) + + var lTable [maxLTableSize]uint16 + var sTable [maxSTableSize]uint16 + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 6 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + const maxSkip = 100 + + for { + candidateL := 0 + nextS := 0 + for { + // Next src position to check + nextS = min(s+(s-nextEmit)>>6+1, s+maxSkip) + + if nextS > sLimit { + goto emitRemainder + } + hashL := hash7(cv, lTableBits) + hashS := hash4(cv, sTableBits) + candidateL = int(lTable[hashL]) + candidateS := int(sTable[hashS]) + lTable[hashL] = uint16(s) + sTable[hashS] = uint16(s) + + if uint32(cv) == load32(src, candidateL) { + break + } + + // Check our short candidate + if uint32(cv) == load32(src, candidateS) { + // Try a long candidate at s+1 + hashL = hash7(cv>>8, lTableBits) + candidateL = int(lTable[hashL]) + lTable[hashL] = uint16(s + 1) + if uint32(cv>>8) == load32(src, candidateL) { + s++ + break + } + // Use our short candidate. + candidateL = candidateS + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidateL > 0 && s > nextEmit && src[candidateL-1] == src[s-1] { + candidateL-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + base := s + offset := base - candidateL + + // Extend the 4-byte match as long as possible. + s += 4 + candidateL += 4 + for s < len(src) { + if len(src)-s < 8 { + if src[s] == src[candidateL] { + s++ + candidateL++ + continue + } + break + } + if diff := load64(src, s) ^ load64(src, candidateL); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidateL += 8 + } + + d += emitLiteral(dst[d:], src[nextEmit:base]) + d += emitCopyNoRepeat(dst[d:], offset, s-base) + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + + // Index short & long + index0 := base + 1 + index1 := s - 2 + + cv0 := load64(src, index0) + cv1 := load64(src, index1) + lTable[hash7(cv0, lTableBits)] = uint16(index0) + sTable[hash4(cv0>>8, sTableBits)] = uint16(index0 + 1) + + lTable[hash7(cv1, lTableBits)] = uint16(index1) + sTable[hash4(cv1>>8, sTableBits)] = uint16(index1 + 1) + index0 += 1 + index1 -= 1 + cv = load64(src, s) + + // Index large values sparsely in between. + // We do two starting from different offsets for speed. + index2 := (index0 + index1 + 1) >> 1 + for index2 < index1 { + lTable[hash7(load64(src, index0), lTableBits)] = uint16(index0) + lTable[hash7(load64(src, index2), lTableBits)] = uint16(index2) + index0 += 2 + index2 += 2 + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} + // encodeBlockBetterDict encodes a non-empty src to a guaranteed-large-enough dst. 
It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encode_go.go b/s2/encode_go.go index dd1c973ca5..e25b78445d 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -21,6 +21,9 @@ func encodeBlock(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { return 0 } + if len(src) <= 64<<10 { + return encodeBlockGo64K(dst, src) + } return encodeBlockGo(dst, src) } @@ -32,6 +35,9 @@ func encodeBlock(dst, src []byte) (d int) { // // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetter(dst, src []byte) (d int) { + if len(src) <= 64<<10 { + return encodeBlockBetterGo64K(dst, src) + } return encodeBlockBetterGo(dst, src) } @@ -43,6 +49,9 @@ func encodeBlockBetter(dst, src []byte) (d int) { // // len(dst) >= MaxEncodedLen(len(src)) func encodeBlockBetterSnappy(dst, src []byte) (d int) { + if len(src) <= 64<<10 { + return encodeBlockBetterSnappyGo64K(dst, src) + } return encodeBlockBetterSnappyGo(dst, src) } @@ -57,6 +66,9 @@ func encodeBlockSnappy(dst, src []byte) (d int) { if len(src) < minNonLiteralBlockSize { return 0 } + if len(src) <= 64<<10 { + return encodeBlockSnappyGo64K(dst, src) + } return encodeBlockSnappyGo(dst, src) } diff --git a/s2/fuzz_test.go b/s2/fuzz_test.go index 02aa7f4ec7..e25af84f83 100644 --- a/s2/fuzz_test.go +++ b/s2/fuzz_test.go @@ -143,6 +143,12 @@ func FuzzEncodingBlocks(f *testing.F) { encodeBlockBetterGo(encoded, data) encodeBlockSnappyGo(encoded, data) encodeBlockBetterSnappyGo(encoded, data) + if len(data) <= 64<<10 { + encodeBlockGo64K(encoded, data) + encodeBlockSnappyGo64K(encoded, data) + encodeBlockBetterGo64K(encoded, data) + encodeBlockBetterSnappyGo64K(encoded, data) + } dst := encodeGo(encoded, data) if dst == nil { return