From 5529af4f6f2bf44747af728fc62f9f08585c6c5e Mon Sep 17 00:00:00 2001
From: Daniel Lemire <lemire@gmail.com>
Date: Sun, 21 Jan 2024 11:51:24 -0500
Subject: [PATCH 1/4] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3c79ae0..8ae0c2e 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ size_t compsize = streamvbyte_delta_encode(datain, N, compressedbuffer,0); // en
 streamvbyte_delta_decode(compressedbuffer, recovdata, N,0); // decoding (fast)
 ```
 You have to know how many integers were coded when you decompress. You can store this
-information along with the compressed stream. The
+information along with the compressed stream.
 
 During decoding, the library may read up to `STREAMVBYTE_PADDING` extra bytes
 from the input buffer (these bytes are read but never used).

From 02e2c46c50f79c6991424c2a62d96d2cc7593af5 Mon Sep 17 00:00:00 2001
From: Mo Yi <moyishi213@gmail.com>
Date: Sun, 3 Mar 2024 02:08:26 +0800
Subject: [PATCH 2/4] Fix misaligned address asan (#70)

---
 src/streamvbyte_x64_encode.c |  2 +-
 tests/unit.c                 | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/streamvbyte_x64_encode.c b/src/streamvbyte_x64_encode.c
index c06bfc8..f9ef01c 100644
--- a/src/streamvbyte_x64_encode.c
+++ b/src/streamvbyte_x64_encode.c
@@ -88,7 +88,7 @@ static size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint
 		uint32_t dw = in[i];
 		uint32_t symbol = (dw > 0x000000FF) + (dw > 0x0000FFFF) + (dw > 0x00FFFFFF);
 		key |= symbol << (i + i);
-		*((uint32_t*)dataPtr) = dw;
+		memcpy(dataPtr, &dw, 4);
 		dataPtr += 1 + symbol;
 	}
 	memcpy(keyPtr, &key, ((count & 7) + 3) >> 2);
diff --git a/tests/unit.c b/tests/unit.c
index beaacd5..e3d23a4 100644
--- a/tests/unit.c
+++ b/tests/unit.c
@@ -2759,8 +2759,31 @@ static bool issue42(void) {
   return true;
 }
 
+
+static bool issue69(void) {
+  uint32_t N = 22;
+  uint32_t test_misaligned[22] = {
+      431, 292, 979, 994, 761, 879, 672, 690, 296,
+      931, 379, 98, 132, 105, 116, 841, 387, 831,
+      335, 333, 557, 915
+  };
+  uint32_t* datain = malloc(N * sizeof(uint32_t));
+  uint8_t* compressedbuffer = malloc(streamvbyte_max_compressedbytes(N));
+  uint32_t* recovdata = malloc(N * sizeof(uint32_t));
+  for (uint32_t k = 0; k < N; ++k) datain[k] = test_misaligned[k];
+  size_t compsize = streamvbyte_encode(datain, N, compressedbuffer);  // encoding
+  // here the result is stored in compressedbuffer using compsize bytes
+  size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata, N);  // decoding (fast)
+  if (compsize != compsize2) return false;
+  free(datain);
+  free(compressedbuffer);
+  free(recovdata);
+  return true;
+}
+
 int main(void) {
   if (!issue42()) { printf("tests failed.\n"); return EXIT_FAILURE; }
+  if (!issue69()) { printf("tests failed.\n"); return EXIT_FAILURE; }
   if (zigzagtests() == -1) { printf("tests failed.\n"); return EXIT_FAILURE; }
   if (basictests() == -1) { printf("tests failed.\n"); return EXIT_FAILURE; }
   if (aqrittests() == -1) { printf("tests failed.\n"); return EXIT_FAILURE; }

From dd95854ea243acd0554621fd12e4b60c9c8ba9eb Mon Sep 17 00:00:00 2001
From: Mo Yi <moyi@stdb.io>
Date: Mon, 25 Mar 2024 20:01:48 +0800
Subject: [PATCH 3/4] Fix misaligned address asan (#71)

---
 src/streamvbytedelta_x64_encode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/streamvbytedelta_x64_encode.c b/src/streamvbytedelta_x64_encode.c
index 543bbc5..6daa253 100644
--- a/src/streamvbytedelta_x64_encode.c
+++ b/src/streamvbytedelta_x64_encode.c
@@ -66,7 +66,7 @@ static size_t streamvbyte_encode_SSE41_d1_init (const uint32_t* in, uint32_t cou
 		uint32_t dw = in[i] - prev; prev = in[i];
 		uint32_t symbol = (dw > 0x000000FF) + (dw > 0x0000FFFF) + (dw > 0x00FFFFFF);
 		key |= symbol << (i + i);
-		*((uint32_t*)dataPtr) = dw;
+		memcpy(dataPtr, &dw, 4);
 		dataPtr += 1 + symbol;
 	}
 	memcpy(keyPtr, &key, ((count & 7) + 3) >> 2);

From 0934b5115505733d0487ff31f0d5669bd1d052bc Mon Sep 17 00:00:00 2001
From: blawrence-ont <132677418+blawrence-ont@users.noreply.github.com>
Date: Mon, 9 Dec 2024 14:05:59 +0000
Subject: [PATCH 4/4] Add validation functions and fixup sanitizer build (#73)

* Make sure that ASAN is linked in

When UBSan is enabled the equivalent flag is already added to the link
options and the UBSan runtime gets linked in, but when enabling ASAN
the build fails because it can't find a lot of symbols. The fix is to
link it if you use it.

* Remove unnecessary compile definition

ASAN_OPTIONS is a runtime environment variable and doesn't affect
compilation (unless exported as the symbol __asan_default_options,
which it isn't here).

Also note that enabling detect_leaks breaks macOS since AppleClang
doesn't support this (though the LLVM from brew does).

* Fix leaks spotted by the sanitizers

* Fix Makefile for unit

unit makes use of streamvbyte_isadetection.h which is in an internal
header not in the public API.

* Typo copypasta fix

* Add stream validation functions

These are useful when loading untrusted data since it's typically
trivial to change the uncompressed size that's stored alongside the
encoded data. Doing so can cause the decode functions to read past the
end of the input data, which at best generates bad output and at worst
causes crashes if it happens to read into an unmapped page.

* Add missing newlines from print statements

These presumably always passed so were never printed.

Also include the iteration for easier debugging.

* Silence MSVC warnings about possible loss of data

The encode functions return size_ts so we should match that.

* Change data initialisation in unit tests

All the data would be about the same value [gap-1, gap+6] and wouldn't
border on a byte width meaning that all of the keys would be the same
and hence wouldn't catch bugs when reading the wrong keys. Instead use
the gap to space the elements apart.

* Improve performance of streamvbyte_validate_stream

AppleClang 16 doesn't appear to be clever enough to transform the loop
into a branchless one, so help it out by adding a loop that it knows
how to unroll.

Also do the same for streamvbyte_validate_stream_0124 though I haven't
tested the performance of that.

* Add NEON implementation of validate

Nothing fancy, just processing 4 keys per vector. In both micro- and
macro-benchmarks this performs at basically the same speed as the loop
added in the previous commit (tested with clang), both of which are
significantly faster than the original version.

* Document the validate functions in the README
---
 CMakeLists.txt                |  6 ++++-
 Makefile                      |  4 +--
 README.md                     | 13 ++++++++++
 include/streamvbyte.h         | 14 +++++++++-
 include/streamvbytedelta.h    |  2 +-
 src/streamvbyte_0124_decode.c | 44 ++++++++++++++++++++++++++++++++
 src/streamvbyte_arm_decode.c  | 28 ++++++++++++++++++++
 src/streamvbyte_decode.c      | 48 +++++++++++++++++++++++++++++++++++
 src/streamvbyte_encode.c      |  2 +-
 tests/unit.c                  | 39 ++++++++++++++++++++++------
 10 files changed, 186 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab600a0..72b697c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,7 +86,11 @@ if(STREAMVBYTE_SANITIZE)
         -fno-omit-frame-pointer
         -fno-sanitize-recover=all
     )
-    add_compile_definitions(ASAN_OPTIONS=detect_leaks=1)
+    add_link_options(
+        -fsanitize=address
+        -fno-omit-frame-pointer
+        -fno-sanitize-recover=all
+    )
 endif()
 
 if(MSVC)
diff --git a/Makefile b/Makefile
index 70b6383..2f312c9 100644
--- a/Makefile
+++ b/Makefile
@@ -83,10 +83,10 @@ writeseq: ./tests/writeseq.c    $(HEADERS) $(OBJECTS)
 	$(CC) $(CFLAGS) -o writeseq ./tests/writeseq.c -Iinclude  $(OBJECTS)
 
 unit: ./tests/unit.c    $(HEADERS) $(OBJECTS)
-	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude  $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit ./tests/unit.c -Iinclude -Isrc  $(OBJECTS)
 
 dynunit: ./tests/unit.c    $(HEADERS) $(LIBNAME) $(LNLIBNAME)
-	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude  -L. -lstreamvbyte
+	$(CC) $(CFLAGS) -o dynunit ./tests/unit.c -Iinclude -Isrc  -L. -lstreamvbyte
 
 clean:
 	rm -f unit *.o $(LIBNAME) $(LNLIBNAME)  example shuffle_tables perf writeseq dynunit
diff --git a/README.md b/README.md
index 8ae0c2e..b902096 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,19 @@ information along with the compressed stream.
 During decoding, the library may read up to `STREAMVBYTE_PADDING` extra bytes
 from the input buffer (these bytes are read but never used).
 
+To verify that the expected size of a stream is correct you may validate it before
+decoding:
+```C
+// compressedbuffer, compsize, recovdata, N are as above
+if (streamvbyte_validate_stream(compressedbuffer, compsize, N)) {
+    // the stream is safe to decode
+    streamvbyte_decode(compressedbuffer, recovdata, N);
+} else {
+    // there's a mismatch between the expected size of the data (N) and the contents of
+    // the stream, so performing a decode is unsafe since the behaviour is undefined
+}
+```
+
 
 
 
diff --git a/include/streamvbyte.h b/include/streamvbyte.h
index e88ab08..35d17aa 100644
--- a/include/streamvbyte.h
+++ b/include/streamvbyte.h
@@ -1,6 +1,7 @@
 #ifndef INCLUDE_STREAMVBYTE_H_
 #define INCLUDE_STREAMVBYTE_H_
 
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -10,7 +11,7 @@ extern "C" {
 
 #define STREAMVBYTE_PADDING 16
 
-// Encode an array of a given length read from in to bout in varint format.
+// Encode an array of a given length read from in to out in varint format.
 // Returns the number of bytes written.
 // The number of values being stored (length) is not encoded in the compressed stream,
 // the caller is responsible for keeping a record of this length.
@@ -66,6 +67,17 @@ size_t streamvbyte_decode(const uint8_t* in, uint32_t* out, uint32_t length);
 // streamvbyte_encode_0124.
 size_t streamvbyte_decode_0124(const uint8_t* in, uint32_t* out, uint32_t length);
 
+// Validate an encoded stream.
+// This can be used to validate that data received from an untrusted source (disk, network,
+// etc...) has a valid length stored alongside it.
+// "inLength" is the size of the encoded data "in", and "outLength" is the expected number
+// of integers that were compressed.
+bool streamvbyte_validate_stream(const uint8_t* in, size_t inLength, uint32_t outLength);
+
+// Same as streamvbyte_validate_stream but is meant to be used for streams encoded with
+// streamvbyte_encode_0124.
+bool streamvbyte_validate_stream_0124(const uint8_t* in, size_t inLength, uint32_t outLength);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/streamvbytedelta.h b/include/streamvbytedelta.h
index 9ee8040..55ef010 100644
--- a/include/streamvbytedelta.h
+++ b/include/streamvbytedelta.h
@@ -8,7 +8,7 @@
 extern "C" {
 #endif
 
-// Encode an array of a given length read from in to bout in StreamVByte format.
+// Encode an array of a given length read from in to out in StreamVByte format.
 // Returns the number of bytes written.
 // The number of values being stored (length) is not encoded in the compressed stream,
 // the caller is responsible for keeping a record of this length. The pointer "in" should
diff --git a/src/streamvbyte_0124_decode.c b/src/streamvbyte_0124_decode.c
index 524cfe7..a6bb6f3 100644
--- a/src/streamvbyte_0124_decode.c
+++ b/src/streamvbyte_0124_decode.c
@@ -181,5 +181,49 @@ size_t streamvbyte_decode_0124(const uint8_t *in, uint32_t *out, uint32_t count)
 #endif
 
   return (size_t)(svb_decode_scalar(out, keyPtr, dataPtr, count) - in);
+}
+
+bool streamvbyte_validate_stream_0124(const uint8_t *in, size_t inCount,
+                                      uint32_t outCount) {
+  if (inCount == 0 || outCount == 0)
+    return inCount == outCount;
+
+  // 2-bits per key (rounded up)
+  // Note that we don't add to outCount in case it overflows
+  uint32_t keyLen = outCount / 4;
+  if (outCount & 3)
+    keyLen++;
+
+  // Check that there's enough space for the keys
+  if (keyLen > inCount)
+    return false;
+
+  // Accumulate the key sizes in a wider type to avoid overflow
+  const uint8_t *keyPtr = in;
+  uint64_t encodedSize = 0;
+
+  // Give the compiler a hint that it can avoid branches in the inner loop
+  for (uint32_t c = 0; c < outCount / 4; c++) {
+    uint32_t key = *keyPtr++;
+    for (uint8_t shift = 0; shift < 8; shift += 2) {
+      const uint8_t code = (key >> shift) & 0x3;
+      encodedSize += (1 << code) >> 1;
+    }
+  }
+  outCount &= 3;
+
+  // Process the remainder one at a time
+  uint8_t shift = 0;
+  uint32_t key = *keyPtr++;
+  for (uint32_t c = 0; c < outCount; c++) {
+    if (shift == 8) {
+      shift = 0;
+      key = *keyPtr++;
+    }
+    const uint8_t code = (key >> shift) & 0x3;
+    encodedSize += (1 << code) >> 1;
+    shift += 2;
+  }
 
+  return encodedSize == inCount - keyLen;
 }
diff --git a/src/streamvbyte_arm_decode.c b/src/streamvbyte_arm_decode.c
index 02caaba..7480255 100644
--- a/src/streamvbyte_arm_decode.c
+++ b/src/streamvbyte_arm_decode.c
@@ -51,4 +51,32 @@ static const uint8_t *svb_decode_vector(uint32_t *out, const uint8_t *keyPtr, co
 
   return dataPtr;
 }
+
+static uint64_t svb_validate_vector(const uint8_t **keyPtrPtr,
+                                    uint32_t *countPtr) {
+  // Reduce the count by how many we'll process
+  const uint32_t count = *countPtr & ~7U;
+  const uint8_t *keyPtr = *keyPtrPtr;
+  *countPtr &= 7;
+  *keyPtrPtr += count / 4;
+
+  // Deal with each of the 4 keys in a separate lane
+  const int32x4_t shifts = {0, -2, -4, -6};
+  const uint32x4_t mask = vdupq_n_u32(3);
+  uint32x4_t acc0 = vdupq_n_u32(0);
+  uint32x4_t acc1 = vdupq_n_u32(0);
+
+  // Unrolling more than twice doesn't seem to improve performance
+  for (uint32_t c = 0; c < count; c += 8) {
+    uint32x4_t shifted0 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
+    acc0 = vaddq_u32(acc0, vandq_u32(shifted0, mask));
+    uint32x4_t shifted1 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
+    acc1 = vaddq_u32(acc1, vandq_u32(shifted1, mask));
+  }
+
+  // Accumulate the sums and add the +1 for each element (count)
+  uint64x2_t sum0 = vpaddlq_u32(acc0);
+  uint64x2_t sum1 = vpaddlq_u32(acc1);
+  return sum0[0] + sum0[1] + sum1[0] + sum1[1] + count;
+}
 #endif
diff --git a/src/streamvbyte_decode.c b/src/streamvbyte_decode.c
index 4049a2e..b99c7e3 100644
--- a/src/streamvbyte_decode.c
+++ b/src/streamvbyte_decode.c
@@ -84,5 +84,53 @@ size_t streamvbyte_decode(const uint8_t *in, uint32_t *out, uint32_t count) {
 #endif
 
   return (size_t)(svb_decode_scalar(out, keyPtr, dataPtr, count) - in);
+}
+
+bool streamvbyte_validate_stream(const uint8_t *in, size_t inCount,
+                                 uint32_t outCount) {
+  if (inCount == 0 || outCount == 0)
+    return inCount == outCount;
+
+  // 2-bits per key (rounded up)
+  // Note that we don't add to outCount in case it overflows
+  uint32_t keyLen = outCount / 4;
+  if (outCount & 3)
+    keyLen++;
+
+  // Check that there's enough space for the keys
+  if (keyLen > inCount)
+    return false;
+
+  // Accumulate the key sizes in a wider type to avoid overflow
+  const uint8_t *keyPtr = in;
+  uint64_t encodedSize = 0;
+
+#if defined(__ARM_NEON__)
+  encodedSize = svb_validate_vector(&keyPtr, &outCount);
+#endif
+
+  // Give the compiler a hint that it can avoid branches in the inner loop
+  for (uint32_t c = 0; c < outCount / 4; c++) {
+    uint32_t key = *keyPtr++;
+    for (uint8_t shift = 0; shift < 8; shift += 2) {
+      const uint8_t code = (key >> shift) & 0x3;
+      encodedSize += code + 1;
+    }
+  }
+  outCount &= 3;
+
+  // Process the remainder one at a time
+  uint8_t shift = 0;
+  uint32_t key = *keyPtr++;
+  for (uint32_t c = 0; c < outCount; c++) {
+    if (shift == 8) {
+      shift = 0;
+      key = *keyPtr++;
+    }
+    const uint8_t code = (key >> shift) & 0x3;
+    encodedSize += code + 1;
+    shift += 2;
+  }
 
+  return encodedSize == inCount - keyLen;
 }
diff --git a/src/streamvbyte_encode.c b/src/streamvbyte_encode.c
index ef9667c..f1602bd 100644
--- a/src/streamvbyte_encode.c
+++ b/src/streamvbyte_encode.c
@@ -108,7 +108,7 @@ size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
 }
 
 
-// Encode an array of a given length read from in to bout in streamvbyte format.
+// Encode an array of a given length read from in to out in streamvbyte format.
 // Returns the number of bytes written.
 size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
 #ifdef STREAMVBYTE_X64
diff --git a/tests/unit.c b/tests/unit.c
index e3d23a4..30e5f2e 100644
--- a/tests/unit.c
+++ b/tests/unit.c
@@ -47,6 +47,8 @@ static int zigzagtests(void) {
       }
     }
 
+    free(deltadataout);
+    free(deltadataback);
     free(databack);
     free(dataout);
     free(datain);
@@ -96,11 +98,18 @@ static int basictests(void) {
 
   for (uint32_t length = 0; length <= N;) {
     for (uint32_t gap = 1; gap <= 387420489; gap *= 3) {
-      for (uint32_t k = 0; k < length; ++k)
-        datain[k] = gap - 1 + ((uint32_t)rand() % 8); // sometimes start with zero
+      datain[0] = (uint32_t)rand() % 8; // sometimes start with zero
+      for (uint32_t k = 1; k < length; ++k)
+        datain[k] = datain[k - 1] + gap - 1 + (uint32_t)rand() % 8;
 
       // Default encoding: 1,2,3,4 bytes per value
       size_t compsize = streamvbyte_encode(datain, length, compressedbuffer);
+      if (!streamvbyte_validate_stream(compressedbuffer, compsize, length)) {
+        printf("[streamvbyte_validate_stream] code is buggy length=%d gap=%d: compsize=%d\n",
+               (int)length, (int)gap, (int)compsize);
+        return -1;
+      }
+
       size_t usedbytes = streamvbyte_decode(compressedbuffer, recovdata, length);
       if (compsize != usedbytes) {
         printf("[streamvbyte_decode] code is buggy length=%d gap=%d: compsize=%d != "
@@ -118,6 +127,12 @@ static int basictests(void) {
 
       // Alternative encoding: 0,1,2,4 bytes per value
       compsize = streamvbyte_encode_0124(datain, length, compressedbuffer);
+      if (!streamvbyte_validate_stream_0124(compressedbuffer, compsize, length)) {
+        printf("[streamvbyte_validate_stream_0124] code is buggy length=%d gap=%d: compsize=%d\n",
+               (int)length, (int)gap, (int)compsize);
+        return -1;
+      }
+
       usedbytes = streamvbyte_decode_0124(compressedbuffer, recovdata, length);
       if (compsize != usedbytes) {
         printf("[streamvbyte_decode_0124] code is buggy length=%d gap=%d: compsize=%d != "
@@ -197,29 +212,37 @@ static int aqrittests(void) {
     const int length = 4;
 
     size_t compsize = streamvbyte_encode((uint32_t *)in, length, compressedbuffer);
-    size_t usedbytes = streamvbyte_decode(compressedbuffer, (uint32_t *)recovdata, length);
+    if (!streamvbyte_validate_stream(compressedbuffer, compsize, length)) {
+      printf("[streamvbyte_validate_stream] code is buggy i=%i\n", i);
+      return -1;
+    }
 
+    size_t usedbytes = streamvbyte_decode(compressedbuffer, (uint32_t *)recovdata, length);
     if (compsize != usedbytes) {
-      printf("[streamvbyte_decode] code is buggy");
+      printf("[streamvbyte_decode] code is buggy i=%i\n", i);
       return -1;
     }
     for (size_t k = 0; k < length * sizeof(uint32_t); ++k) {
       if (recovdata[k] != in[k]) {
-        printf("[streamvbyte_decode] code is buggy");
+        printf("[streamvbyte_decode] code is buggy i=%i\n", i);
         return -1;
       }
     }
 
     compsize = streamvbyte_encode_0124((uint32_t *)in, length, compressedbuffer);
-    usedbytes = streamvbyte_decode_0124(compressedbuffer, (uint32_t *)recovdata, length);
+    if (!streamvbyte_validate_stream_0124(compressedbuffer, compsize, length)) {
+      printf("[streamvbyte_validate_stream_0124] code is buggy i=%i\n", i);
+      return -1;
+    }
 
+    usedbytes = streamvbyte_decode_0124(compressedbuffer, (uint32_t *)recovdata, length);
     if (compsize != usedbytes) {
-      printf("[streamvbyte_decode_0124] code is buggy");
+      printf("[streamvbyte_decode_0124] code is buggy i=%i\n", i);
       return -1;
     }
     for (size_t k = 0; k < length * sizeof(uint32_t); ++k) {
       if (recovdata[k] != in[k]) {
-        printf("[streamvbyte_decode_0124] code is buggy");
+        printf("[streamvbyte_decode_0124] code is buggy i=%i\n", i);
         return -1;
       }
     }