diff --git a/.gitignore b/.gitignore
index ccccd80..788c967 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,6 +49,8 @@ examples/compress-file
 examples/decompress-file
 
 examples/fuzz
+examples/afl_in
+examples/afl_out
 
 bz3grep.1
 
diff --git a/examples/fuzz-decode-block.c b/examples/fuzz-decode-block.c
new file mode 100644
index 0000000..68ba434
--- /dev/null
+++ b/examples/fuzz-decode-block.c
@@ -0,0 +1,332 @@
+/* A tiny utility for fuzzing bzip3 block decompression.
+ *
+ * Prerequisites:
+ * 
+ * - AFL https://github.com/AFLplusplus/AFLplusplus
+ * - clang (part of LLVM)
+ * 
+ * On Arch this is `pacman -S afl++ clang`
+ *
+ * # Instructions:
+ * 
+ * 1. Prepare fuzzer directories
+ * 
+ * mkdir -p afl_in && mkdir -p afl_out
+ * 
+ * 2. Build binary (to compress test data).
+ * 
+ * afl-clang fuzz-decode-block.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ * 
+ * 3. Make a fuzzer input file.
+ * 
+ * With `your_file` being an arbitrary input to test, use this utility
+ * to generate a compressed test block:
+ * 
+ * ./fuzz standard_test_files/63_byte_file.bin 63_byte_file.bin.bz3b 8
+ * ./fuzz standard_test_files/65_byte_file.bin 65_byte_file.bin.bz3b 8
+ * mv 63_byte_file.bin.bz3b afl_in/
+ * mv 65_byte_file.bin.bz3b afl_in/
+ * 
+ * For this test, it is recommended to make 2 files, one that's <64 bytes and one that's >64 bytes.
+ * 
+ * 4. Build binary (for fuzzing).
+ * 
+ * afl-clang-fast fuzz-decode-block.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ * 
+ * 5. Run the fuzzer.
+ * 
+ * AFL_SKIP_CPUFREQ=1 afl-fuzz -i afl_in -o afl_out -- ./fuzz @@
+ *
+ * 6. Wanna go faster? Multithread.
+ * 
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -M fuzzer01 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer02 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer03 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer04 -- ./fuzz @@; exec bash" &
+ * 
+ * etc. Replace `alacritty` with your terminal.
+ * 
+ * And check progress with `afl-whatsup afl_out` (updates periodically).
+ * 
+ * 7. Found a crash?
+ * 
+ * If you find a crash, consider also doing the following:
+ * 
+ *      clang fuzz-decode-block.c -g3 -O3 -march=native -o fuzz_asan -I../include "-DVERSION=\"0.0.0\"" -fsanitize=undefined -fsanitize=address
+ *
+ * And run fuzz_asan on the crashing test case (you can find it in one of the `afl_out/crashes/` folders).
+ * Attach the test case /and/ the output of fuzz_asan to the bug report.
+ * 
+ * If no error occurs, it could be that there was a memory corruption `between` the runs.
+ * In which case, you want to run AFL with address sanitizer. Use `export AFL_USE_ASAN=1` to enable
+ * address sanitizer; then run AFL.
+ * 
+ * export AFL_USE_ASAN=1
+ * afl-clang-fast fuzz-decode-block.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ */
+
+/*
+
+This hex editor template can be used to help debug a breaking file.
+Would provide for ImHex, but ImHex terminates if template is borked.
+
+
+//------------------------------------------------
+//--- 010 Editor v15.0.1 Binary Template
+//
+//      File: bzip3block.bt
+//   Authors: Sewer56
+//   Version: 1.0.0
+//   Purpose: Parse bzip3 fuzzer block data
+//  Category: Archive
+// File Mask: *.bz3b
+//------------------------------------------------
+
+// Colors for different sections
+#define COLOR_HEADER     0xA0FFA0 // Block metadata
+#define COLOR_BLOCKHEAD  0xFFB0B0 // Block headers
+#define COLOR_DATA       0xB0B0FF // Compressed data
+
+local uint32 currentBlockSize; // Store block size globally
+
+// Block metadata structure
+typedef struct {
+    uint32 orig_size;      // Original uncompressed size
+    uint32 comp_size;      // Compressed size
+    uint32 buffer_size;    // Size of decompression buffer
+} BLOCK_META <bgcolor=COLOR_HEADER>;
+
+// Regular block header (for blocks >= 64 bytes)
+typedef struct {
+    uint32 crc32;         // CRC32 checksum of uncompressed data
+    uint32 bwtIndex;      // Burrows-Wheeler transform index
+    uint8  model;         // Compression model flags:
+                         // bit 1 (0x02): LZP was used
+                         // bit 2 (0x04): RLE was used
+    
+    // Optional size fields based on compression flags
+    if(model & 0x02)     
+        uint32 lzpSize;   // Size after LZP compression
+    if(model & 0x04)     
+        uint32 rleSize;   // Size after RLE compression
+} BLOCK_HEADER <bgcolor=COLOR_BLOCKHEAD>;
+
+// Small block header (for blocks < 64 bytes)
+typedef struct {
+    uint32 crc32;        // CRC32 checksum
+    uint32 literal;      // Always 0xFFFFFFFF for small blocks
+    uint8 data[currentBlockSize - 8]; // Uncompressed data
+} SMALL_BLOCK <bgcolor=COLOR_BLOCKHEAD>;
+
+// Block content structure
+typedef struct {
+    currentBlockSize = meta.comp_size;
+    
+    if(meta.orig_size < 64) {
+        SMALL_BLOCK content;
+    } else {
+        BLOCK_HEADER header;
+        uchar data[meta.comp_size - (Popcount(header.model) * 4 + 9)];
+    }
+} BLOCK_CONTENT <bgcolor=COLOR_DATA>;
+
+// Helper function for bit counting (used for header size calculation)
+int Popcount(byte b) {
+    local int count = 0;
+    while(b) {
+        count += b & 1;
+        b >>= 1;
+    }
+    return count;
+}
+
+// Main block structure
+typedef struct {
+    BLOCK_META meta;
+    BLOCK_CONTENT content;
+} BLOCK;
+
+// Main parsing structure
+BLOCK block;
+*/
+
+#include "../include/libbz3.h"
+#include "../src/libbz3.c"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define KiB(x) ((x)*1024)
+
+// Required for AFL++ persistent mode
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+#include <unistd.h>
+__AFL_FUZZ_INIT();
+#endif
+
+size_t min_size_t(size_t a, size_t b) { // Returns the smaller of the two sizes.
+    return (b <= a) ? b : a;
+}
+
+// Returns the result of bz3_decode_block() (negative BZ3_ERR_* codes signal errors), or 0 when setup allocations fail.
+static int try_decode_block(const uint8_t *input_buf, size_t input_len) {
+    // Metadata header: three 32-bit fields; any field a short input lacks stays 0.
+    uint32_t orig_size = 0;
+    uint32_t comp_size = 0;
+    uint32_t buffer_size = 0;
+
+    // memcpy rather than pointer casts: makes no alignment/aliasing assumptions about input_buf.
+    if (input_len >= 4) memcpy(&orig_size, input_buf, sizeof orig_size);
+    if (input_len >= 8) memcpy(&comp_size, input_buf + 4, sizeof comp_size);
+    if (input_len >= 12) memcpy(&buffer_size, input_buf + 8, sizeof buffer_size);
+    // Initialize state with minimum block size
+    struct bz3_state *state = bz3_new(KiB(65));
+    if (!state) return 0; // not under test
+
+    // Fuzzer-chosen buffer size; zero-filled so bytes past the supplied data are deterministic.
+    uint8_t *buffer = calloc(1, buffer_size);
+    if (!buffer) {
+        bz3_free(state);
+        return 0; // not under test
+    }
+
+    // Copy whatever compressed data we can get
+    size_t data_len = input_len > 12 ? input_len - 12 : 0;
+    if (data_len > 0) {
+        memcpy(buffer, input_buf + 12, min_size_t(data_len, (size_t)buffer_size));
+    }
+
+    // Attempt decompression with potentially invalid parameters
+    int bzerr = bz3_decode_block(state, buffer, buffer_size, comp_size, orig_size);
+    // and pray we don't crash :p
+
+    free(buffer);
+    bz3_free(state);
+    return bzerr;
+}
+
+// Encodes `infile` as one bzip3 block and writes it to `outfile` behind the 12-byte metadata header that
+// try_decode_block() parses. Returns 0 on success, 1 on I/O/allocation failure, or the negative encoder error.
+static int encode_block(const char *infile, const char *outfile, uint32_t block_size) {
+    block_size = block_size <= KiB(65) ? KiB(65) : block_size;
+
+    // Read input file
+    FILE *fp_in = fopen(infile, "rb");
+    if (!fp_in) {
+        perror("Failed to open input file");
+        return 1;
+    }
+    fseek(fp_in, 0, SEEK_END);
+    long file_len = ftell(fp_in);
+    fseek(fp_in, 0, SEEK_SET);
+    if (file_len < 0) { // ftell() failed; a raw -1 would wrap to a huge size_t
+        perror("Failed to determine input file size");
+        fclose(fp_in);
+        return 1;
+    }
+    size_t insize = (size_t)file_len;
+    uint8_t *inbuf = malloc(insize);
+    if (!inbuf) {
+        fclose(fp_in);
+        return 1;
+    }
+    size_t nread = fread(inbuf, 1, insize, fp_in);
+    fclose(fp_in);
+
+    int rc = 1;
+    int32_t comp_size = 0;
+    uint8_t *outbuf = NULL;
+    struct bz3_state *state = NULL;
+    size_t outsize = bz3_bound(insize);
+    if (nread != insize) {
+        fprintf(stderr, "Failed to read input file\n");
+        goto cleanup;
+    }
+
+    // Initialize compression state
+    state = bz3_new(block_size);
+    if (!state) goto cleanup;
+
+    // Make output buffer
+    outbuf = malloc(outsize + 12); // +12 for metadata
+    if (!outbuf) goto cleanup;
+
+    // Store metadata
+    *(uint32_t *)outbuf = insize;        // Original size
+    *(uint32_t *)(outbuf + 8) = outsize; // Buffer size needed for decompression
+
+    // The encoder works in place on its buffer, so stage the input there before compressing.
+    memcpy(outbuf + 12, inbuf, insize);
+    comp_size = bz3_encode_block(state, outbuf + 12, insize);
+    if (comp_size < 0) {
+        printf("bz3_encode_block() failed with error code %d\n", comp_size);
+        rc = comp_size;
+        goto cleanup;
+    }
+    *(uint32_t *)(outbuf + 4) = comp_size; // Store compressed size
+
+    FILE *fp_out = fopen(outfile, "wb");
+    if (!fp_out) {
+        perror("Failed to open output file");
+        goto cleanup;
+    }
+    fwrite(outbuf, 1, comp_size + 12, fp_out);
+    fclose(fp_out);
+    printf("Encoded block from %s (%zu bytes) to %s (%d bytes)\n",
+           infile, insize, outfile, comp_size + 12);
+    rc = 0;
+cleanup:
+    if (state) bz3_free(state);
+    free(inbuf);
+    free(outbuf);
+    return rc;
+}
+
+int main(int argc, char **argv) {
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+    __AFL_INIT();
+    // Persistent mode: decode each test case exposed through __AFL_FUZZ_TESTCASE_BUF
+    while (__AFL_LOOP(1000)) {
+        try_decode_block(__AFL_FUZZ_TESTCASE_BUF, __AFL_FUZZ_TESTCASE_LEN);
+    }
+#else
+    if (argc == 4) {
+        // Compression mode: input_file output_file block_size
+        return encode_block(argv[1], argv[2], atoi(argv[3]));
+    }
+
+    if (argc != 2) {
+        fprintf(stderr, "Usage:\n");
+        fprintf(stderr, "  Decode: %s <input_file>\n", argv[0]);
+        fprintf(stderr, "  Encode: %s <input_file> <output_file> <block_size>\n", argv[0]);
+        return 1;
+    }
+
+    // Decode mode
+    FILE *fp = fopen(argv[1], "rb");
+    if (!fp) {
+        perror("Failed to open input file");
+        return 1;
+    }
+
+    fseek(fp, 0, SEEK_END);
+    long file_len = ftell(fp);
+    fseek(fp, 0, SEEK_SET);
+    size_t size = file_len < 0 ? 0 : (size_t)file_len; // ftell() failure: treat as empty input
+    uint8_t *buffer = malloc(size);
+    if (!buffer) {
+        fclose(fp);
+        return 1;
+    }
+
+    size = fread(buffer, 1, size, fp); // on a short read, decode only the bytes actually read
+    fclose(fp);
+
+    int result = try_decode_block(buffer, size);
+    free(buffer);
+    return result < 0 ? -result : 0; // bzip3 errors are negative: exit with their magnitude, 0 otherwise
+#endif
+
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/fuzz-decompress.c b/examples/fuzz-decompress.c
new file mode 100644
index 0000000..3d18a32
--- /dev/null
+++ b/examples/fuzz-decompress.c
@@ -0,0 +1,312 @@
+/* A tiny utility for fuzzing bzip3 frame decompression.
+ *
+ * Prerequisites:
+ * 
+ * - AFL https://github.com/AFLplusplus/AFLplusplus
+ * - clang (part of LLVM)
+ * 
+ * On Arch this is `pacman -S afl++ clang`
+ *
+ * # Instructions:
+ * 
+ * 1. Prepare fuzzer directories
+ * 
+ * mkdir -p afl_in && mkdir -p afl_out
+ * 
+ * 2. Build binary (to compress test data).
+ * 
+ * afl-clang fuzz-decompress.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ * 
+ * 3. Make a fuzzer input file.
+ * 
+ * With `your_file` being an arbitrary input to test, use this utility
+ * to generate a compressed test frame:
+ * 
+ * ./fuzz hl-api.c hl-api.c.bz3 8
+ * mv hl-api.c.bz3 afl_in/
+ * 
+ * 4. Build binary (for fuzzing).
+ * 
+ * afl-clang-fast fuzz-decompress.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ * 
+ * 5. Run the fuzzer.
+ * 
+ * AFL_SKIP_CPUFREQ=1 afl-fuzz -i afl_in -o afl_out -- ./fuzz @@
+ *
+ * 6. Wanna go faster? Multithread.
+ * 
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -M fuzzer01 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer02 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer03 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer04 -- ./fuzz @@; exec bash" &
+ * 
+ * etc. Replace `alacritty` with your terminal.
+ * 
+ * And check progress with `afl-whatsup afl_out` (updates periodically).
+ * 
+ * 7. Found a crash?
+ * 
+ * If you find a crash, consider also doing the following:
+ * 
+ *      clang fuzz-decompress.c -g3 -O3 -march=native -o fuzz_asan -I../include "-DVERSION=\"0.0.0\"" -fsanitize=undefined -fsanitize=address
+ *
+ * And run fuzz_asan on the crashing test case (you can find it in one of the `afl_out/crashes/` folders).
+ * Attach the test case /and/ the output of fuzz_asan to the bug report.
+ * 
+ * If no error occurs, it could be that there was a memory corruption `between` the runs.
+ * In which case, you want to run AFL with address sanitizer. Use `export AFL_USE_ASAN=1` to enable
+ * address sanitizer; then run AFL.
+ * 
+ * export AFL_USE_ASAN=1
+ * afl-clang-fast fuzz-decompress.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ */
+
+
+/*
+This hex editor template can be used to help debug a breaking file.
+Would provide for ImHex, but ImHex terminates if template is borked.
+
+//------------------------------------------------
+//--- 010 Editor v15.0.1 Binary Template
+//
+//      File: bzip3-fuzz-decompress.bt
+//   Authors: Sewer56
+//   Version: 1.0.0
+//   Purpose: Parse bzip3 fuzzer data
+//------------------------------------------------
+
+// Colors for different sections
+#define COLOR_HEADER     0xA0FFA0 // Frame header
+#define COLOR_BLOCKHEAD  0xFFB0B0 // Block headers
+#define COLOR_DATA       0xB0B0FF // Compressed data
+
+local uint32 currentBlockSize; // Store block size globally
+
+// Frame header structure
+typedef struct {
+    char signature[5];     // "BZ3v1"
+    uint32 blockSize;      // Maximum block size
+    uint32 block_count;
+} FRAME_HEADER <bgcolor=COLOR_HEADER>;
+
+// Regular block header (for blocks >= 64 bytes)
+typedef struct {
+    uint32 crc32;         // CRC32 checksum of uncompressed data
+    uint32 bwtIndex;      // Burrows-Wheeler transform index
+    uint8  model;         // Compression model flags:
+                         // bit 1 (0x02): LZP was used
+                         // bit 2 (0x04): RLE was used
+    
+    // Optional size fields based on compression flags
+    if(model & 0x02)     
+        uint32 lzpSize;   // Size after LZP compression
+    if(model & 0x04)     
+        uint32 rleSize;   // Size after RLE compression
+} BLOCK_HEADER <bgcolor=COLOR_BLOCKHEAD>;
+
+// Small block header (for blocks < 64 bytes)
+typedef struct {
+    uint32 crc32;        // CRC32 checksum
+    uint32 literal;      // Always 0xFFFFFFFF for small blocks
+    uint8 data[currentBlockSize - 8]; // Uncompressed data
+} SMALL_BLOCK <bgcolor=COLOR_BLOCKHEAD>;
+
+// Main block structure
+typedef struct {
+    uint32 compressedSize;  // Size of compressed block
+    uint32 origSize;        // Original uncompressed size
+    
+    currentBlockSize = compressedSize; // Store for use in SMALL_BLOCK
+    
+    if(origSize < 64) {
+        SMALL_BLOCK content;
+    } else {
+        BLOCK_HEADER header;
+        uchar data[compressedSize - (Popcount(header.model) * 4 + 9)];
+    }
+} BLOCK <bgcolor=COLOR_DATA>;
+
+// Helper function for bit counting (used for header size calculation)
+int Popcount(byte b) {
+    local int count = 0;
+    while(b) {
+        count += b & 1;
+        b >>= 1;
+    }
+    return count;
+}
+
+// Main parsing structure
+uint32 orig_size;
+FRAME_HEADER frameHeader;
+
+// Read blocks until end of file
+while(!FEof()) {
+    BLOCK block;
+}
+
+*/
+
+#include "../include/libbz3.h"
+#include "../src/libbz3.c"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define KiB(x) ((x)*1024)
+
+// Required for AFL++ persistent mode
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+#include <unistd.h>
+__AFL_FUZZ_INIT();
+#endif
+
+// Maximum allowed size to prevent excessive memory allocation
+#define MAX_SIZE 0x10000000 // 256MB
+
+// Returns BZ3_OK (0) on success; otherwise -1/-2/-3 for harness-side rejections (short input,
+// orig_size above MAX_SIZE, allocation failure) or the error code reported by bz3_decompress().
+static int try_decompress(const uint8_t *input_buf, size_t input_len) {
+    if (input_len < 8) { // invalid, does not contain orig_size
+        return -1;
+    }
+    // Load the size prefix with memcpy: no alignment assumptions about input_buf.
+    uint32_t stored_size;
+    memcpy(&stored_size, input_buf, sizeof stored_size);
+    if (stored_size > MAX_SIZE) {
+        return -2; // enforce the allocation cap; the fuzzer controls this field
+    }
+    size_t orig_size = stored_size;
+    uint8_t *outbuf = malloc(orig_size);
+    if (!outbuf) {
+        return -3;
+    }
+
+    // We read orig_size from the input as we also want to fuzz it.
+    int bzerr = bz3_decompress(input_buf + sizeof(uint32_t), outbuf, input_len - sizeof(uint32_t), &orig_size);
+    if (bzerr != BZ3_OK) {
+        printf("bz3_decompress() failed with error code %d\n", bzerr);
+    } else {
+        printf("OK, %d => %d\n", (int)input_len, (int)orig_size);
+    }
+
+    free(outbuf);
+    return bzerr;
+}
+
+// Compresses `infile` into a bzip3 frame and writes it to `outfile`, prefixed with the 32-bit original size that
+// try_decompress() reads. Returns 0 on success, 1 on I/O/allocation failure, or the bz3_compress() error code.
+static int compress_file(const char *infile, const char *outfile, uint32_t block_size) {
+    block_size = block_size <= KiB(65) ? KiB(65) : block_size;
+
+    // Read the data into `inbuf`
+    FILE *fp_in = fopen(infile, "rb");
+    if (!fp_in) {
+        perror("Failed to open input file");
+        return 1;
+    }
+    fseek(fp_in, 0, SEEK_END);
+    long file_len = ftell(fp_in);
+    fseek(fp_in, 0, SEEK_SET);
+    if (file_len < 0) { // ftell() failed; a raw -1 would wrap to a huge size_t
+        perror("Failed to determine input file size");
+        fclose(fp_in);
+        return 1;
+    }
+    size_t insize = (size_t)file_len;
+    uint8_t *inbuf = malloc(insize);
+    if (!inbuf) {
+        fclose(fp_in);
+        return 1;
+    }
+    size_t nread = fread(inbuf, 1, insize, fp_in);
+    fclose(fp_in);
+
+    int rc = 1;
+    uint8_t *outbuf = NULL;
+    size_t outsize = bz3_bound(insize);
+    if (nread != insize) {
+        fprintf(stderr, "Failed to read input file\n");
+        goto cleanup;
+    }
+    // Make buffer for output.
+    outbuf = malloc(outsize + sizeof(uint32_t));
+    if (!outbuf) goto cleanup;
+    // Store original size first: try_decompress() reads this field and passes it to bz3_decompress().
+    *(uint32_t *)outbuf = insize;
+    int bzerr = bz3_compress(block_size, inbuf, outbuf + sizeof(uint32_t), insize, &outsize);
+    if (bzerr != BZ3_OK) {
+        printf("bz3_compress() failed with error code %d\n", bzerr);
+        rc = bzerr;
+        goto cleanup;
+    }
+
+    FILE *fp_out = fopen(outfile, "wb");
+    if (!fp_out) {
+        perror("Failed to open output file");
+        goto cleanup;
+    }
+    fwrite(outbuf, 1, outsize + sizeof(uint32_t), fp_out);
+    fclose(fp_out);
+    printf("Compressed %s (%zu bytes) to %s (%zu bytes)\n",
+           infile, insize, outfile, outsize + sizeof(uint32_t));
+    rc = 0;
+cleanup:
+    free(inbuf);
+    free(outbuf);
+    return rc;
+}
+
+int main(int argc, char **argv) {
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+    __AFL_INIT();
+    // Persistent mode: decompress each test case exposed through __AFL_FUZZ_TESTCASE_BUF
+    while (__AFL_LOOP(1000)) {
+        try_decompress(__AFL_FUZZ_TESTCASE_BUF, __AFL_FUZZ_TESTCASE_LEN);
+    }
+#else
+    if (argc == 4) {
+        // Compression mode: input_file output_file block_size
+        return compress_file(argv[1], argv[2], atoi(argv[3]));
+    }
+
+    if (argc != 2) {
+        fprintf(stderr, "Usage:\n");
+        fprintf(stderr, "  Decompress: %s <input_file>\n", argv[0]);
+        fprintf(stderr, "  Compress:   %s <input_file> <output_file> <block_size>\n", argv[0]);
+        return 1;
+    }
+
+    // Decompression mode
+    FILE *fp = fopen(argv[1], "rb");
+    if (!fp) {
+        perror("Failed to open input file");
+        return 1;
+    }
+
+    fseek(fp, 0, SEEK_END);
+    long file_len = ftell(fp);
+    fseek(fp, 0, SEEK_SET);
+
+    // No minimum-size shortcut here: try_decompress() validates the length itself, so a
+    // small crashing test case replays through the same path the persistent-mode loop
+    // above exercises instead of being skipped.
+    size_t size = file_len < 0 ? 0 : (size_t)file_len; // ftell() failure: treat as empty input
+
+    uint8_t *buffer = malloc(size);
+    if (!buffer) {
+        fclose(fp);
+        return 1;
+    }
+
+    size = fread(buffer, 1, size, fp); // on a short read, use only the bytes actually read
+    fclose(fp);
+
+    int result = try_decompress(buffer, size);
+    free(buffer);
+    return result > 0 ? result : 0; // only a positive result becomes the exit status; the harness codes (-1..-3) are negative
+#endif
+
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/fuzz-round-trip.c b/examples/fuzz-round-trip.c
new file mode 100644
index 0000000..7517c5f
--- /dev/null
+++ b/examples/fuzz-round-trip.c
@@ -0,0 +1,164 @@
+/* A tiny utility for fuzzing bzip3 round-trip compression/decompression.
+ *
+ * Prerequisites:
+ * 
+ * - AFL https://github.com/AFLplusplus/AFLplusplus
+ * - clang (part of LLVM)
+ * 
+ * On Arch this is `pacman -S afl++ clang`
+ *
+ * # Instructions:
+ * 
+ * 1. Prepare fuzzer directories
+ * 
+ * mkdir -p afl_in && mkdir -p afl_out
+ * 
+ * 2. Insert a test file to afl_in/
+ * 
+ * cp ./standard_test_files/63_byte_file.bin afl_in/
+ * 
+ * 3. Build binary (for fuzzing)
+ * 
+ * afl-clang-fast fuzz-round-trip.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ * 
+ * 4. Run the fuzzer
+ * 
+ * AFL_SKIP_CPUFREQ=1 afl-fuzz -i afl_in -o afl_out -- ./fuzz @@
+ *
+ * 5. Need to go faster? Multithread.
+ * 
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -M fuzzer01 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer02 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer03 -- ./fuzz @@; exec bash" &
+ * alacritty -e bash -c "afl-fuzz -i afl_in -o afl_out -S fuzzer04 -- ./fuzz @@; exec bash" &
+ * 
+ * etc. Replace `alacritty` with your terminal.
+ * 
+ * 6. For ASAN testing:
+ *
+ * export AFL_USE_ASAN=1
+ * afl-clang-fast fuzz-round-trip.c -I../include -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
+ */
+
+#include "../include/libbz3.h"
+#include "../src/libbz3.c"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define KiB(x) ((x)*1024)
+#define DEFAULT_BLOCK_SIZE KiB(65)
+
+// Required for AFL++ persistent mode
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+#include <unistd.h>
+__AFL_FUZZ_INIT();
+#endif
+
+// Reports `msg` on stderr and terminates the process via abort(); declared noreturn
+static void __attribute__((noreturn)) crash_with_message(const char* msg) {
+    fprintf(stderr, "Emulating crash: %s\n", msg);
+    // Use abort() so the failure surfaces as a crash that ASAN and other tools can catch
+    abort();
+}
+
+// Encodes the input as one block, decodes it again in the same buffer and checks the bytes survive.
+// Returns 0 on success or for empty input, -1 if a setup allocation fails; aborts on any failure.
+static int try_round_trip(const uint8_t *input_buf, size_t input_len) {
+    if (input_len == 0) return 0;
+
+    // Block size: the input length, but never below DEFAULT_BLOCK_SIZE
+    size_t block_size = DEFAULT_BLOCK_SIZE;
+    if (input_len > block_size) {
+        block_size = input_len;
+    }
+
+    struct bz3_state *state = bz3_new(block_size);
+    if (!state) {
+        return -1; // allocation failures not tested.
+    }
+
+    // One scratch buffer serves both directions; it is sized with bz3_bound(input_len)
+    size_t work_cap = bz3_bound(input_len);
+    uint8_t *work_buf = malloc(work_cap);
+    if (!work_buf) {
+        bz3_free(state);
+        return -1; // allocation failures not tested.
+    }
+
+    // Step 0: Stage the input in the scratch buffer
+    memmove(work_buf, input_buf, input_len);
+
+    // Set to the diagnostic of the first step that fails; stays NULL when the round trip succeeds
+    const char *failure = NULL;
+
+    // Step 1: Compress the input
+    int32_t comp_size = bz3_encode_block(state, work_buf, input_len);
+    if (comp_size < 0) {
+        failure = "Compression failed";
+    } else {
+        // Step 2: Decompress
+        int bzerr = bz3_decode_block(state, work_buf, work_cap, comp_size, input_len);
+        if (bzerr < 0 || bzerr != input_len) {
+            failure = "Decompression failed";
+        } else if (memcmp(input_buf, work_buf, input_len) != 0) {
+            // Step 3: Compare
+            failure = "Round-trip data mismatch";
+        }
+    }
+
+    // Release everything before reporting, then crash if any step failed
+    bz3_free(state);
+    free(work_buf);
+    if (failure) crash_with_message(failure);
+    return 0;
+}
+
+// Loads `filename` into memory and round-trips it; returns 1 if it cannot be opened, aborts on allocation/read failure.
+static int test_file(const char *filename) {
+    FILE *input = fopen(filename, "rb");
+    if (input == NULL) {
+        perror("Failed to open input file");
+        return 1;
+    }
+
+    fseek(input, 0, SEEK_END);
+    size_t length = ftell(input);
+    fseek(input, 0, SEEK_SET);
+    uint8_t *contents = malloc(length);
+    const char *failure = NULL;
+    if (contents == NULL) {
+        failure = "Failed to allocate input buffer";
+    } else if (fread(contents, 1, length, input) != length) {
+        failure = "Failed to read input file";
+    }
+    fclose(input);
+    if (failure != NULL) {
+        free(contents);
+        crash_with_message(failure);
+    }
+
+    int outcome = try_round_trip(contents, length);
+    free(contents);
+    return outcome;
+}
+
+int main(int argc, char **argv) {
+#ifdef __AFL_HAVE_MANUAL_CONTROL
+    __AFL_INIT();
+    // Persistent mode: round-trip each test case exposed through __AFL_FUZZ_TESTCASE_BUF
+    while (__AFL_LOOP(1000)) {
+        try_round_trip(__AFL_FUZZ_TESTCASE_BUF, __AFL_FUZZ_TESTCASE_LEN);
+    }
+#else
+    // Standalone mode: exactly one argument, the file to round-trip
+    if (argc == 2) {
+        return test_file(argv[1]);
+    }
+    fprintf(stderr, "Usage: %s <input_file>\n", argv[0]);
+    return 1;
+#endif
+
+    return 0;
+}
\ No newline at end of file
diff --git a/examples/fuzz.c b/examples/fuzz.c
deleted file mode 100644
index 0485a39..0000000
--- a/examples/fuzz.c
+++ /dev/null
@@ -1,86 +0,0 @@
-
-/* A tiny utility for fuzzing bzip3.
- *
- * Prerequisites:
- * 
- * - AFL https://github.com/AFLplusplus/AFLplusplus
- * - clang (part of LLVM)
- * 
- * On Arch this is `pacman -S afl++ clang`
- *
- * # Instructions:
- * 
- * 1. Build the Repository (per example in README.md)
- * 
- * This will get you a working binary of `bzip3` (in repo root).
- * Then cd into this (examples) folder.
- * 
- * 2. Prepare fuzzer directories
- * 
- * mkdir -p afl_in && mkdir -p afl_out
- * 
- * 3. Make a fuzzer input file.
- * 
- * With `your_file` being an arbitrary input to test.
- * 
- * ../bzip3 -e your_file
- * mv your_file.bz3 afl_in/
- * 
- * 4. Build instrumented binary.
- * 
- * afl-clang fuzz.c -I../include ../src/libbz3.c -o fuzz -g3 "-DVERSION=\"0.0.0\"" -O3 -march=native
- * 
- * 5. Run the fuzzer.
- * 
- * AFL_SKIP_CPUFREQ=1 afl-fuzz -i afl_in -o afl_out -- ./fuzz @@
- *
- * 6. Found a crash?
- * 
- * If you find a crash, consider also doing the following:
- * 
- * clang fuzz.c ../src/libbz3.c -g3 -O3 -march=native -o fuzz_asan -I../include "-DVERSION=\"0.0.0\"" -fsanitize=undefined -fsanitize=address
- *
- * And run fuzz_asan on the crashing test case. Attach the test case /and/ the output of fuzz_asan to the bug report.
- */
-
-#include <libbz3.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-int main(int argc, char ** argv) {
-    // Read the entire input file to memory:
-    FILE * fp = fopen(argv[1], "rb");
-    fseek(fp, 0, SEEK_END);
-    size_t size = ftell(fp);
-    fseek(fp, 0, SEEK_SET);
-    volatile uint8_t * buffer = malloc(size);
-    fread(buffer, 1, size, fp);
-    fclose(fp);
-
-    if (size < 64) {
-        // Too small.
-        free(buffer);
-        return 0;
-    }
-
-    // Decompress the file:
-    size_t orig_size = *(size_t *)buffer;
-    if (orig_size >= 0x10000000) {
-        // Sanity check: don't allocate more than 256MB.
-        free(buffer);
-        return 0;
-    }
-    uint8_t * outbuf = malloc(orig_size);
-    int bzerr = bz3_decompress(buffer + sizeof(size_t), outbuf, size - sizeof(size_t), &orig_size);
-    if (bzerr != BZ3_OK) {
-        printf("bz3_decompress() failed with error code %d", bzerr);
-        free(outbuf);
-        free(buffer);
-        return 1;
-    }
-
-    printf("OK, %d => %d", size, orig_size);
-    free(outbuf);
-    free(buffer);
-    return 0;
-}
diff --git a/examples/standard_test_files/63_byte_file.bin b/examples/standard_test_files/63_byte_file.bin
new file mode 100644
index 0000000..5c80e5d
--- /dev/null
+++ b/examples/standard_test_files/63_byte_file.bin
@@ -0,0 +1 @@
+	 !"#$%&'()0123456789@ABCDEFGHIPQRSTUVWXY`abc
\ No newline at end of file
diff --git a/examples/standard_test_files/65_byte_file.bin b/examples/standard_test_files/65_byte_file.bin
new file mode 100644
index 0000000..5fc7824
--- /dev/null
+++ b/examples/standard_test_files/65_byte_file.bin
@@ -0,0 +1 @@
+	 !"#$%&'()0123456789@ABCDEFGHIPQRSTUVWXY`abcde
\ No newline at end of file
diff --git a/examples/standard_test_files/readme.txt b/examples/standard_test_files/readme.txt
new file mode 100644
index 0000000..ae7d134
--- /dev/null
+++ b/examples/standard_test_files/readme.txt
@@ -0,0 +1,4 @@
+This is a standard set of files to use as inputs for fuzzer testing:
+
+- 65_byte_file.bin: 65 bytes, all unique
+- 63_byte_file.bin: 63 bytes, all unique
diff --git a/include/libbz3.h b/include/libbz3.h
index 4b31a38..af447e7 100644
--- a/include/libbz3.h
+++ b/include/libbz3.h
@@ -52,6 +52,7 @@ extern "C" {
 #define BZ3_ERR_TRUNCATED_DATA -5
 #define BZ3_ERR_DATA_TOO_BIG -6
 #define BZ3_ERR_INIT -7
+#define BZ3_ERR_DATA_SIZE_TOO_SMALL -8
 
 struct bz3_state;
 
@@ -90,7 +91,7 @@ BZIP3_API size_t bz3_bound(size_t input_size);
 /* ** HIGH LEVEL APIs ** */
 
 /**
- * @brief Compress a block of data. This function does not support parallelism
+ * @brief Compress a frame. This function does not support parallelism
  * by itself, consider using the low level `bz3_encode_blocks()` function instead.
  * Using the low level API might provide better performance.
  * Returns a bzip3 error code; BZ3_OK when the operation is successful.
@@ -100,7 +101,7 @@ BZIP3_API size_t bz3_bound(size_t input_size);
 BZIP3_API int bz3_compress(uint32_t block_size, const uint8_t * in, uint8_t * out, size_t in_size, size_t * out_size);
 
 /**
- * @brief Decompress a block of data. This function does not support parallelism
+ * @brief Decompress a frame. This function does not support parallelism
  * by itself, consider using the low level `bz3_decode_blocks()` function instead.
  * Using the low level API might provide better performance.
  * Returns a bzip3 error code; BZ3_OK when the operation is successful.
@@ -108,6 +109,63 @@ BZIP3_API int bz3_compress(uint32_t block_size, const uint8_t * in, uint8_t * ou
  */
 BZIP3_API int bz3_decompress(const uint8_t * in, uint8_t * out, size_t in_size, size_t * out_size);
 
+/**
+ * @brief Calculate the minimal memory required for compression with the given block size.
+ * This includes all internal buffers and state structures. This calculates the amount of bytes
+ * that will be allocated by a call to `bz3_new()`.
+ * 
+ * @details Memory allocation and usage patterns:
+ * 
+ * bz3_new():
+ *    - Allocates all memory upfront:
+ *      - Core state structure (sizeof(struct bz3_state))
+ *      - Swap buffer (bz3_bound(block_size) bytes)
+ *      - SAIS array (BWT_BOUND(block_size) * sizeof(int32_t) bytes)
+ *      - LZP lookup table ((1 << LZP_DICTIONARY) * sizeof(int32_t) bytes)
+ *      - Compression state (sizeof(state))
+ *    - All memory remains allocated until bz3_free()
+ * 
+ * Additional memory may be used depending on API used from here.
+ * 
+ * # Low Level APIs
+ * 
+ * 1. bz3_encode_block() / bz3_decode_block():
+ *    - Uses pre-allocated memory from bz3_new()
+ *    - No additional memory allocation except for libsais (usually ~16KiB)
+ *    - Peak memory usage of physical RAM varies with compression stages:
+ *      - LZP: Uses LZP lookup table + swap buffer
+ *      - BWT: Uses SAIS array + swap buffer
+ *      - Entropy coding: Uses compression state (cm_state) + swap buffer
+ * 
+ * Using the higher level API, `bz3_compress`, expect an additional allocation
+ * of `bz3_bound(block_size)`.
+ * 
+ * In the parallel version `bz3_encode_blocks`, each thread gets its own state,
+ * so memory usage is `n_threads * bz3_min_memory_needed()`.
+ * 
+ * # High Level APIs
+ * 
+ * 1. bz3_compress():
+ *    - Allocates additional temporary compression buffer (bz3_bound(block_size) bytes)
+ *      in addition to the memory amount returned by this method call and libsais.
+ *    - Everything is freed after compression completes
+ * 
+ * 2. bz3_decompress():
+ *    - Allocates additional temporary compression buffer (bz3_bound(block_size) bytes)
+ *      in addition to the memory amount returned by this method call and libsais.
+ *    - Everything is freed after decompression completes
+ * 
+ * Memory remains constant during operation, with the exception of some small allocations from libsais during
+ * the BWT stage. Those are not accounted for by this function, though they usually amount to ~16KiB, which is negligible.
+ * Technically speaking, the worst case for BWT is 2*block_size.
+ * 
+ * No dynamic (re)allocation occurs outside of that.
+ * 
+ * @param block_size The block size to be used for compression
+ * @return The total number of bytes required for compression, or 0 if block_size is invalid
+ */
+BZIP3_API size_t bz3_min_memory_needed(int32_t block_size);
+
 /* ** LOW LEVEL APIs ** */
 
 /**
@@ -119,12 +177,21 @@ BZIP3_API int32_t bz3_encode_block(struct bz3_state * state, uint8_t * buffer, i
 
 /**
  * @brief Decode a single block.
- * `buffer' must be able to hold at least `bz3_bound(orig_size)' bytes. The size must not exceed the block size
- * associated with the state.
- * @param size The size of the compressed data in `buffer'
+ * 
+ * `buffer' must be able to hold at least `bz3_bound(orig_size)' bytes
+ * in order to ensure decompression will succeed for all possible bzip3 blocks.
+ * 
+ * In most (but not all) cases, a buffer of `orig_size` bytes is sufficient.
+ * If it is not sufficient, you must allocate a buffer of size `bz3_bound(orig_size)` temporarily.
+ * 
+ * If `buffer_size` is too small, `BZ3_ERR_DATA_SIZE_TOO_SMALL` will be returned.
+ * The size must not exceed the block size associated with the state.
+ * 
+ * @param buffer_size The size of the buffer at 'buffer'
+ * @param compressed_size The size of the compressed data in 'buffer'
  * @param orig_size The original size of the data before compression.
  */
-BZIP3_API int32_t bz3_decode_block(struct bz3_state * state, uint8_t * buffer, int32_t size, int32_t orig_size);
+BZIP3_API int32_t bz3_decode_block(struct bz3_state * state, uint8_t * buffer, size_t buffer_size, int32_t compressed_size, int32_t orig_size);
 
 /**
  * @brief Encode `n' blocks, all in parallel.
@@ -142,9 +209,32 @@ BZIP3_API void bz3_encode_blocks(struct bz3_state * states[], uint8_t * buffers[
  * @brief Decode `n' blocks, all in parallel.
  * Same specifics as `bz3_encode_blocks', but doesn't overwrite `sizes'.
  */
-BZIP3_API void bz3_decode_blocks(struct bz3_state * states[], uint8_t * buffers[], int32_t sizes[],
+BZIP3_API void bz3_decode_blocks(struct bz3_state * states[], uint8_t * buffers[], size_t buffer_sizes[], int32_t sizes[],
                                  int32_t orig_sizes[], int32_t n);
 
+/**
+ * @brief Check if using original file size as buffer size is sufficient for decompressing
+ * a block at `block` pointer.
+ * 
+ * @param block Pointer to the compressed block data
+ * @param block_size Size of the block buffer in bytes (at least 9 bytes for the fixed header; more if LZP/RLE size fields are present)
+ * @param orig_size Size of the original uncompressed data 
+ * @return 1 if original size is sufficient, 0 if insufficient, -1 on header error (insufficient buffer size)
+ * 
+ * @remarks
+ * 
+ *      This function is useful for external APIs using the low level block encoding API,
+ *      `bz3_encode_block`. You would normally call this directly after `bz3_encode_block`
+ *      on the block that has been output.
+ *      
+ *      The purpose of this function is to prevent encoding blocks that would require an additional
+ *      malloc at decompress time.
+ *      The goal is to prevent erroring with `BZ3_ERR_DATA_SIZE_TOO_SMALL` when the block is
+ *      later decoded into a buffer of only `orig_size` bytes.
+ */
+BZIP3_API int bz3_orig_size_sufficient_for_decode(const uint8_t * block, size_t block_size, int32_t orig_size);
+
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/libbz3.c b/src/libbz3.c
index 91bc272..554aa36 100644
--- a/src/libbz3.c
+++ b/src/libbz3.c
@@ -18,12 +18,18 @@
  */
 
 #include "libbz3.h"
-
 #include <stdlib.h>
 #include <string.h>
-
 #include "libsais.h"
 
+#if defined(__GNUC__) || defined(__clang__)
+    #define LIKELY(x)   __builtin_expect(!!(x), 1)
+    #define UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+    #define LIKELY(x)   (x)
+    #define UNLIKELY(x) (x)
+#endif
+
 /* CRC32 implementation. Since CRC32 generally takes less than 1% of the runtime on real-world data (e.g. the
    Silesia corpus), I decided against using hardware CRC32. This implementation is simple, fast, fool-proof and
    good enough to be used with bzip3. */
@@ -87,6 +93,34 @@ static u32 lzp_upcast(const u8 * ptr) {
     return val;
 }
 
+/**
+ * @brief Decide whether a buffer is large enough for decoding a bz3 block.
+ *
+ * The data handed to the last encoding step can be one of the following:
+ * - original data
+ * - original data + LZP
+ * - original data + RLE
+ * - original data + RLE + LZP
+ *
+ * While those steps are undone one by one, every intermediate result has to
+ * fit into the buffer. The sizes involved are `lzp_size`, `rle_size` and
+ * `orig_size`, so each of them is compared against `buffer_size`.
+ *
+ * @param buffer_size Size of the output buffer
+ * @param lzp_size Size recorded for the LZP step (-1 if LZP not used)
+ * @param rle_size Size recorded for the RLE step (-1 if RLE not used)
+ * @param orig_size Size of the original data; negative values are treated as 0
+ * @return 1 if all three sizes fit into `buffer_size`, 0 otherwise
+ */
+static int bz3_check_buffer_size(size_t buffer_size, s32 lzp_size, s32 rle_size, s32 orig_size) {
+    // Clamp negatives (such as the -1 "not used" marker) to 0 so they cannot wrap when widened to size_t.
+    const size_t need_lzp = (lzp_size > 0) ? (size_t)lzp_size : 0;
+    const size_t need_rle = (rle_size > 0) ? (size_t)rle_size : 0;
+    const size_t need_orig = (orig_size > 0) ? (size_t)orig_size : 0;
+    if (need_lzp > buffer_size || need_rle > buffer_size) return 0;
+    return need_orig <= buffer_size;
+}
+
 static s32 lzp_encode_block(const u8 * RESTRICT in, const u8 * in_end, u8 * RESTRICT out, u8 * out_end,
                             s32 * RESTRICT lut) {
     const u8 * ins = in;
@@ -173,21 +207,23 @@ static s32 lzp_decode_block(const u8 * RESTRICT in, const u8 * in_end, s32 * RES
 
     while (in < in_end && out < out_end) {
         u32 idx = (ctx >> 15 ^ ctx ^ ctx >> 3) & ((s32)(1 << LZP_DICTIONARY) - 1);
-        s32 val = lut[idx];
+        s32 val = lut[idx]; // SAFETY: guaranteed to be in-bounds by & mask. 
         lut[idx] = (s32)(out - outs);
         if (*in == MATCH && val > 0) {
             in++;
+            // SAFETY: 'in' is advanced here, but it may have been at last index in the case of untrusted bad data.
+            if (UNLIKELY(in == in_end)) return -1;
             if (*in != 255) {
                 s32 len = LZP_MIN_MATCH;
                 while (1) {
-                    if (in == in_end) return -1;
+                    if (UNLIKELY(in == in_end)) return -1;
                     len += *in;
                     if (*in++ != 254) break;
                 }
 
                 const u8 * ref = outs + val;
                 const u8 * oe = out + len;
-                if (oe > out_end) oe = out_end;
+                if (UNLIKELY(oe > out_end)) oe = out_end;
 
                 while (out < oe) *out++ = *ref++;
 
@@ -489,6 +525,8 @@ BZIP3_API const char * bz3_strerror(struct bz3_state * state) {
             return "Truncated data";
         case BZ3_ERR_DATA_TOO_BIG:
             return "Too much data";
+        case BZ3_ERR_DATA_SIZE_TOO_SMALL:
+            return "Size of buffer `buffer_size` passed to the block decoder (bz3_decode_block) is too small. See function docs for details.";
         default:
             return "Unknown error";
     }
@@ -615,41 +653,59 @@ BZIP3_API s32 bz3_encode_block(struct bz3_state * state, u8 * buffer, s32 data_s
     return data_size + overhead * 4 + 1;
 }
 
-BZIP3_API s32 bz3_decode_block(struct bz3_state * state, u8 * buffer, s32 data_size, s32 orig_size) {
+BZIP3_API s32 bz3_decode_block(struct bz3_state * state, u8 * buffer, size_t buffer_size, s32 compressed_size, s32 orig_size) {
+    // Need minimum bytes for initial header, and compressed_size needs to fit within claimed buffer size.
+    if (buffer_size < 9 || buffer_size < compressed_size) {
+        state->last_error = BZ3_ERR_DATA_SIZE_TOO_SMALL;
+        return -1;
+    }
+
     // Read the header.
     u32 crc32 = read_neutral_s32(buffer);
     s32 bwt_idx = read_neutral_s32(buffer + 4);
 
-    if (data_size > bz3_bound(state->block_size) || data_size < 0) {
+    if (compressed_size > bz3_bound(state->block_size) || compressed_size < 0) {
         state->last_error = BZ3_ERR_MALFORMED_HEADER;
         return -1;
     }
 
     if (bwt_idx == -1) {
-        if (data_size - 8 > 64 || data_size < 8) {
+        if (compressed_size - 8 > 64 || compressed_size < 8) {
             state->last_error = BZ3_ERR_MALFORMED_HEADER;
             return -1;
         }
 
-        memmove(buffer, buffer + 8, data_size - 8);
+        // Ensure there's enough space for the raw copied data.
+        if (compressed_size - 8 > buffer_size) {
+            state->last_error = BZ3_ERR_DATA_SIZE_TOO_SMALL;
+            return -1;
+        }
 
-        if (crc32sum(1, buffer, data_size - 8) != crc32) {
+        memmove(buffer, buffer + 8, compressed_size - 8);
+
+        if (crc32sum(1, buffer, compressed_size - 8) != crc32) {
             state->last_error = BZ3_ERR_CRC;
             return -1;
         }
 
-        return data_size - 8;
+        return compressed_size - 8;
     }
 
     s8 model = buffer[8];
-    s32 lzp_size = -1, rle_size = -1, p = 0;
 
+    // Ensure we have sufficient bytes for the rle/lzp sizes.
+    size_t needed_header_size = 9 + ((model & 2) * 4) + ((model & 4) * 4);
+    if (buffer_size < needed_header_size) {
+        state->last_error = BZ3_ERR_DATA_SIZE_TOO_SMALL;
+        return -1;
+    }
+
+    s32 lzp_size = -1, rle_size = -1, p = 0;
     if (model & 2) lzp_size = read_neutral_s32(buffer + 9 + 4 * p++);
     if (model & 4) rle_size = read_neutral_s32(buffer + 9 + 4 * p++);
-
     p += 2;
 
-    data_size -= p * 4 + 1;
+    compressed_size -= p * 4 + 1;
 
     if (((model & 2) && (lzp_size > bz3_bound(state->block_size) || lzp_size < 0)) ||
         ((model & 4) && (rle_size > bz3_bound(state->block_size) || rle_size < 0))) {
@@ -662,40 +718,51 @@ BZIP3_API s32 bz3_decode_block(struct bz3_state * state, u8 * buffer, s32 data_s
         return -1;
     }
 
+    // Size that undoing BWT+BCM should decompress into.
+    s32 size_before_bwt;
+
+    if (model & 2)
+        size_before_bwt = lzp_size;
+    else if (model & 4)
+        size_before_bwt = rle_size;
+    else
+        size_before_bwt = orig_size;
+
+    // Note(sewer): It's technically valid within the spec to create a bzip3 block
+    // where the size after LZP/RLE is larger than the original input. Some earlier encoders
+    // even (mistakenly?) were able to do this.
+    if (!bz3_check_buffer_size(buffer_size, lzp_size, rle_size, orig_size)) {
+        state->last_error = BZ3_ERR_DATA_SIZE_TOO_SMALL;
+        return -1;
+    }
+
     // Decode the data.
     u8 *b1 = buffer, *b2 = state->swap_buffer;
 
     begin(state->cm_state);
     state->cm_state->in_queue = b1 + p * 4 + 1;
     state->cm_state->input_ptr = 0;
-    state->cm_state->input_max = data_size;
-
-    s32 size_src;
+    state->cm_state->input_max = compressed_size;
 
-    if (model & 2)
-        size_src = lzp_size;
-    else if (model & 4)
-        size_src = rle_size;
-    else
-        size_src = orig_size;
-
-    decode_bytes(state->cm_state, b2, size_src);
+    decode_bytes(state->cm_state, b2, size_before_bwt);
     swap(b1, b2);
 
-    if (bwt_idx > size_src) {
+    if (bwt_idx > size_before_bwt) {
         state->last_error = BZ3_ERR_MALFORMED_HEADER;
         return -1;
     }
 
     // Undo BWT
     memset(state->sais_array, 0, sizeof(s32) * BWT_BOUND(state->block_size));
-    memset(b2, 0, size_src);
-    if (libsais_unbwt(b1, b2, state->sais_array, size_src, NULL, bwt_idx) < 0) {
+    memset(b2, 0, size_before_bwt); // buffer b2, swap b1
+    if (libsais_unbwt(b1, b2, state->sais_array, size_before_bwt, NULL, bwt_idx) < 0) {
         state->last_error = BZ3_ERR_BWT;
         return -1;
     }
     swap(b1, b2);
 
+    s32 size_src = size_before_bwt;
+
     // Undo LZP
     if (model & 2) {
         size_src = lzp_decompress(b1, b2, lzp_size, bz3_bound(state->block_size), state->lzp_lut);
@@ -703,10 +770,18 @@ BZIP3_API s32 bz3_decode_block(struct bz3_state * state, u8 * buffer, s32 data_s
             state->last_error = BZ3_ERR_CRC;
             return -1;
         }
+        // SAFETY(sewer): An attacker formed bzip3 data which decompresses as valid lzp.
+        // The headers above were set to ones that pass validation (size within bounds), but the 
+        // data itself tries to escape buffer_size. Don't allow it to.
+        if (size_src > buffer_size) {
+            state->last_error = BZ3_ERR_DATA_SIZE_TOO_SMALL;    
+            return -1;
+        }
         swap(b1, b2);
     }
 
-    if (model & 4) {
+    if (model & 4) { 
+        // SAFETY: mrled is capped at orig_size, which is in bounds.
         int err = mrled(b1, b2, orig_size, size_src);
         if (err) {
             state->last_error = BZ3_ERR_CRC;
@@ -748,6 +823,7 @@ typedef struct {
 typedef struct {
     struct bz3_state * state;
     u8 * buffer;
+    size_t buffer_size;
     s32 size;
     s32 orig_size;
 } decode_thread_msg;
@@ -761,7 +837,7 @@ static void * bz3_init_encode_thread(void * _msg) {
 
 static void * bz3_init_decode_thread(void * _msg) {
     decode_thread_msg * msg = _msg;
-    bz3_decode_block(msg->state, msg->buffer, msg->size, msg->orig_size);
+    bz3_decode_block(msg->state, msg->buffer, msg->buffer_size, msg->size, msg->orig_size);
     pthread_exit(NULL);
     return NULL;  // unreachable
 }
@@ -779,12 +855,13 @@ BZIP3_API void bz3_encode_blocks(struct bz3_state * states[], u8 * buffers[], s3
     for (s32 i = 0; i < n; i++) sizes[i] = messages[i].size;
 }
 
-BZIP3_API void bz3_decode_blocks(struct bz3_state * states[], u8 * buffers[], s32 sizes[], s32 orig_sizes[], s32 n) {
+BZIP3_API void bz3_decode_blocks(struct bz3_state * states[], u8 * buffers[], size_t buffer_sizes[], s32 sizes[], s32 orig_sizes[], s32 n) {
     decode_thread_msg messages[n];
     pthread_t threads[n];
     for (s32 i = 0; i < n; i++) {
         messages[i].state = states[i];
         messages[i].buffer = buffers[i];
+        messages[i].buffer_size = buffer_sizes[i];
         messages[i].size = sizes[i];
         messages[i].orig_size = orig_sizes[i];
         pthread_create(&threads[i], NULL, bz3_init_decode_thread, &messages[i]);
@@ -868,7 +945,8 @@ BZIP3_API int bz3_decompress(const uint8_t * in, uint8_t * out, size_t in_size,
     struct bz3_state * state = bz3_new(block_size);
     if (!state) return BZ3_ERR_INIT;
 
-    u8 * compression_buf = malloc(bz3_bound(block_size));
+    size_t compression_buf_size = bz3_bound(block_size);
+    u8 * compression_buf = malloc(compression_buf_size);
     if (!compression_buf) {
         bz3_free(state);
         return BZ3_ERR_INIT;
@@ -899,7 +977,7 @@ BZIP3_API int bz3_decompress(const uint8_t * in, uint8_t * out, size_t in_size,
             return BZ3_ERR_DATA_TOO_BIG;
         }
         memcpy(compression_buf, in + 8, size);
-        bz3_decode_block(state, compression_buf, size, orig_size);
+        bz3_decode_block(state, compression_buf, compression_buf_size, size, orig_size);
         if (bz3_last_error(state) != BZ3_OK) {
             s8 last_error = state->last_error;
             bz3_free(state);
@@ -915,3 +993,61 @@ BZIP3_API int bz3_decompress(const uint8_t * in, uint8_t * out, size_t in_size,
     bz3_free(state);
     return BZ3_OK;
 }
+
+BZIP3_API size_t bz3_min_memory_needed(int32_t block_size) {
+    // Block sizes below 65 KiB or above 511 MiB are rejected with 0.
+    if (!(block_size >= KiB(65) && block_size <= MiB(511))) return 0;
+
+    // The terms below are based on bz3_new; each one is kept in its own
+    // variable so the breakdown stays easy to audit.
+
+    // Core state structure.
+    const size_t core_bytes = sizeof(struct bz3_state);
+
+    // cm_state.
+    const size_t cm_bytes = sizeof(state);
+
+    // Swap buffer (swap_buffer); needs to handle the expanded size.
+    const size_t swap_bytes = bz3_bound(block_size);
+
+    // SAIS array: BWT_BOUND(block_size) entries of int32_t.
+    const size_t sais_bytes = BWT_BOUND(block_size) * sizeof(int32_t);
+
+    // LZP lookup table (lzp_lut): 1 << LZP_DICTIONARY entries of int32_t.
+    const size_t lut_bytes = (1 << LZP_DICTIONARY) * sizeof(int32_t);
+
+    return core_bytes + cm_bytes + swap_bytes + sais_bytes + lut_bytes;
+}
+
+
+BZIP3_API int bz3_orig_size_sufficient_for_decode(const u8 * block, size_t block_size, s32 orig_size) {
+    // Returns 1 when a buffer of `orig_size` bytes can hold every size announced by the block
+    // header, 0 when it cannot, and -1 when `block_size` is too short to read the header.
+    // Need at least 9 bytes for the initial header (4 bytes CRC + 4 bytes BWT index + 1 byte model)
+    if (block_size < 9) {
+        return -1;
+    }
+
+    s32 bwt_idx = read_neutral_s32(block + 4);
+    if (bwt_idx == -1) {
+        // Uncompressed literals: the original size is always sufficient.
+        return 1;
+    }
+
+    s8 model = block[8];
+    s32 lzp_size = -1, rle_size = -1;
+    size_t header_size = 9;  // Offset of the first optional size field (right after the model byte)
+
+    // Each optional size field (LZP: mask 2, RLE: mask 4) occupies 4 bytes. Test the flag bits
+    // rather than multiplying the masked value, which would demand 8 and 16 bytes respectively.
+    size_t needed_header_size = 9 + ((model & 2) ? 4 : 0) + ((model & 4) ? 4 : 0);
+    if (block_size < needed_header_size) {
+        return -1;
+    }
+
+    if (model & 2) {
+        lzp_size = read_neutral_s32(block + header_size);
+        header_size += 4;
+    }
+    if (model & 4) rle_size = read_neutral_s32(block + header_size);
+    return bz3_check_buffer_size((size_t)orig_size, lzp_size, rle_size, orig_size);
+}
diff --git a/src/main.c b/src/main.c
index 9a5f7d5..a449f8a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -229,7 +229,8 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
             return 1;
         }
 
-        u8 * buffer = malloc(bz3_bound(block_size));
+        size_t buffer_size = bz3_bound(block_size);
+        u8 * buffer = malloc(buffer_size);
 
         if (!buffer) {
             fprintf(stderr, "Failed to allocate memory.\n");
@@ -272,7 +273,7 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
                 }
                 xread_noeof(buffer, 1, new_size, input_des);
                 bytes_read += 8 + new_size;
-                if (bz3_decode_block(state, buffer, new_size, old_size) == -1) {
+                if (bz3_decode_block(state, buffer, buffer_size, new_size, old_size) == -1) {
                     fprintf(stderr, "Failed to decode a block: %s\n", bz3_strerror(state));
                     return 1;
                 }
@@ -294,7 +295,7 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
                 }
                 xread_noeof(buffer, 1, new_size, input_des);
                 bytes_read += 8 + new_size;
-                if (bz3_decode_block(state, buffer, new_size, old_size) == -1) {
+                if (bz3_decode_block(state, buffer, buffer_size, new_size, old_size) == -1) {
                     fprintf(stderr, "Writing invalid block: %s\n", bz3_strerror(state));
                 }
                 xwrite(buffer, old_size, 1, output_des);
@@ -315,7 +316,7 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
                 xread_noeof(buffer, 1, new_size, input_des);
                 bytes_read += 8 + new_size;
                 bytes_written += old_size;
-                if (bz3_decode_block(state, buffer, new_size, old_size) == -1) {
+                if (bz3_decode_block(state, buffer, buffer_size, new_size, old_size) == -1) {
                     fprintf(stderr, "Failed to decode a block: %s\n", bz3_strerror(state));
                     return 1;
                 }
@@ -335,6 +336,7 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
         struct bz3_state * states[workers];
         u8 * buffers[workers];
         s32 sizes[workers];
+        size_t buffer_sizes[workers];
         s32 old_sizes[workers];
         for (s32 i = 0; i < workers; i++) {
             states[i] = bz3_new(block_size);
@@ -342,7 +344,9 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
                 fprintf(stderr, "Failed to create a block encoder state.\n");
                 return 1;
             }
-            buffers[i] = malloc(block_size + block_size / 50 + 32);
+            size_t buffer_size = bz3_bound(block_size);
+            buffer_sizes[i] = buffer_size;
+            buffers[i] = malloc(buffer_size);
             if (!buffers[i]) {
                 fprintf(stderr, "Failed to allocate memory.\n");
                 return 1;
@@ -393,7 +397,7 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
                     xread_noeof(buffers[i], 1, sizes[i], input_des);
                     bytes_read += 8 + sizes[i];
                 }
-                bz3_decode_blocks(states, buffers, sizes, old_sizes, i);
+                bz3_decode_blocks(states, buffers, buffer_sizes, sizes, old_sizes, i);
                 for (s32 j = 0; j < i; j++) {
                     if (bz3_last_error(states[j]) != BZ3_OK) {
                         fprintf(stderr, "Failed to decode data: %s\n", bz3_strerror(states[j]));
@@ -421,7 +425,7 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
                     xread_noeof(buffers[i], 1, sizes[i], input_des);
                     bytes_read += 8 + sizes[i];
                 }
-                bz3_decode_blocks(states, buffers, sizes, old_sizes, i);
+                bz3_decode_blocks(states, buffers, buffer_sizes, sizes, old_sizes, i);
                 for (s32 j = 0; j < i; j++) {
                     if (bz3_last_error(states[j]) != BZ3_OK) {
                         fprintf(stderr, "Writing invalid block: %s\n", bz3_strerror(states[j]));
@@ -449,7 +453,7 @@ static int process(FILE * input_des, FILE * output_des, int mode, int block_size
                     bytes_read += 8 + sizes[i];
                     bytes_written += old_sizes[i];
                 }
-                bz3_decode_blocks(states, buffers, sizes, old_sizes, i);
+                bz3_decode_blocks(states, buffers, buffer_sizes, sizes, old_sizes, i);
                 for (s32 j = 0; j < i; j++) {
                     if (bz3_last_error(states[j]) != BZ3_OK) {
                         fprintf(stderr, "Failed to decode data: %s\n", bz3_strerror(states[j]));
@@ -817,4 +821,4 @@ int main(int argc, char * argv[]) {
     }
 
     return r;
-}
+}
\ No newline at end of file