diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml
index da94fd2..94ca3eb 100644
--- a/.github/workflows/ccp-workflow.yml
+++ b/.github/workflows/ccp-workflow.yml
@@ -10,6 +10,7 @@ on:
 jobs:
   build:
     strategy:
+      fail-fast: false
       matrix:
         include: [
           { system: MacOS, runner: macos-latest },
@@ -29,6 +30,7 @@ jobs:
   build_windows:
     strategy:
+      fail-fast: false
       matrix:
         include: [
           { system: Windows, runner: windows-latest },
@@ -46,9 +48,11 @@ jobs:
   test:
     strategy:
+      fail-fast: false
       matrix:
         include: [
-          { system: MacOS, runner: macos-latest },
+          { system: MacOS-13, runner: macos-13 },
+          { system: MacOS-latest, runner: macos-latest },
           { system: Ubuntu-latest, runner: ubuntu-latest },
         ]
     name: ${{ matrix.system }} Test
@@ -67,6 +71,7 @@ jobs:
   test_windows:
     strategy:
+      fail-fast: false
       matrix:
         include: [
           { system: Windows, runner: windows-latest },
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index c2d527a..7d031b5 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -46,11 +46,11 @@ jobs:
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4

     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
+      uses: github/codeql-action/init@v3
       with:
         languages: ${{ matrix.language }}
         # If you wish to specify custom queries, you can do so here or in a config file.
@@ -64,7 +64,7 @@ jobs:
     # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@v2
+      uses: github/codeql-action/autobuild@v3

     # ℹī¸ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -77,6 +77,6 @@ jobs:
     #   ./location_of_script_within_repo/buildscript.sh

     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
+      uses: github/codeql-action/analyze@v3
       with:
         category: "/language:${{matrix.language}}"
diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h
index 401ad65..a9ee243 100644
--- a/src/apps/common/ojph_img_io.h
+++ b/src/apps/common/ojph_img_io.h
@@ -54,7 +54,7 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   // defined elsewhere
   class mem_fixed_allocator;
-  struct line_buf;
+  class line_buf;

   ////////////////////////////////////////////////////////////////////////////
   //
@@ -760,7 +760,7 @@ namespace ojph {
     const char* fname;
     bool is_signed;
     ui32 bit_depth, bytes_per_sample;
-    si32 lower_val, upper_val;
+    si64 lower_val, upper_val;
     ui32 width;
     ui8* buffer;
     ui32 buffer_size;
diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp
index e7c047a..144c837 100644
--- a/src/apps/ojph_compress/ojph_compress.cpp
+++ b/src/apps/ojph_compress/ojph_compress.cpp
@@ -592,20 +592,25 @@ int main(int argc, char * argv[]) {
     ".pfm files receive special treatment. Currently, lossy compression\n"
     "with these files is not supported, only lossless. When these files are\n"
     "used, the NLT segment marker is automatically inserted into the\n"
-    "codestream. For these files the following arguments can be useful\n"
-    " -signed   a comma - separated list of true or false parameters, one\n"
+    "codestream when needed, as explained shortly. The following arguments\n"
+    "can be useful for this file type.\n"
+    " -signed   a comma-separated list of true or false parameters, one\n"
     "           for each component; for example: true,false,false.\n"
-    "           The sign only affects how values are treated; for negative\n"
-    "           values the standard requires a special non-linear\n"
-    "           transformation. When signed is false, no transformation\n"
-    "           is employed, as we assume all values are 0 or positive.\n"
-    "           When signed is true, the aforementioned transformation is\n"
-    "           employed on negative values only.\n"
+    "           If you are sure that all sample values are positive or 0,\n"
+    "           set the corresponding entry to false; otherwise set it to\n"
+    "           true.\n"
+    "           When a component entry is set to true, an NLT segment\n"
+    "           marker segment is inserted into the codestream.\n"
+    "           The NLT segment specifies a non-linear transform that\n"
+    "           changes only negative values, producing better coding\n"
+    "           efficiency.\n"
+    "           The NLT segment marker might be less supported in other\n"
+    "           encoders.\n"
     " -bit_depth a comma-separated list of bit depth values, one per \n"
     "           component; for example: 12,10,10.\n"
     "           Floating value numbers are treated as integers, and they\n"
     "           are shifted to the right, keeping only the specified\n"
-    "           number of bits. Note that a bit depth of 28 upwards is not\n"
+    "           number of bits. Up to 32 bits (which is the default) are\n"
     "           supported.\n"
     "\n";
@@ -768,10 +773,6 @@ int main(int argc, char * argv[]) {
       assert(num_comps == 1 || num_comps == 3);
       siz.set_num_components(num_comps);

-      if (bit_depth[0] == 0)
-        OJPH_ERROR(0x01000091,
-          "-bit_depth must be specified (this is temporary only).\n");
-
       if (bit_depth[0] != 0)            // one was set
         if (num_bit_depths < num_comps) // but if not enough, repeat
           for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c)
@@ -840,11 +841,8 @@ int main(int argc, char * argv[]) {
           nlt.set_type3_transformation(c, true);
       }
       else
-        OJPH_ERROR(0x01000093, "The support for pfm image is not "
-          "complete; I need to figure how to modify the interface "
-          "to better support the exchange of floating point data. "
" - "Feeding float point data is not supported yet, unless it " - "is for lossless compression."); + OJPH_ERROR(0x01000093, "We currently support lossless only for " + "pfm images; this may change in the future."); codestream.set_planar(false); if (profile_string[0] != '\0') diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index d812025..05ce4df 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -329,9 +329,9 @@ namespace ojph { return; if (bytes_per_sample == 1) - temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data(num_comps * (size_t)width, 0); else - temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data(num_comps * (size_t)width, 0); } ///////////////////////////////////////////////////////////////////////////// @@ -408,7 +408,7 @@ namespace ojph { "unable to open file %s for writing", filename); fprintf(fh, "P5\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } else @@ -435,7 +435,7 @@ namespace ojph { fprintf(fh, "P6\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); if (result == 0) OJPH_ERROR(0x03000027, "error writing to file %s", filename); - buffer_size = width * num_components * bytes_per_sample; + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } fname = filename; @@ -935,12 +935,12 @@ namespace ojph { // the first time trying to access this line if (PLANARCONFIG_SEPARATE == planar_configuration && 0 == comp_num ) { - for (unsigned short color = 0; color < num_comps; color++) + for (ui32 color = 0; color < num_comps; color++) { if (bytes_per_sample == 1) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint8, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; uint8_t* line_buffer_of_interleaved_components = (uint8_t*)line_buffer; @@ -953,7 +953,7 @@ namespace ojph { else if (bytes_per_sample == 2) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint16, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; ui16* line_buffer_of_interleaved_components = (ui16*)line_buffer; for (ui32 i = 0; i < width; i++, x += num_comps) @@ -1070,7 +1070,7 @@ namespace ojph { OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename); } - buffer_size = width * num_components * bytes_per_sample; + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; cur_line = 0; @@ -1146,7 +1146,7 @@ namespace ojph { bytes_per_sample = 2; } samples_per_line = num_components * width; - bytes_per_line = bytes_per_sample * samples_per_line; + bytes_per_line = bytes_per_sample * (size_t)samples_per_line; } @@ -1482,7 +1482,7 @@ namespace ojph { cur_line = 0; bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; } @@ -1618,15 +1618,15 @@ namespace ojph { this->width = width; if (is_signed) { - upper_val = (1 << (bit_depth - 1)); - lower_val = -(1 << (bit_depth - 1)); + upper_val = (1LL << (bit_depth - 1)); + lower_val = -(1LL << (bit_depth - 1)); } else { - upper_val = 1 << bit_depth; - lower_val = 0; + upper_val = 1LL << bit_depth; + lower_val = 0LL; } bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width 
+     buffer_size = (size_t)width * bytes_per_sample;
      buffer = (ui8*)malloc(buffer_size);
    }
@@ -1637,63 +1637,127 @@ namespace ojph {
      assert(fh);
      assert(comp_num == 0);

-     if (bytes_per_sample > 3)
+     if (is_signed)
      {
-       const si32* sp = line->i32;
-       ui32* dp = (ui32*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       if (bytes_per_sample > 3)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp++ = (ui32)val;
+         const si32* sp = line->i32;
+         si32* dp = (si32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (si32)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000151, "unable to write to file %s", fname);
        }
-       if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-         OJPH_ERROR(0x03000151, "unable to write to file %s", fname);
-     }
-     else if (bytes_per_sample > 2)
-     {
-       const si32* sp = line->i32;
-       ui32* dp = (ui32*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       else if (bytes_per_sample > 2)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp = (ui32)val;
-         // this only works for little endian architecture
-         dp = (ui32*)((ui8*)dp + 3);
+         const si32* sp = line->i32;
+         si32* dp = (si32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp = (si32)val;
+           // this only works for little endian architecture
+           dp = (si32*)((ui8*)dp + 3);
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000152, "unable to write to file %s", fname);
        }
-       if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-         OJPH_ERROR(0x03000152, "unable to write to file %s", fname);
-     }
-     else if (bytes_per_sample > 1)
-     {
-       const si32* sp = line->i32;
-       ui16* dp = (ui16*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       else if (bytes_per_sample > 1)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp++ = (ui16)val;
+         const si32* sp = line->i32;
+         si16* dp = (si16*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (si16)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000153, "unable to write to file %s", fname);
+       }
+       else
+       {
+         const si32* sp = line->i32;
+         si8* dp = (si8*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (si8)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000154, "unable to write to file %s", fname);
+       }
      }
-      if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-        OJPH_ERROR(0x03000153, "unable to write to file %s", fname);
-     }
-     else
+     else
      {
-       const si32* sp = line->i32;
-       ui8* dp = (ui8*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       if (bytes_per_sample > 3)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp++ = (ui8)val;
+         const ui32* sp = (ui32*)line->i32;
+         ui32* dp = (ui32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (ui32)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000155, "unable to write to file %s", fname);
+       }
+       else if (bytes_per_sample > 2)
+       {
+         const ui32* sp = (ui32*)line->i32;
+         ui32* dp = (ui32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp = (ui32)val;
+           // this only works for little endian architecture
+           dp = (ui32*)((ui8*)dp + 3);
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000156, "unable to write to file %s", fname);
+       }
+       else if (bytes_per_sample > 1)
+       {
+         const ui32* sp = (ui32*)line->i32;
+         ui16* dp = (ui16*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (ui16)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000157, "unable to write to file %s", fname);
+       }
+       else
+       {
+         const ui32* sp = (ui32*)line->i32;
+         ui8* dp = (ui8*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (ui8)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000158, "unable to write to file %s", fname);
+       }
-       if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-         OJPH_ERROR(0x03000154, "unable to write to file %s", fname);
      }

      return width;
@@ -1940,11 +2004,11 @@ namespace ojph {

      // allocate line_buffer_16bit_samples to hold a line of image data in memory
      line_buffer_16bit_samples =
-       (ui16*) malloc(width * num_comps * sizeof(ui16));
+       (ui16*) malloc((size_t)width * num_comps * sizeof(ui16));
      if (NULL == line_buffer_16bit_samples)
        OJPH_ERROR(0x03000179, "Unable to allocate %d bytes for "
          "line_buffer_16bit_samples[] for file %s",
-         width * num_comps * sizeof(ui16), filename);
+         (size_t)width * num_comps * sizeof(ui16), filename);

      cur_line = 0;
diff --git a/src/core/codestream/ojph_bitbuffer_write.h b/src/core/codestream/ojph_bitbuffer_write.h
index d5b6bca..ecb9dd2 100644
--- a/src/core/codestream/ojph_bitbuffer_write.h
+++ b/src/core/codestream/ojph_bitbuffer_write.h
@@ -109,33 +109,25 @@ namespace ojph {
       }
     }

+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void bb_put_zeros(bit_write_buf *bbp, int num_zeros,
+                      mem_elastic_allocator *elastic,
+                      coded_lists*& cur_coded_list, ui32& ph_bytes)
+    {
+      for (int i = num_zeros; i > 0; --i)
+        bb_put_bit(bbp, 0, elastic, cur_coded_list, ph_bytes);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     static inline
     void bb_put_bits(bit_write_buf *bbp, ui32 data, int num_bits,
                      mem_elastic_allocator *elastic,
                      coded_lists*& cur_coded_list, ui32& ph_bytes)
     {
-//      assert(num_bits <= 32);
-      for (int i = num_bits - 1; i >= 0; --i)
+      assert(num_bits <= 32);
+      for (int i = num_bits - 1; i >= 0; --i)
        bb_put_bit(bbp, data >> i, elastic, cur_coded_list, ph_bytes);
-//      while (num_bits) {
-//        int tx_bits = num_bits < bbp->avail_bits ? num_bits : bbp->avail_bits;
-//        bbp->tmp |= (data >> (num_bits - tx_bits)) & ((1 << tx_bits) - 1);
-//        bbp->avail_bits -= tx_bits;
-//        if (bbp->avail_bits <= 0)
-//        {
-//          bbp->avail_bits = 8 - (bbp->tmp != 0xFF ? 0 : 1);
-//          bbp->buf[bbp->buf_size - bbp->avail_size] = (ui8)(bbp->tmp & 0xFF);
-//          bbp->tmp = 0;
-//          --bbp->avail_size;
-//          if (bbp->avail_size == 0)
-//          {
-//            bb_expand_buf(bbp, elastic, cur_coded_list->next_list);
-//            cur_coded_list = cur_coded_list->next_list;
-//            ph_bytes += bit_buffer::needed;
-//          }
-//        }
-//      }
     }

     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp
index 0915951..351284b 100644
--- a/src/core/codestream/ojph_codeblock.cpp
+++ b/src/core/codestream/ojph_codeblock.cpp
@@ -45,6 +45,7 @@
 #include "ojph_codestream_local.h"
 #include "ojph_codeblock.h"
 #include "ojph_subband.h"
+#include "ojph_resolution.h"

 namespace ojph {
@@ -52,7 +53,7 @@ namespace ojph {
   {

     //////////////////////////////////////////////////////////////////////////
-    void codeblock::pre_alloc(codestream *codestream,
+    void codeblock::pre_alloc(codestream *codestream, ui32 comp_num,
                               const size& nominal)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
@@ -60,7 +61,14 @@ namespace ojph {
       assert(byte_alignment / sizeof(ui32) > 1);
       const ui32 f = byte_alignment / sizeof(ui32) - 1;
       ui32 stride = (nominal.w + f) & ~f; // a multiple of 8
-      allocator->pre_alloc_data<ui32>(nominal.h * stride, 0);
+
+      const param_siz* sz = codestream->get_siz();
+      const param_cod* cd = codestream->get_cod(comp_num);
+      ui32 precision = cd->propose_implementation_precision(sz);
+      if (precision <= 32)
+        allocator->pre_alloc_data<ui32>(nominal.h * (size_t)stride, 0);
+      else
+        allocator->pre_alloc_data<ui64>(nominal.h * (size_t)stride, 0);
     }

     //////////////////////////////////////////////////////////////////////////
@@ -75,7 +83,19 @@ namespace ojph {
       const ui32 f = byte_alignment / sizeof(ui32) - 1;
       this->stride = (nominal.w + f) & ~f; // a multiple of 8
       this->buf_size = this->stride * nominal.h;
-      this->buf = allocator->post_alloc_data<ui32>(this->buf_size, 0);
+
+      ui32 comp_num = parent->get_parent()->get_comp_num();
+      const param_siz* sz = codestream->get_siz();
+      const param_cod* cd = codestream->get_cod(comp_num);
+      ui32 bit_depth = cd->propose_implementation_precision(sz);
+      if (bit_depth <= 32) {
+        precision = BUF32;
+        this->buf32 = allocator->post_alloc_data<ui32>(this->buf_size, 0);
+      }
+      else {
+        precision = BUF64;
+        this->buf64 = allocator->post_alloc_data<ui64>(this->buf_size, 0);
+      }

       this->nominal_size = nominal;
       this->cb_size = cb_size;
@@ -85,8 +105,8 @@ namespace ojph {
       this->delta = parent->get_delta();
       this->delta_inv = 1.0f / this->delta;
       this->K_max = K_max;
-      for (int i = 0; i < 8; ++i)
-        this->max_val[i] = 0;
+      for (int i = 0; i < 4; ++i)
+        this->max_val64[i] = 0;
       ojph::param_cod cod = codestream->access_cod();
       this->reversible = cod.is_reversible();
       this->resilient = codestream->is_resilient();
@@ -100,28 +120,61 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codeblock::push(line_buf *line)
     {
-      // convert to sign and magnitude and keep max_val
-      const si32 *sp = line->i32 + line_offset;
-      ui32 *dp = buf + cur_line * stride;
-      this->codeblock_functions.tx_to_cb(sp, dp, K_max, delta_inv, cb_size.w,
-        max_val);
-      ++cur_line;
+      // convert to sign and magnitude and keep max_val
+      if (precision == BUF32)
+      {
+        assert(line->flags & line_buf::LFT_32BIT);
+        const si32 *sp = line->i32 + line_offset;
+        ui32 *dp = buf32 + cur_line * stride;
+        this->codeblock_functions.tx_to_cb32(sp, dp, K_max, delta_inv,
+          cb_size.w, max_val32);
+        ++cur_line;
+      }
+      else
+      {
+        assert(precision == BUF64);
+        assert(line->flags & line_buf::LFT_64BIT);
+        const si64 *sp = line->i64 + line_offset;
+        ui64 *dp = buf64 + cur_line * stride;
+        this->codeblock_functions.tx_to_cb64(sp, dp, K_max, delta_inv,
+          cb_size.w, max_val64);
+        ++cur_line;
+      }
     }

     //////////////////////////////////////////////////////////////////////////
     void codeblock::encode(mem_elastic_allocator *elastic)
     {
-      ui32 mv = this->codeblock_functions.find_max_val(max_val);
-      if (mv >= 1u<<(31 - K_max))
+      if (precision == BUF32)
       {
-        coded_cb->missing_msbs = K_max - 1;
-        assert(coded_cb->missing_msbs > 0);
-        assert(coded_cb->missing_msbs < K_max);
-        coded_cb->num_passes = 1;
-
-        this->codeblock_functions.encode_cb(buf, K_max-1, 1,
-          cb_size.w, cb_size.h, stride, coded_cb->pass_length,
-          elastic, coded_cb->next_coded);
+        ui32 mv = this->codeblock_functions.find_max_val32(max_val32);
+        if (mv >= 1u << (31 - K_max))
+        {
+          coded_cb->missing_msbs = K_max - 1;
+          assert(coded_cb->missing_msbs > 0);
+          assert(coded_cb->missing_msbs < K_max);
+          coded_cb->num_passes = 1;
+
+          this->codeblock_functions.encode_cb32(buf32, K_max-1, 1,
+            cb_size.w, cb_size.h, stride, coded_cb->pass_length,
+            elastic, coded_cb->next_coded);
+        }
+      }
+      else
+      {
+        assert(precision == BUF64);
+        ui64 mv = this->codeblock_functions.find_max_val64(max_val64);
+        if (mv >= 1ULL << (63 - K_max))
+        {
+          coded_cb->missing_msbs = K_max - 1;
+          assert(coded_cb->missing_msbs > 0);
+          assert(coded_cb->missing_msbs < K_max);
+          coded_cb->num_passes = 1;
+
+          this->codeblock_functions.encode_cb64(buf64, K_max-1, 1,
+            cb_size.w, cb_size.h, stride, coded_cb->pass_length,
+            elastic, coded_cb->next_coded);
+        }
       }
     }

@@ -132,8 +185,8 @@ namespace ojph {
       this->cb_size = cb_size;
       this->coded_cb = coded_cb;
       this->cur_line = 0;
-      for (int i = 0; i < 8; ++i)
-        this->max_val[i] = 0;
+      for (int i = 0; i < 4; ++i)
+        this->max_val64[i] = 0;
       this->zero_block = false;
     }

@@ -143,11 +196,24 @@ namespace ojph {
       if (coded_cb->pass_length[0] > 0 && coded_cb->num_passes > 0 &&
          coded_cb->next_coded != NULL)
       {
-        bool result = this->codeblock_functions.decode_cb(
+        bool result;
+        if (precision == BUF32)
+        {
+          result = this->codeblock_functions.decode_cb32(
+            coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size,
+            buf32, coded_cb->missing_msbs, coded_cb->num_passes,
+            coded_cb->pass_length[0], coded_cb->pass_length[1],
+            cb_size.w, cb_size.h, stride, stripe_causal);
+        }
+        else
+        {
+          assert(precision == BUF64);
+          result = this->codeblock_functions.decode_cb64(
             coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size,
-            buf, coded_cb->missing_msbs, coded_cb->num_passes,
+            buf64, coded_cb->missing_msbs, coded_cb->num_passes,
             coded_cb->pass_length[0], coded_cb->pass_length[1],
             cb_size.w, cb_size.h, stride, stripe_causal);
+        }

         if (result == false)
         {
@@ -167,15 +233,35 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codeblock::pull_line(line_buf *line)
     {
-      si32 *dp = line->i32 + line_offset;
-      if (!zero_block)
+      //convert to sign and magnitude
+      if (precision == BUF32)
       {
-        //convert to sign and magnitude
-        const ui32 *sp = buf + cur_line * stride;
-        this->codeblock_functions.tx_from_cb(sp, dp, K_max, delta, cb_size.w);
+        assert(line->flags & line_buf::LFT_32BIT);
+        si32 *dp = line->i32 + line_offset;
+        if (!zero_block)
+        {
+          const ui32 *sp = buf32 + cur_line * stride;
+          this->codeblock_functions.tx_from_cb32(sp, dp, K_max, delta,
+            cb_size.w);
+        }
+        else
+          this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(ui32));
       }
       else
-        this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
+      {
+        assert(precision == BUF64);
+        assert(line->flags & line_buf::LFT_64BIT);
+        si64 *dp = line->i64 + line_offset;
+        if (!zero_block)
+        {
+          const ui64 *sp = buf64 + cur_line * stride;
+          this->codeblock_functions.tx_from_cb64(sp, dp, K_max, delta,
+            cb_size.w);
+        }
+        else
+          this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
+      }
+
       ++cur_line;
       assert(cur_line <= cb_size.h);
     }
diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h
index 2f7d8e7..fde8e6a 100644
--- a/src/core/codestream/ojph_codeblock.h
+++ b/src/core/codestream/ojph_codeblock.h
@@ -48,7 +48,7 @@ namespace ojph {

   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_elastic_allocator;
   class codestream;
   struct coded_lists;
@@ -65,8 +65,14 @@ namespace ojph {
     class codeblock
     {
       friend struct precinct;
+      enum : ui32 {
+        BUF32 = 4,
+        BUF64 = 8,
+      };
+
     public:
-      static void pre_alloc(codestream *codestream, const size& nominal);
+      static void pre_alloc(codestream *codestream, ui32 comp_num,
+                            const size& nominal);
       void finalize_alloc(codestream *codestream, subband* parent,
                           const size& nominal, const size& cb_size,
                           coded_cb_header* coded_cb,
@@ -79,7 +85,11 @@ namespace ojph {
       void pull_line(line_buf *line);

     private:
-      ui32* buf;
+      ui32 precision;
+      union {
+        ui32* buf32;
+        ui64* buf64;
+      };
       size nominal_size;
       size cb_size;
       ui32 stride;
@@ -93,7 +103,10 @@ namespace ojph {
       bool resilient;
       bool stripe_causal;
       bool zero_block; // true when the decoded block is all zero
-      ui32 max_val[8]; // supports up to 256 bits
+      union {
+        ui32 max_val32[8]; // supports up to 256 bits
+        ui64 max_val64[4]; // supports up to 256 bits
+      };
       coded_cb_header* coded_cb;
       codeblock_fun codeblock_functions;
     };
diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp
index 51253c1..08d8d73 100644
--- a/src/core/codestream/ojph_codeblock_fun.cpp
+++ b/src/core/codestream/ojph_codeblock_fun.cpp
@@ -63,72 +63,107 @@ namespace ojph {
     void wasm_mem_clear(void* addr, size_t count);

     //////////////////////////////////////////////////////////////////////////
-    ui32 gen_find_max_val(ui32* address);
-    ui32 sse2_find_max_val(ui32* address);
-    ui32 avx2_find_max_val(ui32* address);
-    ui32 wasm_find_max_val(ui32* address);
+    ui32 gen_find_max_val32(ui32* address);
+    ui32 sse2_find_max_val32(ui32* address);
+    ui32 avx2_find_max_val32(ui32* address);
+    ui32 wasm_find_max_val32(ui32* address);
+    ui64 gen_find_max_val64(ui64* address);
+    ui64 sse2_find_max_val64(ui64* address);
+    ui64 avx2_find_max_val64(ui64* address);
+    ui64 wasm_find_max_val64(ui64* address);
+
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count, ui32* max_val);
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count, ui32* max_val);
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
+    void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, ui32* max_val);
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, ui32* max_val);
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+
+    void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, ui64* max_val);
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
+    void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
+    void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);

     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count);
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count);
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
+    void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count);
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count);
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count);
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);

     void codeblock_fun::init(bool reversible) {
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
       // Default path, no acceleration. We may change this later
-      decode_cb = ojph_decode_codeblock;
-      find_max_val = gen_find_max_val;
+      decode_cb32 = ojph_decode_codeblock32;
+      find_max_val32 = gen_find_max_val32;
       mem_clear = gen_mem_clear;
       if (reversible) {
-        tx_to_cb = gen_rev_tx_to_cb;
-        tx_from_cb = gen_rev_tx_from_cb;
+        tx_to_cb32 = gen_rev_tx_to_cb32;
+        tx_from_cb32 = gen_rev_tx_from_cb32;
       }
       else
       {
-        tx_to_cb = gen_irv_tx_to_cb;
-        tx_from_cb = gen_irv_tx_from_cb;
+        tx_to_cb32 = gen_irv_tx_to_cb32;
+        tx_from_cb32 = gen_irv_tx_from_cb32;
       }
-      encode_cb = ojph_encode_codeblock;
+      encode_cb32 = ojph_encode_codeblock32;
+
+      decode_cb64 = ojph_decode_codeblock64;
+      find_max_val64 = gen_find_max_val64;
+      if (reversible) {
+        tx_to_cb64 = gen_rev_tx_to_cb64;
+        tx_from_cb64 = gen_rev_tx_from_cb64;
+      }
+      else
+      {
+        tx_to_cb64 = NULL;
+        tx_from_cb64 = NULL;
+      }
+      encode_cb64 = ojph_encode_codeblock64;

 #ifndef OJPH_DISABLE_SIMD

 #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))

-      // Accelerated functions for INTEL/AMD CPUs
+      // Accelerated functions for INTEL/AMD CPUs
 #ifndef OJPH_DISABLE_SSE
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
         mem_clear = sse_mem_clear;
@@ -136,21 +171,31 @@ namespace ojph {
 #ifndef OJPH_DISABLE_SSE2
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2)
       {
-        find_max_val = sse2_find_max_val;
+        find_max_val32 = sse2_find_max_val32;
         if (reversible) {
-          tx_to_cb = sse2_rev_tx_to_cb;
-          tx_from_cb = sse2_rev_tx_from_cb;
+          tx_to_cb32 = sse2_rev_tx_to_cb32;
+          tx_from_cb32 = sse2_rev_tx_from_cb32;
         }
         else
         {
-          tx_to_cb = sse2_irv_tx_to_cb;
-          tx_from_cb = sse2_irv_tx_from_cb;
+          tx_to_cb32 = sse2_irv_tx_to_cb32;
+          tx_from_cb32 = sse2_irv_tx_from_cb32;
+        }
+        find_max_val64 = sse2_find_max_val64;
+        if (reversible) {
+          tx_to_cb64 = sse2_rev_tx_to_cb64;
+          tx_from_cb64 = sse2_rev_tx_from_cb64;
+        }
+        else
+        {
+          tx_to_cb64 = NULL;
+          tx_from_cb64 = NULL;
         }
       }
 #endif // !OJPH_DISABLE_SSE2

 #ifndef OJPH_DISABLE_SSSE3
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSSE3)
-        decode_cb = ojph_decode_codeblock_ssse3;
+        decode_cb32 = ojph_decode_codeblock_ssse3;
 #endif // !OJPH_DISABLE_SSSE3

 #ifndef OJPH_DISABLE_AVX
@@ -160,23 +205,39 @@ namespace ojph {
 #ifndef OJPH_DISABLE_AVX2
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2)
       {
-        find_max_val = avx2_find_max_val;
+        decode_cb32 = ojph_decode_codeblock_avx2;
+        find_max_val32 = avx2_find_max_val32;
         if (reversible) {
-          tx_to_cb = avx2_rev_tx_to_cb;
-          tx_from_cb = avx2_rev_tx_from_cb;
+          tx_to_cb32 = avx2_rev_tx_to_cb32;
+          tx_from_cb32 = avx2_rev_tx_from_cb32;
         }
         else
         {
-          tx_to_cb = avx2_irv_tx_to_cb;
-          tx_from_cb = avx2_irv_tx_from_cb;
+          tx_to_cb32 = avx2_irv_tx_to_cb32;
+          tx_from_cb32 = avx2_irv_tx_from_cb32;
+        }
+        encode_cb32 = ojph_encode_codeblock_avx2;
+        bool result = initialize_block_encoder_tables_avx2();
+        assert(result); ojph_unused(result);
+
+        find_max_val64 = avx2_find_max_val64;
+        if (reversible) {
+          tx_to_cb64 = avx2_rev_tx_to_cb64;
+          tx_from_cb64 = avx2_rev_tx_from_cb64;
+        }
+        else
+        {
+          tx_to_cb64 = NULL;
+          tx_from_cb64 = NULL;
         }
-        encode_cb = ojph_encode_codeblock_avx2;
-        decode_cb = ojph_decode_codeblock_avx2;
       }
 #endif // !OJPH_DISABLE_AVX2

 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512)
-        encode_cb = ojph_encode_codeblock_avx512;
+      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) {
+        encode_cb32 = ojph_encode_codeblock_avx512;
+        bool result = initialize_block_encoder_tables_avx512();
+        assert(result); ojph_unused(result);
+      }
 #endif // !OJPH_DISABLE_AVX512

 #elif defined(OJPH_ARCH_ARM)
@@ -188,18 +249,31 @@ namespace ojph {
 #else // OJPH_ENABLE_WASM_SIMD

       // Accelerated functions for WASM SIMD.
-      decode_cb = ojph_decode_codeblock_wasm;
-      find_max_val = wasm_find_max_val;
+      decode_cb32 = ojph_decode_codeblock_wasm;
+      find_max_val32 = wasm_find_max_val32;
       mem_clear = wasm_mem_clear;
       if (reversible) {
-        tx_to_cb = wasm_rev_tx_to_cb;
-        tx_from_cb = wasm_rev_tx_from_cb;
+        tx_to_cb32 = wasm_rev_tx_to_cb32;
+        tx_from_cb32 = wasm_rev_tx_from_cb32;
       }
       else
       {
-        tx_to_cb = wasm_irv_tx_to_cb;
-        tx_from_cb = wasm_irv_tx_from_cb;
+        tx_to_cb32 = wasm_irv_tx_to_cb32;
+        tx_from_cb32 = wasm_irv_tx_from_cb32;
+      }
+      encode_cb32 = ojph_encode_codeblock32;
+
+      decode_cb64 = ojph_decode_codeblock64;
+      find_max_val64 = wasm_find_max_val64;
+      if (reversible) {
+        tx_to_cb64 = wasm_rev_tx_to_cb64;
+        tx_from_cb64 = wasm_rev_tx_from_cb64;
+      }
+      else
+      {
+        tx_to_cb64 = NULL;
+        tx_from_cb64 = NULL;
       }
-      encode_cb = ojph_encode_codeblock;
+      encode_cb64 = ojph_encode_codeblock64;

 #endif // !OJPH_ENABLE_WASM_SIMD
diff --git a/src/core/codestream/ojph_codeblock_fun.h b/src/core/codestream/ojph_codeblock_fun.h
index 679b2d3..67fbc2b 100644
--- a/src/core/codestream/ojph_codeblock_fun.h
+++ b/src/core/codestream/ojph_codeblock_fun.h
@@ -51,23 +51,40 @@ namespace ojph {
     typedef void (*mem_clear_fun)(void* addr, size_t count);

     // define function signature for max value finding
-    typedef ui32 (*find_max_val_fun)(ui32* addr);
+    typedef ui32 (*find_max_val_fun32)(ui32* addr);
+
+    typedef ui64 (*find_max_val_fun64)(ui64* addr);

     // define line transfer function signature from subbands to codeblocks
-    typedef void (*tx_to_cb_fun)(const void *sp, ui32 *dp, ui32 K_max,
+    typedef void (*tx_to_cb_fun32)(const void *sp, ui32 *dp, ui32 K_max,
                                  float delta_inv, ui32 count, ui32* max_val);

+    typedef void (*tx_to_cb_fun64)(const void *sp, ui64 *dp, ui32 K_max,
+                                   float delta_inv, ui32 count, ui64* max_val);
+
     // define line transfer function signature from codeblock to subband
-    typedef void (*tx_from_cb_fun)(const ui32 *sp, void *dp, ui32 K_max,
+    typedef void (*tx_from_cb_fun32)(const ui32 *sp, void *dp, ui32 K_max,
+                                     float delta, ui32 count);
+
+    typedef void (*tx_from_cb_fun64)(const ui64 *sp, void *dp, ui32 K_max,
                                    float delta, ui32 count);

     // define the block decoder function signature
-    typedef bool (*cb_decoder_fun)(ui8* coded_data, ui32* decoded_data,
+    typedef bool (*cb_decoder_fun32)(ui8* coded_data, ui32* decoded_data,
+      ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
+      ui32 width, ui32 height, ui32 stride, bool stripe_causal);
+
+    typedef bool (*cb_decoder_fun64)(ui8* coded_data, ui64* decoded_data,
       ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
       ui32 width, ui32 height, ui32 stride, bool stripe_causal);

     // define the block encoder function signature
-    typedef void (*cb_encoder_fun)(ui32* buf, ui32 missing_msbs,
+    typedef void (*cb_encoder_fun32)(ui32* buf, ui32 missing_msbs,
+      ui32 num_passes, ui32 width, ui32 height, ui32 stride,
+      ui32* lengths, ojph::mem_elastic_allocator* elastic,
+      ojph::coded_lists*& coded);
+
+    typedef void (*cb_encoder_fun64)(ui64* buf, ui32 missing_msbs,
       ui32 num_passes, ui32 width, ui32 height, ui32 stride,
       ui32* lengths, ojph::mem_elastic_allocator* elastic,
       ojph::coded_lists*& coded);
@@ -81,19 +98,24 @@ namespace ojph {
       mem_clear_fun mem_clear;

       // a pointer to the max value finding function
-      find_max_val_fun find_max_val;
+      find_max_val_fun32 find_max_val32;
+      find_max_val_fun64 find_max_val64;
       // a pointer to function transferring samples from subbands to codeblocks
-      tx_to_cb_fun tx_to_cb;
+      tx_to_cb_fun32 tx_to_cb32;
+      tx_to_cb_fun64 tx_to_cb64;

       // a pointer to function transferring samples from codeblocks to subbands
-      tx_from_cb_fun tx_from_cb;
+      tx_from_cb_fun32 tx_from_cb32;
+      tx_from_cb_fun64 tx_from_cb64;

       // a pointer to the decoder function
-      cb_decoder_fun decode_cb;
+      cb_decoder_fun32 decode_cb32;
+      cb_decoder_fun64 decode_cb64;

       // a pointer to the encoder function
-      cb_encoder_fun encode_cb;
+      cb_encoder_fun32 encode_cb32;
+      cb_encoder_fun64 encode_cb64;
     };
   }
diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp
index 04a81ed..a8e5138 100644
--- a/src/core/codestream/ojph_codestream_avx2.cpp
+++ b/src/core/codestream/ojph_codestream_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/

+#include <climits>
 #include <immintrin.h>

 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
   namespace local {

     //////////////////////////////////////////////////////////////////////////
-    ui32 avx2_find_max_val(ui32* address)
+    ui32 avx2_find_max_val32(ui32* address)
     {
       __m128i x0 = _mm_loadu_si128((__m128i*)address);
       __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
@@ -56,14 +57,26 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 avx2_find_max_val64(ui64* address)
+    {
+      __m128i x0 = _mm_loadu_si128((__m128i*)address);
+      __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
+      x0 = _mm_or_si128(x0, x1);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      ui64 t = (ui64)_mm_extract_epi64(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);

       // convert to sign and magnitude and keep max_val
       ui32 shift = 31 - K_max;
-      __m256i m0 = _mm256_set1_epi32((int)0x80000000);
+      __m256i m0 = _mm256_set1_epi32(INT_MIN);
       __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
       __m256i *p = (__m256i*)sp;
       for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8)
@@ -78,16 +91,16 @@ namespace ojph {
       }
       _mm256_storeu_si256((__m256i*)max_val, tmax);
     }
-
+
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(K_max);

       //quantize and convert to sign and magnitude and keep max_val
       __m256 d = _mm256_set1_ps(delta_inv);
-      __m256i m0 = _mm256_set1_epi32((int)0x80000000);
+      __m256i m0 = _mm256_set1_epi32(INT_MIN);
       __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);

       float *p = (float*)sp;
@@ -106,29 +119,29 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
-      __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
+      __m256i m1 = _mm256_set1_epi32(INT_MAX);
       si32 *p = (si32*)dp;
       for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
       {
-        __m256i v = _mm256_load_si256((__m256i*)sp);
-        __m256i val = _mm256_and_si256(v, m1);
-        val = _mm256_srli_epi32(val, (int)shift);
-        val = _mm256_sign_epi32(val, v);
-        _mm256_storeu_si256((__m256i*)p, val);
+        __m256i v = _mm256_load_si256((__m256i*)sp);
+        __m256i val = _mm256_and_si256(v, m1);
+        val = _mm256_srli_epi32(val, (int)shift);
+        val = _mm256_sign_epi32(val, v);
+        _mm256_storeu_si256((__m256i*)p, val);
       }
     }

     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(K_max);
-      __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
+      __m256i m1 = _mm256_set1_epi32(INT_MAX);
       __m256 d = _mm256_set1_ps(delta);
       float *p = (float*)dp;
       for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
@@ -142,5 +155,58 @@ namespace ojph {
         _mm256_storeu_ps(p, valf);
       }
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      __m256i m0 = _mm256_set1_epi64x(LLONG_MIN);
+      __m256i zero = _mm256_setzero_si256();
+      __m256i one = _mm256_set1_epi64x(1);
+      __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
+      __m256i *p = (__m256i*)sp;
+      for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
+      {
+        __m256i v = _mm256_loadu_si256(p);
+        __m256i sign = _mm256_cmpgt_epi64(zero, v);
+        __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
+        __m256i ones = _mm256_and_si256(sign, one);
+        val = _mm256_add_epi64(val, ones);       // 2's complement
+        sign = _mm256_and_si256(sign, m0);
+        val = _mm256_slli_epi64(val, (int)shift);
+        tmax = _mm256_or_si256(tmax, val);
+        val = _mm256_or_si256(val, sign);
+        _mm256_storeu_si256((__m256i*)dp, val);
+      }
+      _mm256_storeu_si256((__m256i*)max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+
+      ui32 shift = 63 - K_max;
+      __m256i m1 = _mm256_set1_epi64x(LLONG_MAX);
+      __m256i zero = _mm256_setzero_si256();
+      __m256i one = _mm256_set1_epi64x(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
+      {
+        __m256i v = _mm256_load_si256((__m256i*)sp);
+        __m256i val = _mm256_and_si256(v, m1);
+        val = _mm256_srli_epi64(val, (int)shift);
+        __m256i sign = _mm256_cmpgt_epi64(zero, v);
+        val = _mm256_xor_si256(val, sign); // negate 1's complement
+        __m256i ones = _mm256_and_si256(sign, one);
+        val = _mm256_add_epi64(val, ones); // 2's complement
+        _mm256_storeu_si256((__m256i*)p, val);
+      }
+    }
   }
-}
\ No newline at end of file
+}
diff --git a/src/core/codestream/ojph_codestream_gen.cpp b/src/core/codestream/ojph_codestream_gen.cpp
index 466f483..cdc72c6 100644
--- a/src/core/codestream/ojph_codestream_gen.cpp
+++ b/src/core/codestream/ojph_codestream_gen.cpp
@@ -44,18 +44,21 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void gen_mem_clear(void* addr, size_t count)
     {
-      ui32* p = (ui32*)addr;
-      for (size_t i = 0; i < count; i += 4, p += 1)
-        *p = 0;
+      si64* p = (si64*)addr;
+      for (size_t i = 0; i < count; i += 8)
+        *p++ = 0;
     }

     //////////////////////////////////////////////////////////////////////////
-    ui32 gen_find_max_val(ui32* addr) { return addr[0]; }
+    ui32 gen_find_max_val32(ui32* addr) { return addr[0]; }

     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count,
-                          ui32* max_val)
+    ui64 gen_find_max_val64(ui64* addr) { return addr[0]; }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count,
+                            ui32* max_val)
     {
       ojph_unused(delta_inv);
       ui32 shift = 31 - K_max;
@@ -65,7 +68,7 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         si32 v = *p++;
-        ui32 sign = v >= 0 ? 0 : 0x80000000;
+        ui32 sign = v >= 0 ? 0U : 0x80000000U;
         ui32 val = (ui32)(v >= 0 ? v : -v);
         val <<= shift;
         *dp++ = sign | val;
@@ -75,9 +78,31 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count,
-                          ui32* max_val)
+    void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                            float delta_inv, ui32 count,
+                            ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+      ui32 shift = 63 - K_max;
+      // convert to sign and magnitude and keep max_val
+      ui64 tmax = *max_val;
+      si64 *p = (si64*)sp;
+      for (ui32 i = count; i > 0; --i)
+      {
+        si64 v = *p++;
+        ui64 sign = v >= 0 ? 0ULL : 0x8000000000000000ULL;
+        ui64 val = (ui64)(v >= 0 ? v : -v);
+        val <<= shift;
+        *dp++ = sign | val;
+        tmax |= val; // it is more efficient to use or than max
+      }
+      *max_val = tmax;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count,
+                            ui32* max_val)
     {
       ojph_unused(K_max);
       //quantize and convert to sign and magnitude and keep max_val
@@ -87,7 +112,7 @@ namespace ojph {
       {
         float v = *p++;
         si32 t = ojph_trunc(v * delta_inv);
-        ui32 sign = t >= 0 ? 0 : 0x80000000;
+        ui32 sign = t >= 0 ? 0U : 0x80000000U;
         ui32 val = (ui32)(t >= 0 ? t : -t);
         *dp++ = sign | val;
         tmax |= val; // it is more efficient to use or than max
@@ -96,8 +121,8 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count)
+    void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
@@ -106,14 +131,30 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         ui32 v = *sp++;
-        si32 val = (v & 0x7FFFFFFF) >> shift;
-        *p++ = (v & 0x80000000) ? -val : val;
+        si32 val = (v & 0x7FFFFFFFU) >> shift;
+        *p++ = (v & 0x80000000U) ? -val : val;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      //convert to sign and magnitude
+      si64 *p = (si64*)dp;
+      for (ui32 i = count; i > 0; --i)
+      {
+        ui64 v = *sp++;
+        si64 val = (v & 0x7FFFFFFFFFFFFFFFULL) >> shift;
+        *p++ = (v & 0x8000000000000000ULL) ? -val : val;
       }
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count)
+    void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
     {
       ojph_unused(K_max);
       //convert to sign and magnitude
@@ -121,8 +162,8 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         ui32 v = *sp++;
-        float val = (float)(v & 0x7FFFFFFF) * delta;
-        *p++ = (v & 0x80000000) ? -val : val;
+        float val = (float)(v & 0x7FFFFFFFU) * delta;
+        *p++ = (v & 0x80000000U) ? -val : val;
       }
     }
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index e6930d5..3d03658 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -46,7 +46,7 @@ namespace ojph {

   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_fixed_allocator;
   class mem_elastic_allocator;
   class codestream;
diff --git a/src/core/codestream/ojph_codestream_sse.cpp b/src/core/codestream/ojph_codestream_sse.cpp
index 7c64ad9..6a31cbd 100644
--- a/src/core/codestream/ojph_codestream_sse.cpp
+++ b/src/core/codestream/ojph_codestream_sse.cpp
@@ -49,6 +49,5 @@ namespace ojph {
       for (size_t i = 0; i < count; i += 16, p += 4)
         _mm_storeu_ps(p, zero);
     }
-
   }
 }
\ No newline at end of file
diff --git a/src/core/codestream/ojph_codestream_sse2.cpp b/src/core/codestream/ojph_codestream_sse2.cpp
index 9bb0643..3352bcd 100644
--- a/src/core/codestream/ojph_codestream_sse2.cpp
+++ b/src/core/codestream/ojph_codestream_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/

+#include <climits>
 #include <emmintrin.h>

 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
   namespace local {

     //////////////////////////////////////////////////////////////////////////
-    ui32 sse2_find_max_val(ui32* address)
+    ui32 sse2_find_max_val32(ui32* address)
     {
       __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
       x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
@@ -59,14 +60,29 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 sse2_find_max_val64(ui64* address)
+    {
+      __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      _mm_storeu_si128((__m128i*)address, x0);
+      return *address;
+      // A single movd t, xmm0 can do the trick, but it is not available
+      // in SSE2 intrinsics. extract_epi32 is available in sse4.1
+      // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
+      // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
+      // return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);

       // convert to sign and magnitude and keep max_val
       ui32 shift = 31 - K_max;
-      __m128i m0 = _mm_set1_epi32((int)0x80000000);
+      __m128i m0 = _mm_set1_epi32(INT_MIN);
       __m128i zero = _mm_setzero_si128();
       __m128i one = _mm_set1_epi32(1);
       __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
@@ -88,8 +104,8 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(K_max);
@@ -118,34 +134,34 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
       __m128i zero = _mm_setzero_si128();
       __m128i one = _mm_set1_epi32(1);
       si32 *p = (si32*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
       {
-        __m128i v = _mm_load_si128((__m128i*)sp);
-        __m128i val = _mm_and_si128(v, m1);
-        val = _mm_srli_epi32(val, (int)shift);
-        __m128i sign = _mm_cmplt_epi32(v, zero);
-        val = _mm_xor_si128(val, sign); // negate 1's complement
-        __m128i ones = _mm_and_si128(sign, one);
-        val = _mm_add_epi32(val, ones); // 2's complement
-        _mm_storeu_si128((__m128i*)p, val);
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi32(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi32(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
       }
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(K_max);
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
       __m128 d = _mm_set1_ps(delta);
       float *p = (float*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
@@ -159,5 +175,59 @@ namespace ojph {
         _mm_storeu_ps(p, valf);
       }
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
+      __m128i *p = (__m128i*)sp;
+      for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
+      {
+        __m128i v = _mm_loadu_si128(p);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones);       // 2's complement
+        sign = _mm_and_si128(sign, m0);
+        val = _mm_slli_epi64(val, (int)shift);
+        tmax = _mm_or_si128(tmax, val);
+        val = _mm_or_si128(val, sign);
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
+      _mm_storeu_si128((__m128i*)max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
+      {
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi64(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
+      }
+    }
   }
 }
\ No newline at end of file
diff --git a/src/core/codestream/ojph_codestream_wasm.cpp b/src/core/codestream/ojph_codestream_wasm.cpp
index 19e47aa..e2cd444 100644
--- a/src/core/codestream/ojph_codestream_wasm.cpp
+++ b/src/core/codestream/ojph_codestream_wasm.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/

+#include <climits>
 #include <cstdlib>
 #include <wasm_simd128.h>
@@ -43,20 +44,17 @@ namespace ojph {
   namespace local {

-    //////////////////////////////////////////////////////////////////////////
-    #define REPEAT(a) a,a,a,a
-
     //////////////////////////////////////////////////////////////////////////
     void wasm_mem_clear(void* addr, size_t count)
     {
       float* p = (float*)addr;
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
+      v128_t zero = wasm_i32x4_splat(0);
       for (size_t i = 0; i < count; i += 16, p += 4)
         wasm_v128_store(p, zero);
     }

     //////////////////////////////////////////////////////////////////////////
-    ui32 wasm_find_max_val(ui32* address)
+    ui32 wasm_find_max_val32(ui32* address)
     {
       v128_t x1, x0 = wasm_v128_load(address);
       x1 = wasm_i32x4_shuffle(x0, x0, 2, 3, 2, 3); // x1 = x0[2,3,2,3]
@@ -68,19 +66,29 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 wasm_find_max_val64(ui64* address)
+    {
+      v128_t x1, x0 = wasm_v128_load(address);
+      x1 = wasm_i64x2_shuffle(x0, x0, 1, 1); // x1 = x0[1,1]
+      x0 = wasm_v128_or(x0, x1);
+      ui64 t = (ui64)wasm_i64x2_extract_lane(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);

       // convert to sign and magnitude and keep max_val
       ui32 shift = 31 - K_max;
-      v128_t m0 = wasm_i32x4_const(REPEAT((int)0x80000000));
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
-      v128_t one = wasm_i32x4_const(REPEAT(1));
+      v128_t m0 = wasm_i32x4_splat(INT_MIN);
+      v128_t zero = wasm_i32x4_splat(0);
+      v128_t one = wasm_i32x4_splat(1);
       v128_t tmax = wasm_v128_load(max_val);
-      v128_t *p = (v128_t*)sp;
-      for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
+      si32 *p = (si32*)sp;
+      for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
(ui32 i = 0; i < count; i += 4, p += 4, dp += 4) { v128_t v = wasm_v128_load(p); v128_t sign = wasm_i32x4_lt(v, zero); @@ -97,16 +105,16 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val v128_t d = wasm_f32x4_splat(delta_inv); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); float *p = (float*)sp; for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) @@ -127,14 +135,14 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) { @@ -150,11 +158,11 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); v128_t d = wasm_f32x4_splat(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) @@ -167,6 +175,58 @@ namespace ojph { valf = wasm_v128_or(valf, sign); wasm_v128_store(p, valf); } - } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + v128_t m0 = wasm_i64x2_splat(LLONG_MIN); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + v128_t tmax = wasm_v128_load(max_val); + si64 *p = (si64*)sp; + for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2) + { + v128_t v = wasm_v128_load(p); + v128_t sign = wasm_i64x2_lt(v, zero); + v128_t val = wasm_v128_xor(v, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + sign = wasm_v128_and(sign, m0); + val = wasm_i64x2_shl(val, shift); + tmax = wasm_v128_or(tmax, val); + val = wasm_v128_or(val, sign); + wasm_v128_store(dp, val); + } + wasm_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + v128_t m1 = wasm_i64x2_splat(LLONG_MAX); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 2, sp 
+= 2, p += 2) + { + v128_t v = wasm_v128_load((v128_t*)sp); + v128_t val = wasm_v128_and(v, m1); + val = wasm_i64x2_shr(val, shift); + v128_t sign = wasm_i64x2_lt(v, zero); + val = wasm_v128_xor(val, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + wasm_v128_store(p, val); + } + } } } \ No newline at end of file diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 2bd3987..8a234e5 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -776,6 +776,25 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + ui32 + param_cod::propose_implementation_precision(const param_siz* siz) const + { + bool employing_color_transform = is_employing_color_transform() ? 1 : 0; + bool reversible = atk->is_reversible(); + + ui32 bit_depth = 32; + if (reversible) { + bit_depth = siz->get_bit_depth(comp_num); + bit_depth += comp_num < 3 ? employing_color_transform : 0; + // 3 or 4 is how many extra bits are needed for the HH band at the + // bottom most level of decomposition. + bit_depth += get_num_decompositions() > 5 ? 4 : 3; + } + + return bit_depth; + } + ////////////////////////////////////////////////////////////////////////// bool param_cod::write(outfile_base *file) { @@ -929,23 +948,46 @@ namespace ojph { void param_qcd::set_rev_quant(ui32 num_decomps, ui32 bit_depth, bool is_employing_color_transform) { - int guard_bits = 1; - Sqcd = (ui8)(guard_bits << 5); //one guard bit, and no quantization ui32 B = bit_depth; B += is_employing_color_transform ? 1 : 0; //1 bit for RCT int s = 0; double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)(B + X); + ui32 max_B_plus_X = (ui32)(B + X); for (ui32 d = num_decomps; d > 0; --d) { double bibo_l = bibo_gains::get_bibo_gain_l(d, true); double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + } + + if (max_B_plus_X > 38) + OJPH_ERROR(0x00050151, "The specified combination of bit_depth, " + "colour transform, and type of wavelet transform requires more than " + "38 bits; it requires %d bits. 
This is beyond what is allowed in " + "the JPEG2000 image coding format.", max_B_plus_X); + + int guard_bits = ojph_max(1, (si32)max_B_plus_X - 31); + Sqcd = (ui8)(guard_bits << 5); + s = 0; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + for (ui32 d = num_decomps; d > 0; --d) + { + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; } } @@ -1001,8 +1043,11 @@ namespace ojph { ui32 B = 0; int irrev = Sqcd & 0x1F; if (irrev == 0) //reversible - for (ui32 i = 0; i < num_subbands; ++i) - B = ojph_max(B, (u8_SPqcd[i] >> 3) + get_num_guard_bits() - 1u); + for (ui32 i = 0; i < num_subbands; ++i) { + ui32 t = decode_SPqcd(u8_SPqcd[i]); + t += get_num_guard_bits() - 1u; + B = ojph_max(B, t); + } else if (irrev == 2) //scalar expounded for (ui32 i = 0; i < num_subbands; ++i) { @@ -1072,9 +1117,9 @@ namespace ojph { } int irrev = Sqcd & 0x1F; - if (irrev == 0) //reversible; this is (10.22) from the J2K book + if (irrev == 0) // reversible; this is (10.22) from the J2K book { - num_bits += u8_SPqcd[idx] >> 3; + num_bits += decode_SPqcd(u8_SPqcd[idx]); num_bits = num_bits == 0 ? 0 : num_bits - 1; } else if (irrev == 1) diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index ac8bb77..cce5cd8 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -176,10 +176,16 @@ namespace ojph { public: param_siz() { - memset(this, 0, sizeof(param_siz)); + Lsiz = Csiz = 0; + Xsiz = Ysiz = XOsiz = YOsiz = XTsiz = YTsiz = XTOsiz = YTOsiz = 0; + skipped_resolutions = 0; + memset(store, 0, sizeof(store)); + ws_kern_support_needed = dfs_support_needed = false; + cod = NULL; + dfs = NULL; + Rsiz = RSIZ_HT_FLAG; cptr = store; old_Csiz = 4; - Rsiz = RSIZ_HT_FLAG; } ~param_siz() @@ -263,6 +269,7 @@ namespace ojph { ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds); return t; } + ui32 get_height(ui32 comp_num) const { assert(comp_num < get_num_components()); @@ -516,6 +523,9 @@ namespace ojph { return (Scod & 4) == 4; } + //////////////////////////////////////// + ui32 propose_implementation_precision(const param_siz* siz) const; + //////////////////////////////////////// bool write(outfile_base *file); @@ -639,7 +649,11 @@ namespace ojph { bool is_employing_color_transform); void set_irrev_quant(ui32 num_decomps); - protected: + ui8 decode_SPqcd(ui8 v) const + { return (ui8)(v >> 3); } + ui8 encode_SPqcd(ui8 v) const + { return (ui8)(v << 3); } + protected: ui16 Lqcd; ui8 Sqcd; union @@ -863,9 +877,10 @@ namespace ojph { }; public: // member functions - param_dfs() { memset(this, 0, sizeof(param_dfs)); } + param_dfs() { init(); } ~param_dfs() { if (next) delete next; } - void init() { memset(this, 0, sizeof(param_dfs)); } + void init() + { Ldfs = Sdfs = Ids = 0; memset(Ddfs, 0, sizeof(Ddfs)); next = NULL; } bool read(infile_base *file); bool exists() const { return Ldfs != 0; } @@ -940,8 +955,17 @@ namespace ojph { bool read_coefficient(infile_base *file, float &K); bool read_coefficient(infile_base *file, si16 &K); void init(bool clear_all = true) { - if (clear_all) - memset(this, 0, sizeof(param_atk)); + if (clear_all) + { + Latk = Satk = 0; + Katk = 0.0f; + Natk = 0; + d = NULL; + max_steps = 0; + memset(d_store, 0, sizeof(d_store)); + next = NULL; + alloced_next = false; + } d = d_store; max_steps = sizeof(d_store) / sizeof(lifting_step); } void 
init_irv97(); diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp index 813e33b..803790d 100644 --- a/src/core/codestream/ojph_precinct.cpp +++ b/src/core/codestream/ojph_precinct.cpp @@ -221,7 +221,9 @@ namespace ojph { { int num_zeros = *mmsb_tag.get(x>>levm1, y>>levm1, levm1); num_zeros -= *mmsb_tag.get(x>>cur_lev, y>>cur_lev, cur_lev); - bb_put_bits(&bb, 1, num_zeros + 1, + bb_put_zeros(&bb, num_zeros, + elastic, cur_coded_list, ph_bytes); + bb_put_bits(&bb, 1, 1, elastic, cur_coded_list, ph_bytes); *mmsb_tag_flags.get(x>>levm1, y>>levm1, levm1) = 1; } diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp index 87466e0..0246400 100644 --- a/src/core/codestream/ojph_resolution.cpp +++ b/src/core/codestream/ojph_resolution.cpp @@ -199,6 +199,9 @@ namespace ojph { allocator->pre_alloc_obj((size_t)num_precincts.area()); } + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + //allocate lines if (skipped_res_for_recon == false) { @@ -207,10 +210,19 @@ namespace ojph { allocator->pre_alloc_obj(num_steps + 2); ui32 width = res_rect.siz.w + 1; - for (ui32 i = 0; i < num_steps; ++i) + if (precision <= 32) { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + } } } @@ -436,6 +448,9 @@ namespace ojph { level_index[i] = level_index[i - 1] + val; cur_precinct_loc = point(0, 0); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + //allocate lines if (skipped_res_for_recon == false) { @@ -460,11 +475,22 @@ namespace ojph { // initiate storage of line_buf ui32 width = res_rect.siz.w + 1; - for (ui32 i = 0; i < num_steps; ++i) - ssp[i].line->wrap( - allocator->post_alloc_data(width, 1), width, 1); - sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); - aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + if (precision <= 32) + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + } cur_line = 0; rows_to_produce = res_rect.siz.h; @@ -682,8 +708,9 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); aug->active = true; vert_even = !vert_even; ++cur_line; @@ -694,8 +721,9 @@ namespace ojph { rev_horz_syn(atk, sig->line, bands[2].pull_line(), bands[3].pull_line(), width, horz_even); else - memcpy(sig->line->i32, bands[2].pull_line()->i32, - width * sizeof(si32)); + memcpy(sig->line->p, bands[2].pull_line()->p, + (size_t)width + * 
(sig->line->flags & line_buf::LFT_SIZE_MASK)); sig->active = true; vert_even = !vert_even; ++cur_line; @@ -733,8 +761,9 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); } else { @@ -742,11 +771,22 @@ namespace ojph { rev_horz_syn(atk, aug->line, bands[2].pull_line(), bands[3].pull_line(), width, horz_even); else - memcpy(aug->line->i32, bands[2].pull_line()->i32, - width * sizeof(si32)); - si32* sp = aug->line->i32; - for (ui32 i = width; i > 0; --i) - *sp++ >>= 1; + memcpy(aug->line->p, bands[2].pull_line()->p, + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); + if (aug->line->flags & line_buf::LFT_32BIT) + { + si32* sp = aug->line->i32; + for (ui32 i = width; i > 0; --i) + *sp++ >>= 1; + } + else + { + assert(aug->line->flags & line_buf::LFT_64BIT); + si64* sp = aug->line->i64; + for (ui32 i = width; i > 0; --i) + *sp++ >>= 1; + } } return aug->line; } @@ -854,8 +894,8 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + (size_t)width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); return aug->line; } else diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h index 635a4ce..6156455 100644 --- a/src/core/codestream/ojph_resolution.h +++ b/src/core/codestream/ojph_resolution.h @@ -45,7 +45,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp index cf007fc..8efc8de 100644 --- a/src/core/codestream/ojph_subband.cpp +++ b/src/core/codestream/ojph_subband.cpp @@ -91,13 +91,18 @@ namespace ojph { allocator->pre_alloc_obj((size_t)num_blocks.area()); for (ui32 i = 0; i < num_blocks.w; ++i) - codeblock::pre_alloc(codestream, nominal); + codeblock::pre_alloc(codestream, comp_num, nominal); //allocate lines allocator->pre_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - allocator->pre_alloc_data(width, 1); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + if (precision <= 32) + allocator->pre_alloc_data(width, 1); + else + allocator->pre_alloc_data(width, 1); } ////////////////////////////////////////////////////////////////////////// @@ -192,7 +197,12 @@ namespace ojph { lines = allocator->post_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - lines->wrap(allocator->post_alloc_data(width,1),width,1); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + if (precision <= 32) + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); + else + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); } ////////////////////////////////////////////////////////////////////////// @@ -256,10 +266,11 @@ namespace ojph { if (empty) return; - assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size); - si32* t = lines[0].i32; - lines[0].i32 = l->i32; - l->i32 = t; + assert(l->pre_size == 
lines[0].pre_size && l->size == lines[0].size && + l->flags == lines[0].flags); + void* p = lines[0].p; + lines[0].p = l->p; + l->p = p; } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h index 8cadae0..e1c291a 100644 --- a/src/core/codestream/ojph_subband.h +++ b/src/core/codestream/ojph_subband.h @@ -45,7 +45,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; @@ -94,6 +94,8 @@ namespace ojph { bool exists() { return !empty; } line_buf* pull_line(); + resolution* get_parent() { return parent; } + const resolution* get_parent() const { return parent; } private: bool empty; // true if the subband has no pixels or diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 281e156..4755bb4 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -231,8 +231,7 @@ namespace ojph { num_lines = 3; lines = allocator->post_alloc_obj(num_lines); for (int i = 0; i < 3; ++i) - lines[i].wrap( - allocator->post_alloc_data(width,0),width,0); + lines[i].wrap(allocator->post_alloc_data(width, 0), width, 0); } else { @@ -259,17 +258,15 @@ namespace ojph { line_buf *tc = comps[comp_num].get_line(); if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = line->i32 + line_offsets[comp_num]; - si32* dp = tc->i32; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(line, line_offsets[comp_num], + tc, 0, shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 0 : -shift; + rev_convert(line, line_offsets[comp_num], tc, 0, + shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); } else { @@ -285,26 +282,25 @@ namespace ojph { } else { - int shift = 1 << (num_bits[comp_num] - 1); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); ui32 comp_width = comp_rects[comp_num].siz.w; if (reversible) { - const si32 *sp = line->i32 + line_offsets[comp_num]; - si32 *dp = lines[comp_num].i32; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(line, line_offsets[comp_num], + lines + comp_num, 0, shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : -shift; + rev_convert(line, line_offsets[comp_num], lines + comp_num, 0, + shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); + if (comp_num == 2) { // reversible color transform - rct_forward(lines[0].i32, lines[1].i32, lines[2].i32, - comps[0].get_line()->i32, - comps[1].get_line()->i32, - comps[2].get_line()->i32, comp_width); + rct_forward(lines + 0, lines + 1, lines + 2, + comps[0].get_line(), + comps[1].get_line(), + comps[2].get_line(), comp_width); comps[0].push_line(); comps[1].push_line(); comps[2].push_line(); @@ -350,17 +346,15 @@ namespace ojph { ui32 comp_width = recon_comp_rects[comp_num].siz.w; if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = src_line->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } else { @@ -380,9 +374,9 @@ namespace ojph { if (comp_num == 0) { if (reversible) - rct_backward(comps[0].pull_line()->i32, comps[1].pull_line()->i32, - comps[2].pull_line()->i32, lines[0].i32, lines[1].i32, - lines[2].i32, comp_width); + rct_backward(comps[0].pull_line(), comps[1].pull_line(), + comps[2].pull_line(), lines + 0, lines + 1, + lines + 2, comp_width); else ict_backward(comps[0].pull_line()->f32, comps[1].pull_line()->f32, comps[2].pull_line()->f32, lines[0].f32, lines[1].f32, @@ -390,21 +384,20 @@ namespace ojph { } if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp; + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + line_buf* src_line; if (comp_num < 3) - sp = lines[comp_num].i32; + src_line = lines + comp_num; else - sp = comps[comp_num].pull_line()->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + src_line = comps[comp_num].pull_line(); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } else { diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h index 4b54242..6b65a13 100644 --- a/src/core/codestream/ojph_tile.h +++ b/src/core/codestream/ojph_tile.h @@ -47,7 +47,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/codestream/ojph_tile_comp.h b/src/core/codestream/ojph_tile_comp.h index def39e5..62b8fba 100644 --- a/src/core/codestream/ojph_tile_comp.h +++ b/src/core/codestream/ojph_tile_comp.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/coding/ojph_block_common.cpp b/src/core/coding/ojph_block_common.cpp index e6b4de6..2ba138a 100644 --- a/src/core/coding/ojph_block_common.cpp +++ b/src/core/coding/ojph_block_common.cpp @@ -84,11 +84,20 @@ namespace ojph { * + 4 * mel event for initial row of quads when needed \n * \n * Each entry contains, starting from the LSB \n - * \li \c total prefix length for quads 0 and 1 (3 bits) \n - * \li \c total suffix length for quads 0 and 1 (4 bits) \n + * \li \c total total prefix length for quads 0 and 1 (3 bits) \n + * \li \c total total suffix length for quads 0 and 1 (4 bits) \n * \li \c suffix length for quad 0 (3 bits) \n * \li \c prefix for quad 0 (3 bits) \n * \li \c prefix for quad 1 (3 bits) \n + * \n + * Another table is uvlc_bias, which is needed to correctly decode the + * extension u_ext for initial row of quads. Under certain condition, + * we deduct 1 or 2 from u_q0 and u_q1 before encoding them; so for us + * to know that decoding u_ext is needed, we recreate the u_q0 and u_q1 + * that we actually encoded. \n + * For simplicity, we use the same index as before \n + * \li \c u_q0 bias is 2 bits \n + * \li \c u_q1 bias is 2 bits \n */ /// @brief uvlc_tbl0 contains decoding information for initial row of quads @@ -96,6 +105,8 @@ namespace ojph { /// @brief uvlc_tbl1 contains decoding information for non-initial row of /// quads ui16 uvlc_tbl1[256] = { 0 }; + /// @brief uvlc_bias contains decoding info. 
for initial row of quads + ui8 uvlc_bias[256+64] = { 0 }; /// @} //************************************************************************/ @@ -199,8 +210,10 @@ namespace ojph { ui32 mode = i >> 6; ui32 vlc = i & 0x3F; - if (mode == 0) // both u_off are 0 + if (mode == 0) { // both u_off are 0 uvlc_tbl0[i] = 0; + uvlc_bias[i] = 0; + } else if (mode <= 2) // u_off are either 01 or 10 { ui32 d = dec[vlc & 0x7]; //look at the least significant 3 bits @@ -232,6 +245,7 @@ namespace ojph { total_suffix = u0_suffix_len; u0 = d0 >> 5; u1 = (vlc & 1) + 1; + uvlc_bias[i] = 4; // 0b00 for u0 and 0b01 for u1 } else { @@ -240,6 +254,7 @@ namespace ojph { total_suffix = u0_suffix_len + ((d1 >> 2) & 0x7); u0 = d0 >> 5; u1 = d1 >> 5; + uvlc_bias[i] = 0; } uvlc_tbl0[i] = (ui16)(total_prefix | @@ -265,6 +280,7 @@ namespace ojph { (u0_suffix_len << 7) | (u0 << 10) | (u1 << 13)); + uvlc_bias[i] = 10; // 0b10 for u0 and 0b10 for u1 } } diff --git a/src/core/coding/ojph_block_common.h b/src/core/coding/ojph_block_common.h index 29a84ba..f8d6503 100644 --- a/src/core/coding/ojph_block_common.h +++ b/src/core/coding/ojph_block_common.h @@ -44,6 +44,6 @@ namespace ojph{ extern ui16 vlc_tbl1[1024]; extern ui16 uvlc_tbl0[256+64]; extern ui16 uvlc_tbl1[256]; - + extern ui8 uvlc_bias[256+64]; } // !namespace local } // !namespace ojph diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h index ab01961..a197017 100644 --- a/src/core/coding/ojph_block_decoder.h +++ b/src/core/coding/ojph_block_decoder.h @@ -50,7 +50,12 @@ namespace ojph { // generic decoder bool - ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, + ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + bool + ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder32.cpp similarity index 99% rename from src/core/coding/ojph_block_decoder.cpp rename to src/core/coding/ojph_block_decoder32.cpp index 259371b..f54c77e 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder32.cpp @@ -739,11 +739,11 @@ namespace ojph { * @param [in] stride is the decoded codeblock buffer stride * @param [in] stripe_causal is true for stripe causal mode */ - bool ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, - ui32 missing_msbs, ui32 num_passes, - ui32 lengths1, ui32 lengths2, - ui32 width, ui32 height, ui32 stride, - bool stripe_causal) + bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) { static bool insufficient_precision = false; static bool modify_code = false; @@ -1217,7 +1217,7 @@ namespace ojph { ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? ui32 emax = vp[0] | vp[1]; - emax = 31 - count_leading_zeros(emax | 2); // emax - 1 + emax = 31 - count_leading_zeros(emax | 2); // emax - 1 ui32 kappa = gamma ? 
emax : 1; ui32 U_q = u_q + kappa; @@ -1613,4 +1613,4 @@ namespace ojph { return true; } } -} +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_decoder64.cpp b/src/core/coding/ojph_block_decoder64.cpp new file mode 100644 index 0000000..8801735 --- /dev/null +++ b/src/core/coding/ojph_block_decoder64.cpp @@ -0,0 +1,1663 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_block_decoder.cpp +// Author: Aous Naman +// Date: 13 May 2022 +//***************************************************************************/ + +//***************************************************************************/ +/** @file ojph_block_decoder.cpp + * @brief implements a HTJ2K block decoder + */ + +#include +#include + +#include +#include +#include "ojph_block_common.h" +#include "ojph_block_decoder.h" +#include "ojph_arch.h" +#include "ojph_message.h" + +namespace ojph { + namespace local { + + //************************************************************************/ + /** @brief MEL state structure for reading and decoding the MEL bitstream + * + * A number of events is decoded from the MEL bitstream ahead of time + * and stored in run/num_runs. + * Each run represents the number of zero events before a one event. 
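+ * As an illustrative summary of the control flow below, runs are
+ * consumed one at a time by mel_get_run(), which refills run/num_runs
+ * by calling mel_decode() whenever num_runs reaches 0.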
+ */
+ struct dec_mel_st {
+ dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+ k(0), num_runs(0), runs(0)
+ {}
+ // data decoding machinery
+ ui8* data; //!< the address of data (or bitstream)
+ ui64 tmp; //!< temporary buffer for read data
+ int bits; //!< number of bits stored in tmp
+ int size; //!< number of bytes in MEL code
+ bool unstuff; //!< true if the next bit needs to be unstuffed
+ int k; //!< state of MEL decoder
+
+ // queue of decoded runs
+ int num_runs; //!< number of decoded runs left in runs (maximum 8)
+ ui64 runs; //!< stores the decoded runs, 7 bits per run
+ };
+
+ //************************************************************************/
+ /** @brief Reads and unstuffs the MEL bitstream
+ *
+ * @param [in] melp is a pointer to dec_mel_st structure
+ */
+ static inline
+ void mel_read(dec_mel_st *melp)
+ {
+ if (melp->bits > 32) //there are enough bits in the tmp variable
+ return; // return without reading new data
+
+ ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted
+ if (melp->size > 4) { // if there is data in the MEL segment
+ val = *(ui32*)melp->data; // read 32 bits from MEL data
+ melp->data += 4; // advance pointer
+ melp->size -= 4; // reduce counter
+ }
+ else if (melp->size > 0)
+ { // 4 or less
+ int i = 0;
+ while (melp->size > 1) {
+ ui32 v = *melp->data++; // read one byte at a time
+ ui32 m = ~(0xFFu << i); // mask of location
+ val = (val & m) | (v << i);// put one byte in its correct location
+ --melp->size;
+ i += 8;
+ }
+ // size equal to 1
+ ui32 v = *melp->data++; // the one before the last is different
+ v |= 0xF; // MEL and VLC segments can overlap
+ ui32 m = ~(0xFFu << i);
+ val = (val & m) | (v << i);
+ --melp->size;
+ }
+
+ // next we unstuff them before adding them to the buffer
+ int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+ // the previously read byte requires
+ // unstuffing
+
+ // data is unstuffed and accumulated in t
+ // bits has the number of bits in t
+ ui32 t = val & 0xFF;
+ bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+ bits -= unstuff; // there is one less bit in t if unstuffing is needed
+ t = t << (8 - unstuff); // move up to make room for the next byte
+
+ //this is a repeat of the above
+ t |= (val>>8) & 0xFF;
+ unstuff = (((val >> 8) & 0xFF) == 0xFF);
+ bits -= unstuff;
+ t = t << (8 - unstuff);
+
+ t |= (val>>16) & 0xFF;
+ unstuff = (((val >> 16) & 0xFF) == 0xFF);
+ bits -= unstuff;
+ t = t << (8 - unstuff);
+
+ t |= (val>>24) & 0xFF;
+ melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+ // move t to tmp, and push the result all the way up, so we read from
+ // the MSB
+ melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
+ melp->bits += bits; //increment the number of bits in tmp
+ }
+
+ //************************************************************************/
+ /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+ *
+ * Runs are stored in "runs" and the number of runs in "num_runs".
+ * Each run represents a number of zero events that may or may not
+ * terminate in a 1 event.
+ * Each run is stored in 7 bits. The LSB is 1 if the run terminates in
+ * a 1 event, 0 otherwise. The next 6 bits, for the case terminating
+ * with 1, contain the number of consecutive zero events * 2; for the
+ * case terminating with 0, they store (number of consecutive zero
+ * events - 1) * 2.
+ * A total of 6 bits (made up of 1 + 5) should have been enough.
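+ * For example, under the rule stated above, the stored value 5
+ * (0b0000101) represents two 0 events followed by a 1 event, while
+ * the stored value 4 (0b0000100) represents a stretch of three 0
+ * events with no terminating 1 event.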
+ * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + void mel_decode(dec_mel_st *melp) + { + static const int mel_exp[13] = { //MEL exponents + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5 + }; + + if (melp->bits < 6) // if there are less than 6 bits in tmp + mel_read(melp); // then read from the MEL bitstream + // 6 bits is the largest decodable MEL cwd + + //repeat so long that there is enough decodable bits in tmp, + // and the runs store is not full (num_runs < 8) + while (melp->bits >= 6 && melp->num_runs < 8) + { + int eval = mel_exp[melp->k]; // number of bits associated with state + int run = 0; + if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB) + { //one is found + run = 1 << eval; + run--; // consecutive runs of 0 events - 1 + melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12 + melp->tmp <<= 1; // consume one bit from tmp + melp->bits -= 1; + run = run << 1; // a stretch of zeros not terminating in one + } + else + { //0 is found + run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1); + melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0 + melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6) + melp->bits -= eval + 1; + run = (run << 1) + 1; // a stretch of zeros terminating with one + } + eval = melp->num_runs * 7; // 7 bits per run + melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient + melp->runs |= ((ui64)run) << eval; // store the value in runs + melp->num_runs++; // increment count + } + } + + //************************************************************************/ + /** @brief Initiates a dec_mel_st structure for MEL decoding and reads + * some bytes in order to get the read address to a multiple + * of 4 + * + * @param [in] melp is a pointer to dec_mel_st structure + * @param [in] bbuf is a pointer to byte buffer + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup) + { + melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL + melp->bits = 0; // 0 bits in tmp + melp->tmp = 0; // + melp->unstuff = false; // no unstuffing + melp->size = scup - 1; // size is the length of MEL+VLC-1 + melp->k = 0; // 0 for state + melp->num_runs = 0; // num_runs is 0 + melp->runs = 0; // + + //This code is borrowed; original is for a different architecture + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment + int num = 4 - (int)(intptr_t(melp->data) & 0x3); + for (int i = 0; i < num; ++i) { // this code is similar to mel_read + assert(melp->unstuff == false || melp->data[0] <= 0x8F); + ui64 d = (melp->size > 0) ? 
*melp->data : 0xFF;//if buffer is consumed
+ //set data to 0xFF
+ if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+ // see the standard
+ melp->data += melp->size-- > 0; //increment if the end is not reached
+ int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+ melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+ melp->bits += d_bits; //increment the number of bits in tmp
+ melp->unstuff = ((d & 0xFF) == 0xFF); //true if the next byte needs
+ //unstuffing
+ }
+ melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+ // is the MSB
+ }
+
+ //************************************************************************/
+ /** @brief Retrieves one run from dec_mel_st; if there are no runs stored
+ * MEL segment is decoded
+ *
+ * @param [in] melp is a pointer to dec_mel_st structure
+ */
+ static inline
+ int mel_get_run(dec_mel_st *melp)
+ {
+ if (melp->num_runs == 0) //if no runs, decode more runs from MEL segment
+ mel_decode(melp);
+
+ int t = melp->runs & 0x7F; //retrieve one run
+ melp->runs >>= 7; // remove the retrieved run
+ melp->num_runs--;
+ return t; // return run
+ }
+
+ //************************************************************************/
+ /** @brief A structure for reading and unstuffing a segment that grows
+ * backward, such as VLC and MRP
+ */
+ struct rev_struct {
+ rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+ {}
+ //storage
+ ui8* data; //!< pointer to where to read data
+ ui64 tmp; //!< temporary buffer of read data
+ int bits; //!< number of bits stored in tmp
+ int size; //!< number of bytes left
+ bool unstuff; //!< true if the last byte needs unstuffing
+ };
+
+ //************************************************************************/
+ /** @brief Reads and unstuffs 8 bits from a backward-growing (VLC)
+ * segment
+ *
+ * Unstuffing is needed when the previously read byte is larger than
+ * 0x8F and the 7 LSBs of the current byte are all ones.
+ *
+ * @param [in] vlcp is a pointer to rev_struct structure
+ */
+ static inline
+ void rev_read8(rev_struct *vlcp)
+ {
+ ui8 val = 0;
+ if (vlcp->size > 0) // if there are bytes left in the VLC segment
+ {
+ val = *vlcp->data; // then read 8 bits
+ --vlcp->data; // decrement data pointer (the segment grows backward)
+ --vlcp->size; // decrement number of bytes in the buffer
+ }
+
+ // accumulate in tmp, and increment bits, check if unstuffing is needed
+ ui8 t = (vlcp->unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0;
+ val = (ui8)(val & (0xFFU >> t)); // protect against erroneous 1 in MSB
+ vlcp->tmp |= (ui64)val << vlcp->bits;
+ vlcp->bits += 8 - t;
+ vlcp->unstuff = val > 0x8F;
+ }
+
+ //************************************************************************/
+ /** @brief Initiates the rev_struct structure and reads the first byte
+ *
+ * This subroutine initializes the VLC decoder. It discards the first
+ * 12 bits (they have the sum of the lengths of VLC and MEL segments),
+ * and depending on unstuffing, stores 3 or 4 bits in the unstuffed
+ * decoded buffer.
+ *
+ * @param [in] vlcp is a pointer to rev_struct structure
+ * @param [in] data is a pointer to byte at the start of the cleanup pass
+ * @param [in] lcup is the length of MagSgn+MEL+VLC segments
+ * @param [in] scup is the length of MEL+VLC segments
+ */
+ static inline
+ void rev_init8(rev_struct *vlcp, ui8* data, int lcup, int scup)
+ {
+ //first byte has only the upper 4 bits
+ vlcp->data = data + lcup - 2;
+
+ //size can not be larger than this, in fact it should be smaller
+ vlcp->size = scup - 2;
+
+ ui8 val = *vlcp->data--; // read one byte (this is a half byte)
+
+ // the first byte is treated differently from other bytes, because
+ // only the MSB nibble is part of the VLC code.
+ val = (ui8)(val >> 4);
+ ui8 t = ((val & 0x7) == 0x7) ? 
1 : 0; // unstuffing is needed + val = (ui8)(val & (0xFU >> t)); // protect against erroneous 1 in MSB + vlcp->tmp = val; + vlcp->bits = 4 - t; + vlcp->unstuff = val > 0x8; //this is useful for the next byte + } + + //************************************************************************/ + /** @brief Fills the temporary variable (vlcp->tmp) by up to 64 bits + * + * By the end of this call, vlcp->tmp must have no less than 56 bits + * + * @param [in] vlcp is a pointer to rev_struct structure + */ + static inline + ui64 rev_fetch64(rev_struct *vlcp) + { + while (vlcp->bits <= 56) + rev_read8(vlcp); // read 8 bits, but unstuffing might reduce this + return vlcp->tmp; // return unstuff decoded bits + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + static inline + ui64 rev_advance64(rev_struct *vlcp, ui32 num_bits) + { + assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits + vlcp->tmp >>= num_bits; // remove bits + vlcp->bits -= num_bits; // decrement the number of bits + return vlcp->tmp; + } + + //************************************************************************/ + /** @brief Reads and unstuffs from rev_struct + * + * This is different than rev_read in that this fills in zeros when the + * the available data is consumed. The other does not care about the + * values when all data is consumed. + * + * See rev_read for more information about unstuffing + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + void rev_read_mrp(rev_struct *mrp) + { + //process 4 bytes at a time + if (mrp->bits > 32) + return; + ui32 val = 0; + if (mrp->size > 3) // If there are 3 byte or more + { // (mrp->data - 3) move pointer back to read 32 bits at once + val = *(ui32*)(mrp->data - 3); // read 32 bits + mrp->data -= 4; // move back pointer + mrp->size -= 4; // reduce count + } + else if (mrp->size > 0) + { + int i = 24; + while (mrp->size > 0) { + ui32 v = *mrp->data--; // read one byte at a time + val |= (v << i); // put byte in its correct location + --mrp->size; + i -= 8; + } + } + + //accumulate in tmp, and keep count in bits + ui32 bits, tmp = val >> 24; + + //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F + bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0); + bool unstuff = (val >> 24) > 0x8F; + + //process the next byte + tmp |= ((val >> 16) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 16) & 0xFF) > 0x8F; + + tmp |= ((val >> 8) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 8) & 0xFF) > 0x8F; + + tmp |= (val & 0xFF) << bits; + bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = (val & 0xFF) > 0x8F; + + mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer + mrp->bits += bits; + mrp->unstuff = unstuff; // next byte + } + + //************************************************************************/ + /** @brief Initialized rev_struct structure for MRP segment, and reads + * a number of bytes such that the next 32 bits read are from + * an address that is a multiple of 4. 
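+ * (The MRP segment, like the VLC segment, grows backward, which is
+ * why initialization starts at the end of the buffer and the read
+ * pointer is decremented.)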
Note this is designed for + * an architecture that read size must be compatible with the + * alignment of the read address + * + * There is another similar subroutine rev_init. This subroutine does + * NOT skip the first 12 bits, and starts with unstuff set to true. + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] len2 is the length of SPP+MRP segments + */ + static inline + void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2) + { + mrp->data = data + lcup + len2 - 1; + mrp->size = len2; + mrp->unstuff = true; + mrp->bits = 0; + mrp->tmp = 0; + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream + int num = 1 + (int)(intptr_t(mrp->data) & 0x3); + for (int i = 0; i < num; ++i) { + ui64 d; + //read a byte, 0 if no more data + d = (mrp->size-- > 0) ? *mrp->data-- : 0; + //check if unstuffing is needed + ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0); + mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp + mrp->bits += d_bits; + mrp->unstuff = d > 0x8F; // for next byte + } + rev_read_mrp(mrp); + } + + //************************************************************************/ + /** @brief Retrieves 32 bits from the head of a rev_struct structure + * + * By the end of this call, mrp->tmp must have no less than 33 bits + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + ui32 rev_fetch_mrp(rev_struct *mrp) + { + if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp + { + rev_read_mrp(mrp); // read 30-32 bits from mrp + if (mrp->bits < 32) // if there is a space of 32 bits + rev_read_mrp(mrp); // read more + } + return (ui32)mrp->tmp; // return the head of mrp->tmp + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + static inline + ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits) + { + assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits + mrp->tmp >>= num_bits; // discard the lowest num_bits bits + mrp->bits -= num_bits; + return (ui32)mrp->tmp; // return data after consumption + } + + //************************************************************************/ + /** @brief State structure for reading and unstuffing of forward-growing + * bitstreams; these are: MagSgn and SPP bitstreams + */ + struct frwd_struct { + const ui8* data; //! + static inline + void frwd_read(frwd_struct *msp) + { + assert(msp->bits <= 32); // assert that there is a space for 32 bits + + ui32 val = 0; + if (msp->size > 3) { + val = *(ui32*)msp->data; // read 32 bits + msp->data += 4; // increment pointer + msp->size -= 4; // reduce size + } + else if (msp->size > 0) + { + int i = 0; + val = X != 0 ? 0xFFFFFFFFu : 0; + while (msp->size > 0) { + ui32 v = *msp->data++; // read one byte at a time + ui32 m = ~(0xFFu << i); // mask of location + val = (val & m) | (v << i);// put one byte in its correct location + --msp->size; + i += 8; + } + } + else + val = X != 0 ? 
0xFFFFFFFFu : 0; + + // we accumulate in t and keep a count of the number of bits in bits + ui32 bits = 8 - msp->unstuff; + ui32 t = val & 0xFF; + bool unstuff = ((val & 0xFF) == 0xFF); // Do we need unstuffing next? + + t |= ((val >> 8) & 0xFF) << bits; + bits += 8 - unstuff; + unstuff = (((val >> 8) & 0xFF) == 0xFF); + + t |= ((val >> 16) & 0xFF) << bits; + bits += 8 - unstuff; + unstuff = (((val >> 16) & 0xFF) == 0xFF); + + t |= ((val >> 24) & 0xFF) << bits; + bits += 8 - unstuff; + msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte + + msp->tmp |= ((ui64)t) << msp->bits; // move data to msp->tmp + msp->bits += bits; + } + + //************************************************************************/ + /** @brief Read and unstuffs 8 bits from forward-growing bitstream + * + * A template is used to accommodate a different requirement for + * MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is + * consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in. + * X controls this value. + * + * Unstuffing prevent sequences that are more than 0xFF7F from appearing + * in the conpressed sequence. So whenever a value of 0xFF is coded, the + * MSB of the next byte is set 0 and must be ignored during decoding. + * + * @tparam X is the value fed in when the bitstream is exhausted + * @param [in] msp is a pointer to frwd_struct structure + * + */ + template + static inline + void frwd_read8(frwd_struct *msp) + { + ui8 val = X; + if (msp->size > 0) { + val = *msp->data; // read 8 bits + ++msp->data; // increment pointer + --msp->size; // reduce size + } + + // unstuff and accumulate + ui8 t = msp->unstuff ? 1 : 0; + val = (ui8)(val & (0xFFU >> t)); + msp->unstuff = (val == 0xFF); + msp->tmp |= ((ui64)val) << msp->bits; // move data to msp->tmp + msp->bits += 8 - t; + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + * @param [in] data is a pointer to the start of data + * @param [in] size is the number of byte in the bitstream + */ + template + static inline + void frwd_init(frwd_struct *msp, const ui8* data, int size) + { + msp->data = data; + msp->tmp = 0; + msp->bits = 0; + msp->unstuff = 0; + msp->size = size; + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the bitstream + int num = 4 - (int)(intptr_t(msp->data) & 0x3); + for (int i = 0; i < num; ++i) + { + ui64 d; + //read a byte if the buffer is not exhausted, otherwise set it to X + d = msp->size-- > 0 ? *msp->data++ : X; + msp->tmp |= (d << msp->bits); // store data in msp->tmp + msp->bits += 8 - msp->unstuff; // number of bits added to msp->tmp + msp->unstuff = ((d & 0xFF) == 0xFF); // unstuffing for next byte + } + frwd_read(msp); // read 32 bits more + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. 
+ * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + * @param [in] data is a pointer to the start of data + * @param [in] size is the number of byte in the bitstream + */ + template + static inline + void frwd_init8(frwd_struct *msp, const ui8* data, int size) + { + msp->data = data; + msp->tmp = 0; + msp->bits = 0; + msp->unstuff = 0; + msp->size = size; + frwd_read8(msp); // read 8 bits + } + + //************************************************************************/ + /** @brief Consume num_bits bits from the bitstream of frwd_struct + * + * @param [in] msp is a pointer to frwd_struct + * @param [in] num_bits is the number of bit to consume + */ + static inline + void frwd_advance(frwd_struct *msp, ui32 num_bits) + { + assert(num_bits <= msp->bits); + msp->tmp >>= num_bits; // consume num_bits + msp->bits -= num_bits; + } + + //************************************************************************/ + /** @brief Fetches 32 bits from the frwd_struct bitstream + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + */ + template + static inline + ui32 frwd_fetch(frwd_struct *msp) + { + if (msp->bits < 32) + { + frwd_read(msp); + if (msp->bits < 32) //need to test + frwd_read(msp); + } + return (ui32)msp->tmp; + } + + //************************************************************************/ + /** @brief Fetches up to 64 bits from the frwd_struct bitstream + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + */ + template + static inline + ui64 frwd_fetch64(frwd_struct *msp) + { + while (msp->bits <= 56) + frwd_read8(msp); + return msp->tmp; + } + + //************************************************************************/ + /** @brief Decodes one codeblock, processing the cleanup, siginificance + * propagation, and magnitude refinement pass + * + * @param [in] coded_data is a pointer to bitstream + * @param [in] decoded_data is a pointer to decoded codeblock data buf. 
+ * @param [in] missing_msbs is the number of missing MSBs + * @param [in] num_passes is the number of passes: 1 if CUP only, + * 2 for CUP+SPP, and 3 for CUP+SPP+MRP + * @param [in] lengths1 is the length of cleanup pass + * @param [in] lengths2 is the length of refinement passes (either SPP + * only or SPP+MRP) + * @param [in] width is the decoded codeblock width + * @param [in] height is the decoded codeblock height + * @param [in] stride is the decoded codeblock buffer stride + * @param [in] stripe_causal is true for stripe causal mode + */ + bool ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) + { + // static bool insufficient_precision = false; + // static bool modify_code = false; + // static bool truncate_spp_mrp = false; + + if (num_passes > 1 && lengths2 == 0) + { + OJPH_WARN(0x00010001, "A malformed codeblock that has more than " + "one coding pass, but zero length for " + "2nd and potential 3rd pass."); + num_passes = 1; + } + + if (num_passes > 3) + { + OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " + "This codeblocks has %d passes.", + num_passes); + return false; + } + + // if (missing_msbs > 30) // p < 0 + // { + // if (insufficient_precision == false) + // { + // insufficient_precision = true; + // OJPH_WARN(0x00010003, "32 bits are not enough to decode this " + // "codeblock. This message will not be " + // "displayed again."); + // } + // return false; + // } + // else if (missing_msbs == 30) // p == 0 + // { // not enough precision to decode and set the bin center to 1 + // if (modify_code == false) { + // modify_code = true; + // OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " + // "pass. The code can be modified to support " + // "this case. This message will not be " + // "displayed again."); + // } + // return false; // 32 bits are not enough to decode this + // } + // else if (missing_msbs == 29) // if p is 1, then num_passes must be 1 + // { + // if (num_passes > 1) { + // num_passes = 1; + // if (truncate_spp_mrp == false) { + // truncate_spp_mrp = true; + // OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " + // "nor MagRef passes; both will be skipped. " + // "This message will not be displayed " + // "again."); + // } + // } + // } + ui32 p = 62 - missing_msbs; // The least significant bitplane for CUP + // There is a way to handle the case of p == 0, but a different path + // is required + + if (lengths1 < 2) + { + OJPH_WARN(0x00010006, "Wrong codeblock length."); + return false; + } + + // read scup and fix the bytes there + int lcup, scup; + lcup = (int)lengths1; // length of CUP + //scup is the length of MEL + VLC + scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF); + if (scup < 2 || scup > lcup || scup > 4079) //something is wrong + return false; + + // The temporary storage scratch holds two types of data in an + // interleaved fashion. The interleaving allows us to use one + // memory pointer. + // We have one entry for a decoded VLC code, and one entry for UVLC. 
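+ // As an illustrative sketch (indices inferred from the code below),
+ // quad q of quad-row r is addressed as
+ // ui16 vlc_entry = scratch[r * sstr + 2 * q]; // e_k, e_1, rho
+ // ui16 uvlc_entry = scratch[r * sstr + 2 * q + 1]; // u_q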
+ // Entries are 16 bits each, corresponding to one quad, + // but since we want to use XMM registers of the SSE family + // of SIMD; we allocated 16 bytes or more per quad row; that is, + // the width is no smaller than 16 bytes (or 8 entries), and the + // height is 512 quads + // Each VLC entry contains, in the following order, starting + // from MSB + // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits) + // Each entry in UVLC contains u_q + // One extra row to handle the case of SPP propagating downwards + // when codeblock width is 4 + ui16 scratch[8 * 513] = {0}; // 8 kB + + // We need an extra two entries (one inf and one u_q) beyond + // the last column. + // If the block width is 4 (2 quads), then we use sstr of 8 + // (enough for 4 quads). If width is 8 (4 quads) we use + // sstr is 16 (enough for 8 quads). For a width of 16 (8 + // quads), we use 24 (enough for 12 quads). + ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8 + + ui32 mmsbp2 = missing_msbs + 2; + + // The cleanup pass is decoded in two steps; in step one, + // the VLC and MEL segments are decoded, generating a record that + // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k. + // This information should be sufficient for the next step. + // In step 2, we decode the MagSgn segment. + + // step 1 decoding VLC and MEL segments + { + // init structures + dec_mel_st mel; + mel_init(&mel, coded_data, lcup, scup); + rev_struct vlc; + rev_init8(&vlc, coded_data, lcup, scup); + + int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm + // data represented as runs of 0 events + // See mel_decode description + + ui64 vlc_val; + ui32 c_q = 0; + ui16 *sp = scratch; + //initial quad row + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // first quad + vlc_val = rev_fetch64(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 1 in ITU T.814 + c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2); + + //remove data from vlc stream (0 bits are removed if vlc is not used) + vlc_val = rev_advance64(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? 
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          // remove data from the vlc stream; if qinf is not used, cwdlen
+          // is 0
+          vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0) // if both u_offsets are set, get an event
+          {                      // from the MEL run of events
+            run -= 2; // subtract 2, since the number of events is
+                      // multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode
+                                                 // by 0x40
+
+            if (run < 0) // if the run is consumed (run is -1 or -2),
+              run = mel_get_run(&mel); // get another run
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+
+          // decode uvlc_mode to get u for both quads
+          ui32 idx = uvlc_mode + (ui32)(vlc_val & 0x3F);
+          ui32 uvlc_entry = uvlc_tbl0[idx];
+          ui16 u_bias = uvlc_bias[idx];
+          // remove total prefix length
+          vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          // extract suffixes for quads 0 and 1
+          ui32 len = uvlc_entry & 0xF;  // suffix length for 2 quads
+          ui32 tmp = (ui32)(vlc_val & ((1U << len) - 1)); // suffix value
+                                                          // for 2 quads
+          vlc_val = rev_advance64(&vlc, len);
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+          ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len));
+
+          // decode u_q extensions, which are needed only when u_q > 32
+          ui16 u_ext; bool cond0, cond1;
+          cond0 = u_q0 - (u_bias & 0x3) > 32;
+          u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+          u_q0 = (ui16)(u_q0 + (u_ext << 2));
+          sp[1] = (ui16)(u_q0 + 1); // kappa = 1
+          cond1 = u_q1 - (u_bias >> 2) > 32;
+          u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+          u_q1 = (ui16)(u_q1 + (u_ext << 2));
+          sp[3] = (ui16)(u_q1 + 1); // kappa = 1
+        }
+        sp[0] = sp[1] = 0;
+
+        // non-initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0; // context
+          ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch64(&vlc);
+
+            // decode VLC using the context c_q and the head of the VLC
+            // bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) // zero context
+            {
+              run -= 2; // subtract 2, since the number of events is
+                        // multiplied by 2
+
+              // Is the run terminated by a 1? If so, use the decoded VLC
+              // code; otherwise, discard the decoded data, since we will
+              // decode again using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel); // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[0] = t0;
+            x += 2;
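            // An annotation, not part of the patch: for non-initial rows,
            // the VLC context combines significance from this row's
            // previous quad (w, sw, nw) with the row above (n, ne, nf),
            // read from the scratch row at a -sstr offset -- exactly the
            // computation performed by the lines that follow:
            //
            //   static inline ui32 row_context(ui16 t, const ui16* sp,
            //                                  ui32 sstr)
            //   {
            //     ui32 c_q;
            //     c_q  = ((t & 0x40U) << 2) | ((t & 0x80U) << 1); // w, sw
            //     c_q |= sp[0 - (si32)sstr] & 0x80;               // nw
            //     c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);     // n, ne
            //     c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);     // nf
            //     return c_q;
            //   }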
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            // remove data from the vlc stream (0 bits are removed if vlc
+            // is unused)
+            vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+            // second quad
+            ui16 t1 = 0;
+
+            // decode VLC using the context c_q and the head of the VLC
+            // bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) // zero context
+            {
+              run -= 2; // subtract 2, since the number of events is
+                        // multiplied by 2
+
+              // if the event is 0, discard the decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run?
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            // remove data from the vlc stream; if qinf is not used,
+            // cwdlen is 0
+            vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            // remove total prefix length
+            vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            // extract suffixes for quads 0 and 1
+            ui32 len = uvlc_entry & 0xF;  // suffix length for 2 quads
+            ui32 tmp = (ui32)(vlc_val & ((1U << len) - 1)); // suffix value
+                                                            // for 2 quads
+            vlc_val = rev_advance64(&vlc, len);
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+            ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+
+            // decode u_q extensions, which are needed only when u_q > 32
+            ui16 u_ext; bool cond0, cond1;
+            cond0 = u_q0 > 32;
+            u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+            u_q0 = (ui16)(u_q0 + (u_ext << 2));
+            sp[1] = u_q0;
+            cond1 = u_q1 > 32;
+            u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+            u_q1 = (ui16)(u_q1 + (u_ext << 2));
+            sp[3] = u_q1;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step 2: we decode MagSgn
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We need an extra entry to handle the case of vp[1]
+        // when vp is at the last column.
+        // Here, we allocate 4 instead of 1 to make the buffer size
+        // a multiple of 16 bytes.
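        // An illustrative sketch, not part of the patch: the v_n values
        // saved in this scratch row feed the exponent bound of the next
        // quad row. For non-initial rows, step 2 below reconstructs
        // U_q = u_q + kappa, where kappa depends on the saved v_n of the
        // two quads above (the count_leading_zeros overload for ui64 is
        // the one this patch adds to ojph_arch.h):
        //
        //   static inline ui32 compute_U_q(ui32 inf, ui32 u_q,
        //                                  ui64 v_n_left, ui64 v_n_right)
        //   {
        //     ui32 gamma = inf & 0xF0;
        //     gamma &= gamma - 0x10;  // nonzero iff 2+ significant samples
        //     ui32 emax =
        //       63 - count_leading_zeros(2 | v_n_left | v_n_right);
        //     ui32 kappa = gamma ? emax : 1;  // emax here is E_max - 1
        //     return u_q + kappa;
        //   }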
+ const int v_n_size = 512 + 4; + ui64 v_n_scratch[v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init8<0xFF>(&magsgn, coded_data, lcup - scup); + + const ui16 *sp = scratch; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data; + + ui64 prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 U_q = sp[1]; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + + for (ui32 y = 2; y < height; y += 2) + { + const ui16 *sp = scratch + (y >> 1) * sstr; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data + y * stride; + + prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 u_q = sp[1]; + + ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? + ui32 emax = 63 - count_leading_zeros(2 | vp[0] | vp[1]); // emax-1 + ui32 kappa = gamma ? 
emax : 1; + + ui32 U_q = u_q + kappa; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + } + } + + if (num_passes > 1) + { + // We use scratch again, we can divide it into multiple regions + // sigma holds all the significant samples, and it cannot + // be modified after it is set. 
it will be used during the
+      // Magnitude Refinement Pass.
+      ui16* const sigma = scratch;
+
+      ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                       // ui16 contains 4 columns
+      mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+      // We re-arrange quad significance, where each 4 consecutive
+      // bits represent one quad, into column significance, where
+      // each 4 consecutive bits represent one column of 4 rows
+      {
+        ui32 y;
+        for (y = 0; y < height; y += 4)
+        {
+          ui16* sp = scratch + (y >> 1) * sstr;
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) {
+            ui32 t0 = 0, t1 = 0;
+            t0  = ((sp[0     ] & 0x30u) >> 4) | ((sp[0     ] & 0xC0u) >> 2);
+            t0 |= ((sp[2     ] & 0x30u) << 4) | ((sp[2     ] & 0xC0u) << 6);
+            t1  = ((sp[0+sstr] & 0x30u) >> 2) | ((sp[0+sstr] & 0xC0u)     );
+            t1 |= ((sp[2+sstr] & 0x30u) << 6) | ((sp[2+sstr] & 0xC0u) << 8);
+            dp[0] = (ui16)(t0 | t1);
+          }
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+        {
+          // reset one row after the codeblock
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, ++dp)
+            dp[0] = 0;
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+      }
+
+      // We perform the Significance Propagation Pass here
+      {
+        // This stores significance information of the previous
+        // 4 rows. Significance information in this array includes
+        // all significant samples in bitplane p - 1; that is,
+        // significant samples for bitplane p (discovered during the
+        // cleanup pass and stored in sigma) and samples that have recently
+        // become significant (during the SPP) in bitplane p-1.
+        // We store enough for the widest row, containing 1024 columns,
+        // which is equivalent to 256 ui16 entries, since each stores 4
+        // columns. We add an extra 8 entries, just in case we need more.
+        ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+        frwd_struct sigprop;
+        frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 pattern = 0xFFFFu; // a pattern of needed samples
+          if (height - y < 4) {
+            pattern = 0x7777u;
+            if (height - y < 3) {
+              pattern = 0x3333u;
+              if (height - y < 2)
+                pattern = 0x1111u;
+            }
+          }
+
+          // prev holds sig. info. for the previous quad, together
+          // with the rows on top of it and below it.
+          ui32 prev = 0;
+          ui16 *prev_sig = prev_row_sig;
+          ui16 *cur_sig = sigma + (y >> 2) * mstr;
+          ui64 *dpp = decoded_data + y * stride;
+          for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig)
+          {
+            // only rows and columns inside the stripe are included
+            si32 s = (si32)x + 4 - (si32)width;
+            s = ojph_max(s, 0);
+            pattern = pattern >> (s * 4);
+
+            // We first find locations that need to be tested (potential
+            // SPP members); these locations will end up in mbr.
+            // In each iteration, we produce 16 bits because cwd can have
+            // up to 16 bits of significance information, followed by the
+            // corresponding 16 bits of sign information; therefore, it is
+            // sufficient to fetch 32 bits of data per loop.
+
+            // Although we are interested in 16 bits only, we load 32 bits.
+            // For the 16 bits we are producing, we need the next 4 bits --
+            // we need data for at least 5 columns out of 8.
+            // Therefore loading 32 bits is easier than loading 16 bits
+            // twice.
+            ui32 ps = *(ui32*)prev_sig;
+            ui32 ns = *(ui32*)(cur_sig + mstr);
+            ui32 u = (ps & 0x88888888) >> 3;   // the row on top
+            if (!stripe_causal)
+              u |= (ns & 0x11111111) << 3;     // the row below
+
+            ui32 cs = *(ui32*)cur_sig;
+            // vertical integration
+            ui32 mbr = cs;                     // this sig. info.
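            // An annotation, not part of the patch: the dilation below
            // works on nibbles, where each nibble is one column of 4 rows.
            // Shifting by 1 within a nibble moves significance one row,
            // shifting by 4 moves it one column, and u contributes the
            // rows from the quad rows above and below:
            //
            //   ui32 dilate(ui32 cs, ui32 u, ui32 prev)
            //   {
            //     ui32 m = cs | ((cs & 0x77777777) << 1)  // above
            //                 | ((cs & 0xEEEEEEEE) >> 1)  // below
            //                 | u;                        // top/bottom rows
            //     return m | (m << 4) | (m >> 4)          // left/right
            //              | (prev >> 12);                // previous group
            //   }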
+ mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + ui64 cwd = frwd_fetch<0>(&sigprop); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + // new_sig has newly-discovered sig. samples during SPP + // find the signs and update decoded_data + ui64 *dp = dpp + x; + ui64 val = 3u << (p - 2); + col_mask = 0xFu; + for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan 4 signs + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + assert(dp[0] == 0); + dp[0] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[stride] == 0); + dp[stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[2 * stride] == 0); + dp[2 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[3 * stride] == 0); + dp[3 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. 
from the row on top and below + prev = new_sig | u; + // we need only the bits in 0xF000 + prev &= 0xF000; + } + } + } + + // We perform Magnitude Refinement Pass here + if (num_passes > 2) + { + rev_struct magref; + rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr); + ui64 *dpp = decoded_data + y * stride; + ui64 half = 1ULL << (p - 2); + for (ui32 i = 0; i < width; i += 8) + { + //Process one entry from sigma array at a time + // Each nibble (4 bits) in the sigma array represents 4 rows, + // and the 32 bits contain 8 columns + ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data + ui32 sig = *cur_sig++; // 32 bit that will be processed now + ui32 col_mask = 0xFu; // a mask for a column in sig + if (sig) // if any of the 32 bits are set + { + for (int j = 0; j < 8; ++j) //one column at a time + { + if (sig & col_mask) // lowest nibble + { + ui64 *dp = dpp + i + j; // next column in decoded samples + ui32 sample_mask = 0x11111111u & col_mask; //LSB + + for (int k = 0; k < 4; ++k) { + if (sig & sample_mask) //if LSB is set + { + assert(dp[0] != 0); // decoded value cannot be zero + assert((dp[0] & half) == 0); // no half + ui64 sym = cwd & 1; // get it value + sym = (1 - sym) << (p - 1); // previous center of bin + sym |= half; // put half the center of bin + dp[0] ^= sym; // remove old bin center and put new + cwd >>= 1; // consume word + } + sample_mask += sample_mask; //next row + dp += stride; // next samples row + } + } + col_mask <<= 4; //next column + } + } + // consume data according to the number of bits set + rev_advance_mrp(&magref, population_count(sig)); + } + } + } + } + return true; + } + } +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp index 2023ef1..ffc9e8d 100644 --- a/src/core/coding/ojph_block_encoder.cpp +++ b/src/core/coding/ojph_block_encoder.cpp @@ -65,11 +65,12 @@ namespace ojph { static ui16 vlc_tbl1[2048] = { 0 }; //UVLC encoding - static int ulvc_cwd_pre[33]; - static int ulvc_cwd_pre_len[33]; - static int ulvc_cwd_suf[33]; - static int ulvc_cwd_suf_len[33]; - + const int num_uvlc_entries = 75; + struct uvlc_tbl_struct { + ui8 pre, pre_len, suf, suf_len, ext, ext_len; + }; + static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]; + ///////////////////////////////////////////////////////////////////////// static bool vlc_init_tables() { @@ -194,23 +195,61 @@ namespace ojph { static bool uvlc_init_tables() { //code goes from 0 to 31, extension and 32 are not supported here - ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; - ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; - ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; - ulvc_cwd_pre_len[2] = 2; - ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; - ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; - ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; - ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; - ulvc_cwd_suf_len[2] = 0; - ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + uvlc_tbl[0].pre = 0; + uvlc_tbl[0].pre_len = 0; + uvlc_tbl[0].suf = 0; + uvlc_tbl[0].suf_len = 0; + uvlc_tbl[0].ext = 0; + uvlc_tbl[0].ext_len = 0; + + uvlc_tbl[1].pre = 1; + uvlc_tbl[1].pre_len = 1; + uvlc_tbl[1].suf = 0; + uvlc_tbl[1].suf_len = 0; + uvlc_tbl[1].ext = 0; + uvlc_tbl[1].ext_len = 0; + + uvlc_tbl[2].pre = 2; + uvlc_tbl[2].pre_len = 2; + uvlc_tbl[2].suf = 0; + uvlc_tbl[2].suf_len = 0; + uvlc_tbl[2].ext = 0; + uvlc_tbl[2].ext_len 
= 0; + + uvlc_tbl[3].pre = 4; + uvlc_tbl[3].pre_len = 3; + uvlc_tbl[3].suf = 0; + uvlc_tbl[3].suf_len = 1; + uvlc_tbl[3].ext = 0; + uvlc_tbl[3].ext_len = 0; + + uvlc_tbl[4].pre = 4; + uvlc_tbl[4].pre_len = 3; + uvlc_tbl[4].suf = 1; + uvlc_tbl[4].suf_len = 1; + uvlc_tbl[4].ext = 0; + uvlc_tbl[4].ext_len = 0; + for (int i = 5; i < 33; ++i) { - ulvc_cwd_pre[i] = 0; - ulvc_cwd_pre_len[i] = 3; - ulvc_cwd_suf[i] = i-5; - ulvc_cwd_suf_len[i] = 5; + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(i - 5); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = 0; + uvlc_tbl[i].ext_len = 0; } + + for (int i = 33; i < num_uvlc_entries; ++i) + { + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(28 + (i - 33) % 4); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = (ui8)((i - 33) / 4); + uvlc_tbl[i].ext_len = 4; + } + return true; } @@ -440,6 +479,29 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode64(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= (ui32)((cwd & ((1ULL << t) - 1)) << msp->used_bits); + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + ////////////////////////////////////////////////////////////////////////// static inline void ms_terminate(ms_struct* msp) @@ -467,11 +529,11 @@ namespace ojph { // // ////////////////////////////////////////////////////////////////////////// - void ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded) + void ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) { assert(num_passes == 1); (void)num_passes; //currently not used @@ -693,23 +755,23 @@ namespace ojph { if (u_q0 > 2 && u_q1 > 2) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0-2], ulvc_cwd_pre_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1-2], ulvc_cwd_pre_len[u_q1-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0-2], ulvc_cwd_suf_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1-2], ulvc_cwd_suf_len[u_q1-2]); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); } else if (u_q0 > 2 && u_q1 > 0) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); vlc_encode(&vlc, u_q1 - 1, 1); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); } else { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1], ulvc_cwd_pre_len[u_q1]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1], ulvc_cwd_suf_len[u_q1]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, 
uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len);
         }
 
         //prepare for next iteration
@@ -910,10 +972,514 @@
           ms_encode(&ms, s[7] & ((1U<<m)-1), m);
         }
 
         //prepare for next iteration
         c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2);
+        s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0;
+        e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0;
+        rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0;
+      }
+    }
+
+
+    terminate_mel_vlc(&mel, &vlc);
+    ms_terminate(&ms);
+
+    //copy to elastic
+    lengths[0] = mel.pos + vlc.pos + ms.pos;
+    elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
+    memcpy(coded->buf, ms.buf, ms.pos);
+    memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
+    memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
+
+    // put in the interface locator word
+    ui32 num_bytes = mel.pos + vlc.pos;
+    coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
+    coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
+    coded->buf[lengths[0]-2] =
+      (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
+
+    coded->avail_size -= lengths[0];
+  }
+
+  //////////////////////////////////////////////////////////////////////////
+  //
+  //
+  //
+  //
+  //////////////////////////////////////////////////////////////////////////
+  void ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes,
+                               ui32 width, ui32 height, ui32 stride,
+                               ui32* lengths,
+                               ojph::mem_elastic_allocator *elastic,
+                               ojph::coded_lists *& coded)
+  {
+    assert(num_passes == 1);
+    (void)num_passes; //currently not used
+    // 38 bits/sample + 1 color + 4 wavelet = 43 bits per sample.
+    // * 4096 samples / 8 bits per byte = 22016; then rounded up to the
+    // nearest 1 kB, giving 22528. This is expanded further to take into
+    // consideration stuffing at a max rate of 16 bits per 15 bits
+    // (1 bit for every 15 bits of data); in reality, it is much smaller
+    // than this.
+    const int ms_size = (22528 * 16 + 14) / 15; //more than enough
+    ui8 ms_buf[ms_size];
+    // For each quad, we need at most 7 bits for VLC and 12 bits for UVLC.
+    // So we have 1024 quads * 19 / 8, which is 2432. This must be
+    // multiplied by 16 / 15 to accommodate stuffing.
+    // The mel is at most around 1 bit/quad, giving around 128 bytes -- in
+    // practice there was one case where it got to 132 bytes. Even
+    // accounting for stuffing, it is smaller than 192. Therefore,
+    // 3072 is more than enough.
+    const int mel_vlc_size = 3072; //more than enough
+    ui8 mel_vlc_buf[mel_vlc_size];
+    const int mel_size = 192;
+    ui8 *mel_buf = mel_vlc_buf;
+    const int vlc_size = mel_vlc_size - mel_size;
+    ui8 *vlc_buf = mel_vlc_buf + mel_size;
+
+    mel_struct mel;
+    mel_init(&mel, mel_size, mel_buf);
+    vlc_struct vlc;
+    vlc_init(&vlc, vlc_size, vlc_buf);
+    ms_struct ms;
+    ms_init(&ms, ms_size, ms_buf);
+
+    ui32 p = 62 - missing_msbs;
+
+    //e_val: E values for a line (these are the highest set bit)
+    //cx_val: is the context values
+    //Each byte stores the info for two samples. For E, it is the maximum
+    // of the two samples, while for cx, it is the OR of these two samples.
+    //The maximum is between the pixel at the bottom left of one quad
+    // and the bottom right of the earlier quad. The same is true for cx.
+ //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + ui8 e_val[513]; + ui8 cx_val[513]; + ui8* lep = e_val; lep[0] = 0; + ui8* lcxp = cx_val; lcxp[0] = 0; + + //initial row of quads + int e_qmax[2] = {0,0}, e_q[8] = {0,0,0,0,0,0,0,0}; + int rho[2] = {0,0}; + int c_q0 = 0; + ui64 s[8] = {0,0,0,0,0,0,0,0}, val, t; + ui32 y = 0; + ui64 *sp = buf; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL; // 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int Uq0 = ojph_max(e_qmax[0], 1); //kappa_q = 1 + int u_q0 = Uq0 - 1, u_q1 = 0; //kappa_q = 1 + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + + ui16 tuple0 = vlc_tbl0[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int c_q1 = (rho[0] >> 1) | (rho[0] & 1); + int Uq1 = ojph_max(e_qmax[1], 1); //kappa_q = 1 + u_q1 = Uq1 - 1; //kappa_q = 1 + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + lep[0] = (ui8)e_q[7]; + lcxp[0] |= (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl0[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? 
Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + if (u_q0 > 0 && u_q1 > 0) + mel_encode(&mel, ojph_min(u_q0, u_q1) > 2); + + if (u_q0 > 2 && u_q1 > 2) + { + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].ext, uvlc_tbl[u_q0-2].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].ext, uvlc_tbl[u_q1-2].ext_len); + } + else if (u_q0 > 2 && u_q1 > 0) + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, u_q1 - 1, 1); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + } + else + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); + } + + //prepare for next iteration + c_q0 = (rho[1] >> 1) | (rho[1] & 1); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + + lep[1] = 0; + + for (y = 2; y < height; y += 2) + { + lep = e_val; + int max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = 0; + lcxp = cx_val; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = 0; + + sp = buf + y * stride; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int kappa = (rho[0] & (rho[0]-1)) ? 
ojph_max(1,max_e) : 1; + int Uq0 = ojph_max(e_qmax[0], kappa); + int u_q0 = Uq0 - kappa, u_q1 = 0; + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + int c_q1 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + ui16 tuple0 = vlc_tbl1[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + kappa = (rho[1] & (rho[1]-1)) ? ojph_max(1,max_e) : 1; + c_q1 |= ((rho[0] & 4) >> 1) | ((rho[0] & 8) >> 2); + int Uq1 = ojph_max(e_qmax[1], kappa); + u_q1 = Uq1 - kappa; + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[7]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl1[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? 
Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); //prepare for next iteration c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2); diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h index 43d32d8..72b3c0d 100644 --- a/src/core/coding/ojph_block_encoder.h +++ b/src/core/coding/ojph_block_encoder.h @@ -52,11 +52,18 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void - ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded); + ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); + + void + ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, @@ -71,6 +78,9 @@ namespace ojph { ui32 stride, ui32* lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *& coded); + + bool initialize_block_encoder_tables_avx2(); + bool initialize_block_encoder_tables_avx512(); } } diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp b/src/core/coding/ojph_block_encoder_avx2.cpp index d579f83..7624272 100644 --- a/src/core/coding/ojph_block_encoder_avx2.cpp +++ b/src/core/coding/ojph_block_encoder_avx2.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -218,18 +218,18 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables_avx2() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - bool result; - result = vlc_init_tables(); - result = result && uvlc_init_tables(); - return result; - } - return false; - } + static bool tables_initialized = false; ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables_avx2(); + bool initialize_block_encoder_tables_avx2() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } ///////////////////////////////////////////////////////////////////////// // diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp index 9df0e8e..b35373a 100644 --- a/src/core/coding/ojph_block_encoder_avx512.cpp +++ 
b/src/core/coding/ojph_block_encoder_avx512.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -218,18 +218,18 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { - bool result; - result = vlc_init_tables(); - result = result && uvlc_init_tables(); - return result; - } - return false; - } + static bool tables_initialized = false; ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables(); + bool initialize_block_encoder_tables_avx512() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } ///////////////////////////////////////////////////////////////////////// // diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 947f25b..29ab7a5 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -166,6 +166,32 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// + static inline ui32 population_count64(ui64 val) + { + #if defined(OJPH_COMPILER_MSVC) \ + && (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + return (ui32)__popcnt64(val); + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_popcountll(val); + #else + const ui64 k1 = 0x5555555555555555ull; + const ui64 k2 = 0x3333333333333333ull; + const ui64 k4 = 0x0F0F0F0F0F0F0F0Full; + const ui64 kf = 0x0101010101010101ull; + + // put count of each 2 bits into those 2 bits + val = val - ((val >> 1) & k1); + // put count of each 4 bits into those 4 bits + val = (val & k2) + ((val >> 2) & k2); + // put count of each 8 bits into those 8 bits + val = (val + (val >> 4)) & k4 ; + // returns 8 most significant bits of x + (x<<8) + (x<<16) + (x<<24) + ... 
+ val = (val * kf) >> 56; + return (ui32) val; + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanReverse) @@ -188,6 +214,29 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// +#ifdef OJPH_COMPILER_MSVC + #pragma intrinsic(_BitScanReverse64) +#endif + static inline ui32 count_leading_zeros(ui64 val) + { + #ifdef OJPH_COMPILER_MSVC + unsigned long result = 0; + _BitScanReverse64(&result, val); + return 63 ^ (ui32)result; + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_clzll(val); + #else + val |= (val >> 1); + val |= (val >> 2); + val |= (val >> 4); + val |= (val >> 8); + val |= (val >> 16); + val |= (val >> 32); + return 64 - population_count64(val); + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanForward) @@ -237,9 +286,15 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // constants //////////////////////////////////////////////////////////////////////////// - const ui32 byte_alignment = 64; // 64 bytes == 512 bits - const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); - const ui32 object_alignment = 8; + #ifndef OJPH_EMSCRIPTEN + const ui32 byte_alignment = 64; // 64 bytes == 512 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #else + const ui32 byte_alignment = 16; // 16 bytes == 128 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #endif //////////////////////////////////////////////////////////////////////////// // templates for alignment @@ -247,17 +302,17 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // finds the size such that it is a multiple of byte_alignment - template + template size_t calc_aligned_size(size_t size) { size = size * sizeof(T) + N - 1; size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1); - size >>= (31 - count_leading_zeros(sizeof(T))); + size >>= (63 - count_leading_zeros((ui64)sizeof(T))); return size; } //////////////////////////////////////////////////////////////////////////// // moves the pointer to first address that is a multiple of byte_alignment - template + template inline T *align_ptr(T *ptr) { intptr_t p = reinterpret_cast(ptr); p += N - 1; diff --git a/src/core/common/ojph_codestream.h b/src/core/common/ojph_codestream.h index c28096e..f7a8065 100644 --- a/src/core/common/ojph_codestream.h +++ b/src/core/common/ojph_codestream.h @@ -61,7 +61,7 @@ namespace ojph { class comment_exchange; class mem_fixed_allocator; struct point; - struct line_buf; + class line_buf; class outfile_base; class infile_base; diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index d7497cd..99897f3 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -132,9 +132,23 @@ namespace ojph { }; ///////////////////////////////////////////////////////////////////////////// - struct line_buf + class line_buf { - line_buf() : size(0), pre_size(0), i32(0) {} + public: + enum : ui32 { + LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized + // These flags reflects data size in bytes + LFT_BYTE = 0x01, // Set when data is 1 byte + LFT_16BIT = 0x02, // Set when data is 2 bytes + LFT_32BIT = 0x04, // Set when data is 4 bytes + 
LFT_64BIT = 0x08, // Set when data is 8 bytes + LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding + // Not all combinations are useful + LFT_SIZE_MASK = 0x0F, // To extract data size + }; + + public: + line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {} template void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size) @@ -153,9 +167,12 @@ namespace ojph { size_t size; ui32 pre_size; + ui32 flags; union { - si32* i32; - float* f32; + si32* i32; // 32bit integer type, used for lossless compression + si64* i64; // 64bit integer type, used for lossless compression + float* f32; // float type, used for lossy compression + void* p; // no type is associated with the pointer }; }; diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 2f3adcc..00faf75 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -34,5 +34,5 @@ //***************************************************************************/ #define OPENJPH_VERSION_MAJOR 0 -#define OPENJPH_VERSION_MINOR 17 +#define OPENJPH_VERSION_MINOR 18 #define OPENJPH_VERSION_PATCH 0 diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp index b70d51e..0bb0b5f 100644 --- a/src/core/others/ojph_mem.cpp +++ b/src/core/others/ojph_mem.cpp @@ -65,22 +65,42 @@ namespace ojph { f32 = p->post_alloc_data(size, pre_size); } + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::finalize_alloc(mem_fixed_allocator *p) + { + assert(p != 0 && size != 0); + i64 = p->post_alloc_data(size, pre_size); + } + //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size) { - i32 = buffer; + this->i32 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_32BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(float *buffer, size_t num_ele, ui32 pre_size) { - f32 = buffer; + this->f32 = buffer; + this->size = num_ele; + this->pre_size = pre_size; + this->flags = LFT_32BIT; + } + + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::wrap(si64 *buffer, size_t num_ele, ui32 pre_size) + { + this->i64 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_64BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index ca96d2d..a98b477 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -39,19 +39,28 @@ #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { + + // defined elsewhere + class line_buf; + namespace local { ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width) = NULL; + void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_nlt_type3) - (const si32* sp, si32* dp, int shift, ui32 width) = NULL; + void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 
src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float_shftd) @@ -71,13 +80,13 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// void (*ict_forward) @@ -100,8 +109,8 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) - cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = gen_cnvrt_si32_to_si32_nlt_type3; + rev_convert = gen_rev_convert; + rev_convert_nlt_type3 = gen_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = gen_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd; @@ -130,10 +139,10 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { + rev_convert = sse2_rev_convert; + rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3; cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; - cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = sse2_cnvrt_si32_to_si32_nlt_type3; rct_forward = sse2_rct_forward; rct_backward = sse2_rct_backward; } @@ -154,8 +163,8 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = avx2_cnvrt_si32_to_si32_nlt_type3; + rev_convert = avx2_rev_convert; + rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; rct_forward = avx2_rct_forward; rct_backward = avx2_rct_backward; } @@ -168,8 +177,9 @@ namespace ojph { #endif // !OJPH_DISABLE_SIMD #else // OJPH_ENABLE_WASM_SIMD - cnvrt_si32_to_si32_shftd = wasm_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = wasm_cnvrt_si32_to_si32_nlt_type3; + + rev_convert = wasm_rev_convert; + rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; @@ -178,6 +188,7 @@ namespace ojph { rct_backward = wasm_rct_backward; ict_forward = wasm_ict_forward; ict_backward = wasm_ict_backward; + #endif // !OJPH_ENABLE_WASM_SIMD colour_transform_functions_initialized = true; @@ -201,20 +212,78 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (ui32 i = width; i > 0; --i) - *dp++ = *sp++ + 
shift;
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          si32 s = (si32)shift;
+          for (ui32 i = width; i > 0; --i)
+            *dp++ = *sp++ + s;
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          for (ui32 i = width; i > 0; --i)
+            *dp++ = *sp++ + shift;
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        for (ui32 i = width; i > 0; --i)
+          *dp++ = (si32)(*sp++ + shift);
+      }
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp,
-                                          int shift, ui32 width)
+    void gen_rev_convert_nlt_type3(
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
+      si64 shift, ui32 width)
     {
-      for (ui32 i = width; i > 0; --i) {
-        const si32 v = *sp++;
-        *dp++ = v >= 0 ? v : (- v - shift);
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          si32 s = (si32)shift;
+          for (ui32 i = width; i > 0; --i) {
+            const si32 v = *sp++;
+            *dp++ = v >= 0 ? v : (- v - s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          for (ui32 i = width; i > 0; --i) {
+            const si64 v = *sp++;
+            *dp++ = v >= 0 ? v : (- v - shift);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        for (ui32 i = width; i > 0; --i) {
+          const si64 v = *sp++;
+          *dp++ = (si32)(v >= 0 ?
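/*
  rev_convert above generalizes the old cnvrt_si32_to_si32_shftd: it moves
  samples between 32-bit and 64-bit lines while applying a DC offset, and
  the shift parameter is now si64 because with bit depths up to 32 the
  offset no longer fits in si32. A scalar sketch of the overflow this
  avoids (illustrative only):

  #include <cstdint>

  int64_t dc_offset(uint32_t bit_depth) {    // e.g. bit_depth == 32
    return int64_t(1) << (bit_depth - 1);    // 1 << 31 would overflow si32
  }
  // widening a 32-bit sample into a 64-bit line:
  //   dst[i] = int64_t(src[i]) + dc_offset(32);   // exact, no wraparound
*/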
v : (- v - shift)); + } } } @@ -251,26 +320,104 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - *y++ = (*r + (*g << 1) + *b) >> 2; - *cb++ = (*b++ - *g); - *cr++ = (*r++ - *g++); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (ui32 i = repeat; i > 0; --i) + { + si64 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } } } ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 yy = *yp++, cbb = *cbp++, crr = *crp++; + si32 gg = yy - ((cbb + crr) >> 2); + *rp++ = crr + gg; + *gp++ = gg; + *bp++ = cbb + gg; + } + } + else { - *g = *y++ - ((*cb + *cr)>>2); - *b++ = *cb++ + *g; - *r++ = *cr++ + *g++; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + 
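/*
  Both gen_rct_forward paths above implement the Part-1 reversible colour
  transform (RCT); the 64-bit branch only widens the arithmetic. The
  property that matters is the exact round trip with gen_rct_backward,
  which a small scalar check makes explicit (illustrative):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int64_t r = -4; r <= 4; ++r)
      for (int64_t g = -4; g <= 4; ++g)
        for (int64_t b = -4; b <= 4; ++b) {
          int64_t y  = (r + 2 * g + b) >> 2;  // forward RCT
          int64_t cb = b - g;
          int64_t cr = r - g;
          int64_t g2 = y - ((cb + cr) >> 2);  // backward RCT
          assert(g2 == g && cb + g2 == b && cr + g2 == r);
        }
    return 0;
  }
*/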
si64 yy = *yp++, cbb = *cbp++, crr = *crp++;
+          si64 gg = yy - ((cbb + crr) >> 2);
+          *rp++ = (si32)(crr + gg);
+          *gp++ = (si32)gg;
+          *bp++ = (si32)(cbb + gg);
+        }
+      }
     }

diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h
index 52df312..cc42aaa 100644
--- a/src/core/transform/ojph_colour.h
+++ b/src/core/transform/ojph_colour.h
@@ -40,18 +40,26 @@
 #define OJPH_COLOR_H

 namespace ojph {
+
+  // defined elsewhere
+  class line_buf;
+
   namespace local {

   ////////////////////////////////////////////////////////////////////////////
   void init_colour_transform_functions();

   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_si32_shftd)
-    (const si32 *sp, si32 *dp, int shift, ui32 width);
+  extern void (*rev_convert)
+    (const line_buf *src_line, const ui32 src_line_offset,
+     line_buf *dst_line, const ui32 dst_line_offset,
+     si64 shift, ui32 width);

   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_si32_nlt_type3)
-    (const si32 *sp, si32 *dp, int shift, ui32 width);
+  extern void (*rev_convert_nlt_type3)
+    (const line_buf *src_line, const ui32 src_line_offset,
+     line_buf *dst_line, const ui32 dst_line_offset,
+     si64 shift, ui32 width);

   ////////////////////////////////////////////////////////////////////////////
   extern void (*cnvrt_si32_to_float_shftd)
@@ -71,13 +79,13 @@ namespace ojph {

   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_forward)
-    (const si32 *r, const si32 *g, const si32 *b,
-     si32 *y, si32 *cb, si32 *cr, ui32 repeat);
+    (const line_buf *r, const line_buf *g, const line_buf *b,
+     line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);

   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_backward)
-    (const si32 *y, const si32 *cb, const si32 *cr,
-     si32 *r, si32 *g, si32 *b, ui32 repeat);
+    (const line_buf *y, const line_buf *cb, const line_buf *cr,
+     line_buf *r, line_buf *g, line_buf *b, ui32 repeat);

   ////////////////////////////////////////////////////////////////////////////
   extern void (*ict_forward)

diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 14e5a35..05bff31 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -35,10 +35,12 @@
 // Date: 11 October 2019
 //***************************************************************************/

+#include <climits>
 #include <cmath>

 #include "ojph_defs.h"
 #include "ojph_arch.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"

 #include <immintrin.h>

@@ -46,82 +48,392 @@ namespace ojph {
   namespace local {

+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
+    {
+      // note that m must be obtained using
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);
+      x = _mm256_xor_si256(x, m);
+      __m256i result = _mm256_sub_epi64(x, m);
+      return result;
+    }
+
     //////////////////////////////////////////////////////////////////////////
-    void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width)
+    void avx2_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
     {
-      __m256i sh = _mm256_set1_epi32(shift);
-      for (int i = (width + 7) >> 3; i > 0; --i,
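/*
  AVX2 has no 64-bit arithmetic right shift, so avx2_mm256_srai_epi64 above
  synthesizes one from the logical shift: shift zeros in, then XOR and
  subtract a mask holding the shifted sign-bit position. The same trick on
  one scalar lane, for reference (illustrative):

  #include <cstdint>

  int64_t srai64(int64_t a, int amt) {
    uint64_t m = 1ull << (63 - amt);   // where the sign bit lands
    uint64_t x = uint64_t(a) >> amt;   // logical shift, zeros shifted in
    return int64_t((x ^ m) - m);       // sign-extends from bit (63 - amt)
  }
  // srai64(-8, 2) == -2, srai64(8, 2) == 2
*/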
sp+=8, dp+=8) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi32(s, sh); + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - s = _mm256_add_epi32(s, sh); - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi64(s, sh); + + t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(low_bits, t); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + s = _mm256_add_epi64(s, sh); + + s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm256_andnot_si256(low_bits, s); + + t = _mm256_or_si256(s, t); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) + void avx2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - __m256i sh = _mm256_set1_epi32(-shift); - __m256i zero = _mm256_setzero_si256(); - for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)(-shift)); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + __m256i c = _mm256_cmpgt_epi32(zero, s); // 0xFFFFFFFF for -ve val + __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only -shift-val + s = _mm256_andnot_si256(c, s); // keep only +ve or 0 + s = _mm256_or_si256(s, v_m_sh); // combine + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s, t, u0, u1, c, v_m_sh; + s = 
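/*
  The 64-to-32 path above packs two vectors of four si64 into eight si32:
  a shuffle moves the low dword of every 64-bit lane into place, masks keep
  one half from each source, and a cross-lane permute restores sample
  order. The pattern in isolation, assuming truncation is value-preserving
  as the surrounding asserts require (illustrative):

  #include <immintrin.h>

  static inline __m256i pack_epi64_to_epi32(__m256i lo, __m256i hi) {
    const __m256i keep = _mm256_set_epi64x(0, -1, 0, -1);
    __m256i a = _mm256_and_si256(keep,
                  _mm256_shuffle_epi32(lo, _MM_SHUFFLE(0, 0, 2, 0)));
    __m256i b = _mm256_andnot_si256(keep,
                  _mm256_shuffle_epi32(hi, _MM_SHUFFLE(2, 0, 0, 0)));
    __m256i t = _mm256_or_si256(a, b);
    return _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
  }
*/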
_mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cmpgt_epi32(zero, s); // find -ve 32bit -1 + u0 = _mm256_unpacklo_epi32(s, t); // correct 64bit data + c = _mm256_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u0); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u0 = _mm256_andnot_si256(c, u0); // keep only +ve or 0 + u0 = _mm256_or_si256(u0, v_m_sh); // combine + + u1 = _mm256_unpackhi_epi32(s, t); // correct 64bit data + c = _mm256_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u1); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u1 = _mm256_andnot_si256(c, u1); // keep only +ve or 0 + u1 = _mm256_or_si256(u1, v_m_sh); // combine + + t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - __m256i c = _mm256_cmpgt_epi32(s, zero); // 0xFFFFFFFF for +ve value - __m256i z = _mm256_cmpeq_epi32(s, zero); // 0xFFFFFFFF for 0 - c = _mm256_or_si256(c, z); // 0xFFFFFFFF for +ve and 0 - - __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value - v_m_sh = _mm256_andnot_si256(c, v_m_sh); // keep only - shift - value - s = _mm256_and_si256(c, s); // keep only +ve or 0 - s = _mm256_or_si256(s, v_m_sh); // combine - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m256i s, t, p, n, m, tm; + s = _mm256_loadu_si256((__m256i*)sp); + + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(half_mask, tm); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm256_andnot_si256(half_mask, tm); + + t = _mm256_or_si256(t, tm); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void avx2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - for (int i = (repeat + 7) >> 3; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + 
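/*
  Every rev_convert_nlt_type3 variant above vectorizes one scalar mapping:
  values >= 0 pass through, negative values become -v - shift (the NLT
  type-3 treatment of signed samples). The SIMD code computes both
  candidates and blends them with a comparison mask; a scalar model of
  that branchless select (illustrative):

  #include <cstdint>

  int64_t nlt3_branchless(int64_t v, int64_t shift) {
    int64_t c = -(v < 0);          // all-ones when v is negative, else 0
    int64_t neg = -shift - v;      // the "- v - shift" candidate
    return (c & neg) | (~c & v);   // and/andnot/or select, as in the SIMD
  }
  // equivalent to: v >= 0 ? v : -v - shift
*/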
(b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m256i mr = _mm256_load_si256((__m256i*)r); - __m256i mg = _mm256_load_si256((__m256i*)g); - __m256i mb = _mm256_load_si256((__m256i*)b); - __m256i t = _mm256_add_epi32(mr, mb); - t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); - _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2)); - t = _mm256_sub_epi32(mb, mg); - _mm256_store_si256((__m256i*)cb, t); - t = _mm256_sub_epi32(mr, mg); - _mm256_store_si256((__m256i*)cr, t); - - r += 8; g += 8; b += 8; - y += 8; cb += 8; cr += 8; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr = _mm256_load_si256((__m256i*)rp); + __m256i mg = _mm256_load_si256((__m256i*)gp); + __m256i mb = _mm256_load_si256((__m256i*)bp); + __m256i t = _mm256_add_epi32(mr, mb); + t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); + _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2)); + t = _mm256_sub_epi32(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi32(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 8; cbp += 8; crp += 8; + } } - } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr32 = _mm256_load_si256((__m256i*)rp); + __m256i mg32 = _mm256_load_si256((__m256i*)gp); + __m256i mb32 = _mm256_load_si256((__m256i*)bp); + __m256i mr, mg, mb, t; + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + yp += 4; cbp += 4; crp += 4; + + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 4; cbp += 4; crp += 4; + } + } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void avx2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - 
for (int i = (repeat + 7) >> 3; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m256i my = _mm256_load_si256((__m256i*)y); - __m256i mcb = _mm256_load_si256((__m256i*)cb); - __m256i mcr = _mm256_load_si256((__m256i*)cr); - - __m256i t = _mm256_add_epi32(mcb, mcr); - t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); - _mm256_store_si256((__m256i*)g, t); - __m256i u = _mm256_add_epi32(mcb, t); - _mm256_store_si256((__m256i*)b, u); - u = _mm256_add_epi32(mcr, t); - _mm256_store_si256((__m256i*)r, u); - - y += 8; cb += 8; cr += 8; - r += 8; g += 8; b += 8; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my = _mm256_load_si256((__m256i*)yp); + __m256i mcb = _mm256_load_si256((__m256i*)cbp); + __m256i mcr = _mm256_load_si256((__m256i*)crp); + + __m256i t = _mm256_add_epi32(mcb, mcr); + t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); + _mm256_store_si256((__m256i*)gp, t); + __m256i u = _mm256_add_epi32(mcb, t); + _mm256_store_si256((__m256i*)bp, u); + u = _mm256_add_epi32(mcr, t); + _mm256_store_si256((__m256i*)rp, u); + + yp += 8; cbp += 8; crp += 8; + rp += 8; gp += 8; bp += 8; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my, mcb, mcr, tr, tg, tb; + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + __m256i mr, mg, mb; + mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm256_and_si256(low_bits, mr); + mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm256_and_si256(low_bits, mg); + mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm256_and_si256(low_bits, mb); + + yp += 4; cbp += 4; crp += 4; + + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm256_andnot_si256(low_bits, tr); + mr = _mm256_or_si256(mr, tr); + mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0)); + + tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm256_andnot_si256(low_bits, tg); + mg = 
_mm256_or_si256(mg, tg); + mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0)); + + tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm256_andnot_si256(low_bits, tb); + mb = _mm256_or_si256(mb, tb); + mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)rp, mr); + _mm256_store_si256((__m256i*)gp, mg); + _mm256_store_si256((__m256i*)bp, mb); + + yp += 4; cbp += 4; crp += 4; + rp += 8; gp += 8; bp += 8; + } } } diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index ae5eba1..5eb8b74 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -65,12 +65,16 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, @@ -89,12 +93,14 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void gen_ict_forward(const float *r, const float *g, const float *b, @@ -161,21 +167,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void sse2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); - + void sse2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void sse2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 
repeat); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void sse2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -218,20 +229,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void avx2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void avx2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void avx2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void avx2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -258,20 +275,26 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void wasm_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void wasm_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void wasm_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void wasm_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void wasm_ict_forward(const float *r, const float *g, const float *b, diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 
c50c091..a529c66 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -35,10 +35,12 @@
 // Date: 11 October 2019
 //***************************************************************************/

+#include <climits>
 #include <cmath>

 #include "ojph_defs.h"
 #include "ojph_arch.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"

 #include <emmintrin.h>

@@ -46,6 +48,207 @@ namespace ojph {
   namespace local {

+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
+    {
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);
+      x = _mm_xor_si128(x, m);
+      __m128i result = _mm_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
+    {
+      __m128i t;
+      t = _mm_cmplt_epi32(a, zero);  // get -ve
+      t = _mm_unpacklo_epi32(a, t);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
+    {
+      __m128i t;
+      t = _mm_cmplt_epi32(a, zero);  // get -ve
+      t = _mm_unpackhi_epi32(a, t);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
+    {
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          __m128i sh = _mm_set1_epi32((si32)shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            __m128i s = _mm_loadu_si128((__m128i*)sp);
+            s = _mm_add_epi32(s, sh);
+            _mm_storeu_si128((__m128i*)dp, s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          __m128i zero = _mm_setzero_si128();
+          __m128i sh = _mm_set1_epi64x(shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            __m128i s, t;
+            s = _mm_loadu_si128((__m128i*)sp);
+
+            t = sse2_cvtlo_epi32_epi64(s, zero);
+            t = _mm_add_epi64(t, sh);
+            _mm_storeu_si128((__m128i*)dp, t);
+
+            t = sse2_cvthi_epi32_epi64(s, zero);
+            t = _mm_add_epi64(t, sh);
+            _mm_storeu_si128((__m128i*)dp + 1, t);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
+        __m128i sh = _mm_set1_epi64x(shift);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+        {
+          __m128i s, t;
+          s = _mm_loadu_si128((__m128i*)sp);
+          s = _mm_add_epi64(s, sh);
+
+          t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
+          t = _mm_and_si128(low_bits, t);
+
+          s = _mm_loadu_si128((__m128i*)sp + 1);
+          s = _mm_add_epi64(s, sh);
+
+          s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
+          s = _mm_andnot_si128(low_bits, s);
+
+          t = _mm_or_si128(s, t);
+          _mm_storeu_si128((__m128i*)dp, t);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32
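/*
  SSE2 predates pmovsxdq, so the helpers above sign-extend si32 lanes to
  si64 by pairing each lane with its own sign word from cmplt and
  interleaving. Distilled to one helper (illustrative, zero folded in):

  #include <emmintrin.h>

  static inline __m128i cvtlo_epi32_epi64(__m128i a) {
    __m128i sign = _mm_cmplt_epi32(a, _mm_setzero_si128()); // -1 if lane < 0
    return _mm_unpacklo_epi32(a, sign);  // [a0, s0, a1, s1] = two si64
  }
*/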
dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi32((si32)(-shift)); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value + __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + s = _mm_andnot_si128(c, s); // keep only +ve or 0 + s = _mm_or_si128(s, v_m_sh); // combine + _mm_storeu_si128((__m128i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s, t, u, c, v_m_sh; + s = _mm_loadu_si128((__m128i*)sp); + + t = _mm_cmplt_epi32(s, zero); // find -ve 32bit -1 + u = _mm_unpacklo_epi32(s, t); // correct 64bit data + c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp, u); + u = _mm_unpackhi_epi32(s, t); // correct 64bit data + c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp + 1, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m128i s, t, p, n, m, tm; + s = _mm_loadu_si128((__m128i*)sp); + + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(half_mask, tm); + + s = _mm_loadu_si128((__m128i*)sp + 1); + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm_andnot_si128(half_mask, tm); + + t = _mm_or_si128(t, tm); + _mm_storeu_si128((__m128i*)dp, t); + } + } + } + ////////////////////////////////////////////////////////////////////////// void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width) @@ -80,82 +283,200 @@ namespace ojph { 
_MM_SET_ROUNDING_MODE(rounding_mode); } - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void sse2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - __m128i sh = _mm_set1_epi32(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i s = _mm_loadu_si128((__m128i*)sp); - s = _mm_add_epi32(s, sh); - _mm_storeu_si128((__m128i*)dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr = _mm_load_si128((__m128i*)rp); + __m128i mg = _mm_load_si128((__m128i*)gp); + __m128i mb = _mm_load_si128((__m128i*)bp); + __m128i t = _mm_add_epi32(mr, mb); + t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); + _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2)); + t = _mm_sub_epi32(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi32(mr, mg); + _mm_store_si128((__m128i*)crp, t); - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) - { - __m128i sh = _mm_set1_epi32(-shift); - __m128i zero = _mm_setzero_si128(); - for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - __m128i s = _mm_loadu_si128((__m128i*)sp); - __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value - __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value - v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value - s = _mm_andnot_si128(c, s); // keep only +ve or 0 - s = _mm_or_si128(s, v_m_sh); // combine - _mm_storeu_si128((__m128i*)dp, s); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i zero = _mm_setzero_si128(); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr32 = _mm_load_si128((__m128i*)rp); + __m128i mg32 = _mm_load_si128((__m128i*)gp); + __m128i mb32 = _mm_load_si128((__m128i*)bp); + __m128i mr, mg, mb, t; + mr = sse2_cvtlo_epi32_epi64(mr32, zero); + mg = sse2_cvtlo_epi32_epi64(mg32, zero); + mb = sse2_cvtlo_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = 
sse2_cvthi_epi32_epi64(mr32, zero); + mg = sse2_cvthi_epi32_epi64(mg32, zero); + mb = sse2_cvthi_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void sse2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i mr = _mm_load_si128((__m128i*)r); - __m128i mg = _mm_load_si128((__m128i*)g); - __m128i mb = _mm_load_si128((__m128i*)b); - __m128i t = _mm_add_epi32(mr, mb); - t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); - _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2)); - t = _mm_sub_epi32(mb, mg); - _mm_store_si128((__m128i*)cb, t); - t = _mm_sub_epi32(mr, mg); - _mm_store_si128((__m128i*)cr, t); - - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my = _mm_load_si128((__m128i*)yp); + __m128i mcb = _mm_load_si128((__m128i*)cbp); + __m128i mcr = _mm_load_si128((__m128i*)crp); - ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + __m128i t = _mm_add_epi32(mcb, mcr); + t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); + _mm_store_si128((__m128i*)gp, t); + __m128i u = _mm_add_epi32(mcb, t); + _mm_store_si128((__m128i*)bp, u); + u = _mm_add_epi32(mcr, t); + _mm_store_si128((__m128i*)rp, u); + + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else { - __m128i my = _mm_load_si128((__m128i*)y); - __m128i mcb = _mm_load_si128((__m128i*)cb); - __m128i mcr = _mm_load_si128((__m128i*)cr); - - __m128i t = _mm_add_epi32(mcb, mcr); - t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); - _mm_store_si128((__m128i*)g, t); - __m128i u = _mm_add_epi32(mcb, t); - _mm_store_si128((__m128i*)b, u); - u = _mm_add_epi32(mcr, t); - _mm_store_si128((__m128i*)r, u); - - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, 
*cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my, mcb, mcr, tr, tg, tb; + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + __m128i mr, mg, mb; + mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm_and_si128(low_bits, mr); + mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm_and_si128(low_bits, mg); + mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm_and_si128(low_bits, mb); + + yp += 2; cbp += 2; crp += 2; + + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm_andnot_si128(low_bits, tr); + mr = _mm_or_si128(mr, tr); + tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm_andnot_si128(low_bits, tg); + mg = _mm_or_si128(mg, tg); + tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm_andnot_si128(low_bits, tb); + mb = _mm_or_si128(mb, tb); + + _mm_store_si128((__m128i*)rp, mr); + _mm_store_si128((__m128i*)gp, mg); + _mm_store_si128((__m128i*)bp, mb); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } } } - } } diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 57b84c7..5bf6ccd 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -39,12 +39,164 @@ #include #include "ojph_defs.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s = wasm_v128_load(sp); + s = wasm_i32x4_add(s, sh); + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s, t; + s = wasm_v128_load(sp); + + t = wasm_i64x2_extend_low_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp, t); + + t = wasm_i64x2_extend_high_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp + 2, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s0, s1; + s0 = wasm_v128_load(sp); + s0 = wasm_i64x2_add(s0, sh); + s1 = wasm_v128_load(sp + 2); + s1 = wasm_i64x2_add(s1, sh); + s0 = wasm_i32x4_shuffle(s0, 
s1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, s0); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)(-shift)); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s = wasm_v128_load(sp); + v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value + v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + s = wasm_v128_andnot(c, s); // keep only +ve or 0 + s = wasm_v128_or(s, v_m_sh); // combine + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s, u, c, v_m_sh; + s = wasm_v128_load(sp); + + u = wasm_i64x2_extend_low_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp, u); + + u = wasm_i64x2_extend_high_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp + 2, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + v128_t s, t0, t1, p, n, m, tm; + s = wasm_v128_load(sp); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t0 = wasm_v128_or(n, p); + + s = wasm_v128_load(sp + 2); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t1 = wasm_v128_or(n, p); + + t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, t0); + } + } + } + ////////////////////////////////////////////////////////////////////////// void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width) @@ -108,80 +260,182 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void wasm_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 
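/*
  On WASM SIMD the 64-to-32 narrowing needs no mask-and-merge dance:
  wasm_i32x4_shuffle addresses lanes of both operands at once (indices 4-7
  pick from the second), so lanes 0 and 2 of each input, i.e. the low
  dwords of the si64 values, are gathered in one step (illustrative):

  #include <wasm_simd128.h>

  static inline v128_t pack_i64x2_to_i32x4(v128_t lo, v128_t hi) {
    return wasm_i32x4_shuffle(lo, hi, 0, 2, 4 + 0, 4 + 2);
  }
*/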
repeat) { - v128_t sh = wasm_i32x4_splat(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t s = wasm_v128_load(sp); - s = wasm_i32x4_add(s, sh); - wasm_v128_store(dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) - { - v128_t sh = wasm_i32x4_splat(-shift); - v128_t zero = wasm_i32x4_splat(0); - for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr = wasm_v128_load(rp); + v128_t mg = wasm_v128_load(gp); + v128_t mb = wasm_v128_load(bp); + v128_t t = wasm_i32x4_add(mr, mb); + t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); + wasm_v128_store(yp, wasm_i32x4_shr(t, 2)); + t = wasm_i32x4_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i32x4_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - v128_t s = wasm_v128_load(sp); - v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value - v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value - v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value - s = wasm_v128_andnot(c, s); // keep only +ve or 0 - s = wasm_v128_or(s, v_m_sh); // combine - wasm_v128_store(dp, s); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr32 = wasm_v128_load(rp); + v128_t mg32 = wasm_v128_load(gp); + v128_t mb32 = wasm_v128_load(bp); + v128_t mr, mg, mb, t; + mr = wasm_i64x2_extend_low_i32x4(mr32); + mg = wasm_i64x2_extend_low_i32x4(mg32); + mb = wasm_i64x2_extend_low_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = wasm_i64x2_extend_high_i32x4(mr32); + mg = wasm_i64x2_extend_high_i32x4(mg32); + mb = wasm_i64x2_extend_high_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void 
wasm_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t mr = wasm_v128_load(r); - v128_t mg = wasm_v128_load(g); - v128_t mb = wasm_v128_load(b); - v128_t t = wasm_i32x4_add(mr, mb); - t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); - wasm_v128_store(y, wasm_i32x4_shr(t, 2)); - t = wasm_i32x4_sub(mb, mg); - wasm_v128_store(cb, t); - t = wasm_i32x4_sub(mr, mg); - wasm_v128_store(cr, t); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my = wasm_v128_load(yp); + v128_t mcb = wasm_v128_load(cbp); + v128_t mcr = wasm_v128_load(crp); - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; - } - } + v128_t t = wasm_i32x4_add(mcb, mcr); + t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); + wasm_v128_store(gp, t); + v128_t u = wasm_i32x4_add(mcb, t); + wasm_v128_store(bp, u); + u = wasm_i32x4_add(mcr, t); + wasm_v128_store(rp, u); - ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else { - v128_t my = wasm_v128_load(y); - v128_t mcb = wasm_v128_load(cb); - v128_t mcr = wasm_v128_load(cr); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1; + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); - v128_t t = wasm_i32x4_add(mcb, mcr); - t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); - wasm_v128_store(g, t); - v128_t u = wasm_i32x4_add(mcb, t); - wasm_v128_store(b, u); - u = wasm_i32x4_add(mcr, t); - wasm_v128_store(r, u); + tg0 = wasm_i64x2_add(mcb, mcr); + tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2)); + tb0 = wasm_i64x2_add(mcb, tg0); + tr0 = wasm_i64x2_add(mcr, tg0); - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + yp += 2; cbp += 2; crp += 2; + + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); + + tg1 = wasm_i64x2_add(mcb, mcr); + tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2)); + tb1 = wasm_i64x2_add(mcb, tg1); + tr1 = wasm_i64x2_add(mcr, tg1); + + tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2); + tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2); + tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2); + + wasm_v128_store(rp, tr0); + wasm_v128_store(gp, tg0); + wasm_v128_store(bp, tb0); + 
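/*
  Note that unlike the SSE2 and AVX2 paths, WASM SIMD has a native 64-bit
  arithmetic shift, so this backward RCT uses wasm_i64x2_shr directly and
  needs no emulation like sse2_mm_srai_epi64. For reference (illustrative;
  build with -msimd128):

  #include <cstdint>
  #include <wasm_simd128.h>

  static inline v128_t srai_i64x2(v128_t a, uint32_t amt) {
    return wasm_i64x2_shr(a, amt);   // i64x2.shr_s, sign-preserving
  }
*/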
+        yp += 2; cbp += 2; crp += 2;
+        rp += 4; gp += 4; bp += 4;
+      }
     }
   }
 
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index ee4bb08..c4313ab 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -45,7 +45,9 @@
 #include "../codestream/ojph_params_local.h"
 
 namespace ojph {
-  struct line_buf;
+
+  // defined elsewhere
+  class line_buf;
 
   namespace local {
 
@@ -156,9 +158,9 @@
 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512)
       {
-        rev_vert_step = avx512_rev_vert_step;
-        rev_horz_ana = avx512_rev_horz_ana;
-        rev_horz_syn = avx512_rev_horz_syn;
+        // rev_vert_step = avx512_rev_vert_step;
+        // rev_horz_ana = avx512_rev_horz_ana;
+        // rev_horz_syn = avx512_rev_horz_syn;
 
         irv_vert_step = avx512_irv_vert_step;
         irv_vert_times_K = avx512_irv_vert_times_K;
@@ -192,13 +194,14 @@
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                           const line_buf* other, const line_buf* aug,
-                           ui32 repeat, bool synthesis)
+    static
+    void gen_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                             const line_buf* other, const line_buf* aug,
+                             ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       si32* dst = aug->i32;
       const si32* src1 = sig->i32, * src2 = other->i32;
@@ -243,9 +246,85 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                          const line_buf* hdst, const line_buf* src,
-                          ui32 width, bool even)
+    static
+    void gen_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                             const line_buf* other, const line_buf* aug,
+                             ui32 repeat, bool synthesis)
+    {
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two are mathematically
+      // equivalent here, we identify the simpler forms from Part 1 and
+      // employ them.
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + *src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + *src1++ + *src2++) >> e;
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (*src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (*src1++ + *src2++) >> e;
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b - (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b - (*src1++ + *src2++)) >> e;
+      }
+      else { // general case
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
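+    // The dispatcher below selects between the two implementations from
+    // the line_buf flags: if any of the three buffers is marked
+    // LFT_32BIT the whole lifting step runs in si32, otherwise it runs
+    // in si64; the asserts check that a call never mixes sample widths.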
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                           const line_buf* other, const line_buf* aug,
+                           ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        gen_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        gen_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
+                            const line_buf* hdst, const line_buf* src,
+                            ui32 width, bool even)
     {
       if (width > 1)
       {
@@ -277,7 +356,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j - 1);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const ui32 e = s->rev.Eatk;
+          const ui8 e = s->rev.Eatk;
 
           // extension
           lp[-1] = lp[0];
@@ -319,11 +398,111 @@ namespace ojph {
           hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                          const line_buf* lsrc, const line_buf* hsrc,
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                            const line_buf* hdst, const line_buf* src,
+                            ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        si64* dph = hdst->i64;
+        si64* dpl = ldst->i64;
+        si64* sp = src->i64;
+        ui32 w = width;
+        if (!even)
+        {
+          *dph++ = *sp++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dpl++ = *sp++; *dph++ = *sp++;
+        }
+        if (w)
+        {
+          *dpl++ = *sp++; --w;
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si64 a = s->rev.Aatk;
+          const si64 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp + (even ?
1 : 0); + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp -= (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + gen_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + gen_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -337,7 +516,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension oth[-1] = oth[0]; @@ -398,6 +577,105 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth + (ev ? 
0 : 1); + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp += (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + si64* sph = hsrc->i64; + si64* spl = lsrc->i64; + si64* dp = dst->i64; + ui32 w = width; + if (!even) + { + *dp++ = *sph++; --w; + } + for (; w > 1; w -= 2) + { + *dp++ = *spl++; *dp++ = *sph++; + } + if (w) + { + *dp++ = *spl++; --w; + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + gen_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + gen_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + ////////////////////////////////////////////////////////////////////////// void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h index 0e59632..f7576a1 100644 --- a/src/core/transform/ojph_transform.h +++ b/src/core/transform/ojph_transform.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { union lifting_step; struct param_atk; diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp index 0856662..8838d18 100644 --- a/src/core/transform/ojph_transform_avx.cpp +++ b/src/core/transform/ojph_transform_avx.cpp @@ -61,6 +61,40 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_interleave32(float* dp, float* spl, float* sph, int width) + { + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m256 a = _mm256_load_ps(spl); + 
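+        // _mm256_unpacklo/hi_ps interleave within each 128-bit half, so
+        // the cross-lane _mm256_permute2f128_ps pass below restores the
+        // linear order: spl = L0..L7 and sph = H0..H7 come out as
+        // dp = L0 H0 L1 H1 ... L7 H7.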
+        __m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void avx_irv_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
@@ -100,11 +134,11 @@
       {
         // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
+          avx_deinterleave32(dpl, dph, sp, w);
         }
 
         // the actual horizontal transform
@@ -235,10 +269,10 @@
       // combine both lsrc and hsrc into dst
       {
         float* dp = dst->f32;
-        float* spl = lsrc->f32;
-        float* sph = hsrc->f32;
+        float* spl = even ? lsrc->f32 : hsrc->f32;
+        float* sph = even ? hsrc->f32 : lsrc->f32;
         int w = (int)width;
-        AVX_INTERLEAVE(dp, spl, sph, w, even);
+        avx_interleave32(dp, spl, sph, w);
       }
     }
     else {
diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index 847cd4c..1bc92e6 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include <cassert>
 #include <immintrin.h>
 
 #include "ojph_defs.h"
@@ -52,13 +53,95 @@ namespace ojph {
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
+    {
+      // note that m must be obtained using
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);
+      x = _mm256_xor_si256(x, m);
+      __m256i result = _mm256_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m256 a = _mm256_load_ps(sp);
+        __m256 b = _mm256_load_ps(sp + 8);
+        __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+        __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+        __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+        __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm256_store_ps(dpl, e);
+        _mm256_store_ps(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m256 a = _mm256_load_ps(spl);
+        __m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
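+    // The 64-bit helpers below mirror avx2_deinterleave32 and
+    // avx2_interleave32 for double-width samples.  A __m256d holds 4
+    // doubles, so each avx2_deinterleave64 iteration splits 8 doubles,
+    //   sp:  x0 x1 x2 x3 | x4 x5 x6 x7
+    //   dpl: x0 x2 x4 x6,   dph: x1 x3 x5 x7
+    // using a cross-lane _mm256_permute2f128_pd followed by an in-lane
+    // _mm256_shuffle_pd; avx2_interleave64 applies the inverse network.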
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m256i va = _mm256_set1_epi32(a);
       __m256i vb = _mm256_set1_epi32(b);
@@ -181,19 +264,154 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m256i vb = _mm256_set1_epi64x(b);
+      __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two are mathematically
+      // equivalent here, we identify the simpler forms from Part 1 and
+      // employ them.
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_sub_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_add_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d =
_mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else { // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { - float* dpl = ldst->f32; - float* dph = hdst->f32; - float* sp = src->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; int w = (int)width; - AVX_DEINTERLEAVE(dpl, dph, sp, w, even); + avx2_deinterleave32(dpl, dph, sp, w); } si32* hp = hdst->i32, * lp = ldst->i32; @@ -206,7 +424,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -346,11 +564,181 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)src->p; + int w = (int)width; + avx2_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i 
s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + static + void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -364,7 +752,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -499,11 +887,11 @@ namespace ojph { // combine both lsrc and hsrc into dst { - float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - AVX_INTERLEAVE(dp, spl, sph, w, even); + avx2_interleave32(dp, spl, sph, w); } } else { @@ -514,5 +902,174 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm_mullo_epi64. 
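+          // (the 256-bit form, _mm256_mullo_epi64, needs AVX512DQ +
+          // AVX512VL, so the general case runs one sample at a time;
+          // with a = -3, b = 2, e = 2, for example, each step is
+          //   *dp -= (2 - 3 * (sp[-1] + sp[0])) >> 2
+          // in plain 64-bit scalar arithmetic)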
+ if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? hsrc->p : lsrc->p); + int w = (int)width; + avx2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp index 504aa87..0e92230 100644 --- a/src/core/transform/ojph_transform_avx512.cpp +++ b/src/core/transform/ojph_transform_avx512.cpp @@ -54,8 +54,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static + void avx512_deinterleave32(float* dpl, float* dph, float* sp, int width) { __m512i idx1 = _mm512_set_epi32( 0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10, @@ -65,59 +65,33 @@ namespace ojph { 0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 ); - if (even) + for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, e); - _mm256_store_ps(dph, f); - } + __m512 a = _mm512_load_ps(sp); + __m512 b = _mm512_load_ps(sp + 16); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dpl, c); + _mm512_store_ps(dph, d); } - else + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, 
idx2, b); - __m512 d = _mm512_permutex2var_ps(a, idx1, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, f); - _mm256_store_ps(dph, e); - } + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); } } ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static + void avx512_interleave32(float* dp, float* spl, float* sph, int width) { __m512i idx1 = _mm512_set_epi32( 0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4, @@ -127,51 +101,93 @@ namespace ojph { 0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC, 0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8 ); - if (even) + for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(a, b); - __m256 d = _mm256_unpackhi_ps(a, b); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m512 a = _mm512_load_ps(spl); + __m512 b = _mm512_load_ps(sph); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dp, c); + _mm512_store_ps(dp + 16, d); } - else + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(b, idx1, a); - __m512 d = _mm512_permutex2var_ps(b, idx2, a); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(b, a); - __m256 d = _mm256_unpackhi_ps(b, a); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, 
f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_deinterleave64(double* dpl, double* dph, double* sp,
+                                      int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
+      );
+      for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m512d a = _mm512_load_pd(sp);
+        __m512d b = _mm512_load_pd(sp + 8);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dpl, c);
+        _mm512_store_pd(dph, d);
+      }
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_interleave64(double* dp, double* spl, double* sph,
+                                    int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4
+      );
+      for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m512d a = _mm512_load_pd(spl);
+        __m512d b = _mm512_load_pd(sph);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dp, c);
+        _mm512_store_pd(dp + 8, d);
+      }
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
@@ -224,7 +240,13 @@
       if (width > 1)
       {
         // split src into ldst and hdst
-        avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          avx512_deinterleave32(dpl, dph, sp, w);
+        }
 
         // the actual horizontal transform
         float* hp = hdst->f32, * lp = ldst->f32;
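The even flag no longer reaches the (de)interleave helpers; the callers swap the low- and high-pass pointers instead, as the hunks above and below show. A minimal round-trip sketch of the 32-bit pair (illustrative only; the buffer names and sizes are assumptions, and alignas(64) matches the 64-byte alignment the aligned loads and stores rely on):

    alignas(64) float src[32], low[16], high[16], dst[32];
    for (int i = 0; i < 32; ++i) src[i] = (float)i;  // 0, 1, 2, ..., 31
    avx512_deinterleave32(low, high, src, 32); // low = 0,2,...,30  high = 1,3,...,31
    avx512_interleave32(dst, low, high, 32);   // dst matches src again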
@@ -352,7 +374,13 @@
       }
 
       // combine both lsrc and hsrc into dst
-      avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
+      {
+        float* dp = dst->f32;
+        float* spl = even ? lsrc->f32 : hsrc->f32;
+        float* sph = even ? hsrc->f32 : lsrc->f32;
+        int w = (int)width;
+        avx512_interleave32(dp, spl, sph, w);
+      }
     }
     else {
       if (even)
@@ -364,13 +392,13 @@
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                              const line_buf* other, const line_buf* aug,
-                              ui32 repeat, bool synthesis)
+    void avx512_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m512i va = _mm512_set1_epi32(a);
       __m512i vb = _mm512_set1_epi32(b);
@@ -493,14 +521,185 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                             const line_buf* hdst, const line_buf* src,
-                             ui32 width, bool even)
+    void avx512_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m512i vb = _mm512_set1_epi64(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two are mathematically
+      // equivalent here, we identify the simpler forms from Part 1 and
+      // employ them.
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d
= _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + else + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + } + else { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + + // This can only be used if you have AVX512DQ + // { // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)repeat; + // if (synthesis) + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_sub_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // else + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst - avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + // split src into ldst and hdst + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + avx512_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass @@ -512,7 +711,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -653,10 +852,211 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)(src->p); + int w = (int)width; + avx512_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + 
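+          // With a == -1 the lifting update *dp += (b + a*(s0+s1)) >> e
+          // reduces to (b - (s0 + s1)) >> e, so the loops below use a
+          // subtraction in place of the unavailable 64-bit multiply.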
int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // This can only be used if you have AVX512DQ + // { + // // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)h_width; + // if (even) + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // else + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -670,7 +1070,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const 
si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -804,7 +1204,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + avx512_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -814,5 +1220,206 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + 
_mm512_store_si512((__m512i*)dp, d);
+            }
+          else
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m512i s1 = _mm512_load_si512((__m512i*)sp);
+              __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+              __m512i d = _mm512_load_si512((__m512i*)dp);
+              __m512i t = _mm512_add_epi64(s1, s2);
+              __m512i v = _mm512_sub_epi64(vb, t);
+              __m512i w = _mm512_srai_epi64(v, e);
+              d = _mm512_sub_epi64(d, w);
+              _mm512_store_si512((__m512i*)dp, d);
+            }
+        }
+        else
+        {
+          // general case
+          // 64bit multiplication is not supported in AVX512F + AVX512CD;
+          // in particular, _mm512_mullo_epi64 requires AVX512DQ.
+          if (ev)
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+          else
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[0] + sp[1])) >> e;
+        }
+
+        // This can only be used if you have AVX512DQ
+        // {
+        //   // general case
+        //   __m512i va = _mm512_set1_epi64(a);
+        //   int i = (int)aug_width;
+        //   if (ev)
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        //   else
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        // }
+
+        // swap buffers
+        si64* t = aug; aug = oth; oth = t;
+        ev = !ev;
+        ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+      }
+
+      // combine both lsrc and hsrc into dst
+      {
+        double* dp = (double*)(dst->p);
+        double* spl = (double*)(even ? lsrc->p : hsrc->p);
+        double* sph = (double*)(even ?
hsrc->p : lsrc->p); + int w = (int)width; + avx512_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h index ec2a2e1..acf9ee6 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { struct param_atk; union lifting_step; @@ -104,60 +107,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - ////////////////////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////////////////////////// - #define SSE_DEINTERLEAVE(dpl, dph, sp, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, c); \ - _mm_store_ps(dph, d); \ - } \ - else \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, d); \ - _mm_store_ps(dph, c); \ - } \ - } - - ////////////////////////////////////////////////////////////////////////// - #define SSE_INTERLEAVE(dp, spl, sph, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(a, b); \ - __m128 d = _mm_unpackhi_ps(a, b); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - else \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(b, a); \ - __m128 d = _mm_unpackhi_ps(b, a); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - } - ////////////////////////////////////////////////////////////////////////// // Irreversible functions ////////////////////////////////////////////////////////////////////////// @@ -216,76 +165,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - 
//////////////////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_DEINTERLEAVE(dpl, dph, sp, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, e); \
-          _mm256_store_ps(dph, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, f); \
-          _mm256_store_ps(dph, e); \
-        } \
-      } \
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_INTERLEAVE(dp, spl, sph, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(a, b); \
-          __m256 d = _mm256_unpackhi_ps(a, b); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(b, a); \
-          __m256 d = _mm256_unpackhi_ps(b, a); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-    }
-
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 897a193..dcb5e53 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -50,6 +50,36 @@ namespace ojph {
 
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, d);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     static inline void sse_multiply_const(float* p, float f, int width)
     {
@@ -100,11 +130,11 @@ namespace ojph {
       {
         // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+          sse_deinterleave32(dpl, dph, sp, w);
         }
 
         // the actual horizontal transform
@@ -235,10 +265,10 @@ namespace ojph {
         // combine both lsrc and hsrc into dst
         {
           float* dp = dst->f32;
-          float* spl = lsrc->f32;
-          float* sph = hsrc->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
           int w = (int)width;
-          SSE_INTERLEAVE(dp, spl, sph, w, even);
+          sse_interleave32(dp, spl, sph, w);
         }
       }
       else {
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index 8328842..a69b1fb 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include <cassert>
 #include <cstdio>
 
 #include "ojph_defs.h"
@@ -52,13 +53,86 @@ namespace ojph {
 
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
+    {
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);
+      x = _mm_xor_si128(x, m);
+      __m128i result = _mm_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
+      {
+        __m128d a = _mm_load_pd(sp);
+        __m128d b = _mm_load_pd(sp + 2);
+        __m128d c = _mm_shuffle_pd(a, b, 0);
+        __m128d d = _mm_shuffle_pd(a, b, 3);
+        _mm_store_pd(dpl, c);
+        _mm_store_pd(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
+      {
+        __m128d a = _mm_load_pd(spl);
+        __m128d b = _mm_load_pd(sph);
+        __m128d c = _mm_unpacklo_pd(a, b);
+        __m128d d = _mm_unpackhi_pd(a, b);
+        _mm_store_pd(dp, c);
+        _mm_store_pd(dp + 2, d);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m128i vb = _mm_set1_epi32(b);
 
       si32* dst = aug->i32;
@@ -162,19 +236,153 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m128i vb = _mm_set1_epi64x(b);
+      __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1, although the two are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else { // general case
+        // 64bit multiplication is not supported in sse2
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
+                            ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        // combine both lsrc and hsrc into dst
+        // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+          sse2_deinterleave32(dpl, dph, sp, w);
         }
 
         si32* hp = hdst->i32, * lp = ldst->i32;
@@ -187,7 +395,7 @@ namespace ojph {
         const lifting_step* s = atk->get_step(j - 1);
         const si32 a = s->rev.Aatk;
         const si32 b = s->rev.Batk;
-        const si32 e = s->rev.Eatk;
+        const ui8 e = s->rev.Eatk;
 
         __m128i vb = _mm_set1_epi32(b);
 
         // extension
@@ -284,9 +492,7 @@ namespace ojph {
         }
         else {
           // general case
-          // 32bit multiplication is not supported in sse2; we need sse4.1,
-          // where we can use _mm_mullo_epi32, which multiplies
-          // 32bit x 32bit, keeping the LSBs
+          // keeping the low 32 bits of a product requires sse4.1's _mm_mullo_epi32
          if (even)
           for (ui32 i = h_width; i > 0; --i, sp++, dp++)
             *dp += (b + a * (sp[0] + sp[1])) >> e;
@@ -308,11 +514,179 @@ namespace ojph {
           hdst->i32[0] = src->i32[0] << 1;
       }
     }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          sse2_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ?
1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + 
} + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -326,7 +700,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi32(b); // extension @@ -443,10 +817,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - SSE_INTERLEAVE(dp, spl, sph, w, even); + sse2_interleave32(dp, spl, sph, w); } } else { @@ -457,5 +831,172 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? 
hsrc->p : lsrc->p); + int w = (int)width; + sse2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp index bd652df..341cfc3 100644 --- a/src/core/transform/ojph_transform_wasm.cpp +++ b/src/core/transform/ojph_transform_wasm.cpp @@ -51,65 +51,69 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void wasm_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static inline + void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width) { - if (even) - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, c); - wasm_v128_store(dph, d); - } - else - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, d); - wasm_v128_store(dph, c); - } + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 4); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } } ////////////////////////////////////////////////////////////////////////// - void wasm_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static inline + void wasm_interleave32(float* dp, float* spl, float* sph, int width) { - if (even) - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(a, b); - // v128_t d = _mm_unpackhi_ps(a, b); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } - else - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = 
wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(b, a, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(b, a, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(b, a); - // v128_t d = _mm_unpackhi_ps(b, a); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); + v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(a, b); + // v128_t d = _mm_unpackhi_ps(a, b); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 4, d); + } } + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width) + { + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 2); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_interleave64(double* dp, double* spl, double* sph, int width) + { + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 2, d); + } + } + ////////////////////////////////////////////////////////////////////////// static inline void wasm_multiply_const(float* p, float f, int width) { @@ -159,7 +163,13 @@ namespace ojph { if (width > 1) { // split src into ldst and hdst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } // the actual horizontal transform float* hp = hdst->f32, * lp = ldst->f32; @@ -287,7 +297,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32;
+        int w = (int)width;
+        wasm_interleave32(dp, spl, sph, w);
+      }
       }
       else {
         if (even)
@@ -298,13 +314,13 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       v128_t va = wasm_i32x4_splat(a);
       v128_t vb = wasm_i32x4_splat(b);
@@ -428,14 +444,174 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      v128_t va = wasm_i64x2_splat(a);
+      v128_t vb = wasm_i64x2_splat(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1, although the two are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_sub(vb, t);
+
v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + else + { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + wasm_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + wasm_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void wasm_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { // combine both lsrc and hsrc into dst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 
1 : 0)) >> 1; // low pass
@@ -447,7 +623,7 @@ namespace ojph {
         const lifting_step* s = atk->get_step(j - 1);
         const si32 a = s->rev.Aatk;
         const si32 b = s->rev.Batk;
-        const ui32 e = s->rev.Eatk;
+        const ui8 e = s->rev.Eatk;
 
         v128_t va = wasm_i32x4_splat(a);
         v128_t vb = wasm_i32x4_splat(b);
@@ -587,11 +763,199 @@ namespace ojph {
           hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc,
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          wasm_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i64x2_splat(a);
+          v128_t vb = wasm_i64x2_splat(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          { // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t =
wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + wasm_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + wasm_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -605,7 +969,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; v128_t va = wasm_i32x4_splat(a); v128_t vb = wasm_i32x4_splat(b); @@ -739,7 +1103,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + wasm_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -749,5 +1119,192 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 
1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = wasm_i64x2_splat(a); + v128_t vb = wasm_i64x2_splat(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = 
wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          double* dp = (double*)dst->p;
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          wasm_interleave64(dp, spl, sph, w);
+        }
+      }
+      else {
+        if (even)
+          dst->i64[0] = lsrc->i64[0];
+        else
+          dst->i64[0] = hsrc->i64[0] >> 1;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc,
+                           ui32 width, bool even)
+    {
+      if (dst->flags & line_buf::LFT_32BIT)
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        wasm_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        wasm_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    }
+
   } // !local
 } // !ojph
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 000409f..8cc1d72 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -3,7 +3,7 @@ include(FetchContent)
 
 FetchContent_Declare(
   googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz
+  URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz
   EXCLUDE_FROM_ALL
 )
 # For Windows: Prevent overriding the parent project's compiler/linker settings
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 9f77f75..22f148e 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -107,8 +107,27 @@ int execute(const std::string& cmd, std::string& result)
 #define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
 #define MSE_PAE_PATH "./mse_pae"
 #define COMPARE_FILES_PATH "./compare_files"
+
+// This is a comment to me, to help with emscripten testing.
+// This is written after the completion of the tests.
+// 1. Compile for the target platform (Linux), selecting from the following
+//    code the version that suits you; in particular, it should be the one
+//    that uses node. Ideally, create two versions of test_executables, one
+//    for WASM SIMD and one for WASM without SIMD -- use the linux cp command
+//    to create test_executables_simd and test_executables_no_simd.
+// 2. Compile again, without deleting what was compiled; this time, compile
+//    using emscripten, targeting WASM. The compilation is very finicky, so do
+//    'make clean && make' after every change in the code.
+// 3. cd to tests, and run test_executables_simd or test_executables_no_simd.
+ #define EXPAND_EXECUTABLE "./ojph_expand" #define COMPRESS_EXECUTABLE "./ojph_compress" +//#define EXPAND_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_expand.js" +//#define COMPRESS_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_compress.js" +//#define EXPAND_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_expand_simd.js" +//#define COMPRESS_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_compress_simd.js" +//#define EXPAND_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_expand" +//#define COMPRESS_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_compress" #endif #define TOL_DOUBLE 0.01 #define TOL_INTEGER 1
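Reviewer note, not part of the patch: every reversible SIMD branch above specializes one scalar recurrence. Analysis adds (b + a * (s1 + s2)) >> e to each destination sample, and synthesis subtracts the identical quantity, which is what makes the integer transform exactly invertible; the a == 1 branch and the a == -1, b == 1, e == 1 branch are just the multiply-free 5/3 update and predict special cases. A minimal scalar sketch with illustrative names (rev_lift_scalar is not a function in this patch):

    // Scalar reference for one reversible lifting step; src1/src2 are the
    // two neighbors feeding each destination sample.
    #include <cstdint>

    static void rev_lift_scalar(int64_t* dst, const int64_t* src1,
                                const int64_t* src2, uint32_t repeat,
                                int64_t a, int64_t b, uint8_t e, bool synthesis)
    {
      for (uint32_t i = repeat; i > 0; --i)
      {
        int64_t v = (b + a * (*src1++ + *src2++)) >> e; // generic lifting term
        if (synthesis)
          *dst++ -= v;  // synthesis undoes exactly what analysis added
        else
          *dst++ += v;
      }
    }

    int main()
    {
      int64_t d[2] = { 10, 20 }, s1[2] = { 3, 5 }, s2[2] = { 7, 9 };
      rev_lift_scalar(d, s1, s2, 2, -1, 1, 1, false); // 5/3 predict: a=-1,b=1,e=1
      rev_lift_scalar(d, s1, s2, 2, -1, 1, 1, true);  // inverse step
      return (d[0] == 10 && d[1] == 20) ? 0 : 1;      // round trip restores input
    }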
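Reviewer note, not part of the patch: sse2_mm_srai_epi64 works around SSE2's missing 64-bit arithmetic right shift. A logical shift leaves the original sign bit at position 63 - amt; XORing with m = 1 << (63 - amt) and then subtracting m smears that bit across all higher positions. A scalar sketch of the identity (srai64_emulated is illustrative; it assumes >> on a signed 64-bit value is arithmetic, which holds on the compilers this code targets):

    #include <cassert>
    #include <cstdint>

    static int64_t srai64_emulated(int64_t a, int amt)
    {
      uint64_t m = 1ULL << (63 - amt); // position of the shifted sign bit
      uint64_t x = (uint64_t)a >> amt; // logical shift: zero fill from the top
      return (int64_t)((x ^ m) - m);   // extend the sign bit downwards
    }

    int main()
    {
      for (int amt = 1; amt < 63; ++amt)
      {
        assert(srai64_emulated(-1000000007LL, amt) == (-1000000007LL >> amt));
        assert(srai64_emulated( 1000000007LL, amt) == ( 1000000007LL >> amt));
      }
      return 0;
    }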
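Reviewer note, not part of the patch: the interleave/deinterleave helpers introduced here (sse_deinterleave32, sse2_deinterleave64, wasm_interleave64, and the AVX-512 counterparts) drop the old even parameter; the callers now swap the low/high buffer pointers instead, so each helper always sends even-indexed samples to its first output. A scalar model of the split, with illustrative names:

    // Scalar model of what *_deinterleave32/64 do: even-indexed samples go
    // to the first buffer, odd-indexed samples to the second.
    static void deinterleave_scalar(double* dpl, double* dph,
                                    const double* sp, int width)
    {
      for (int i = 0; i < width; i += 2)
      {
        *dpl++ = sp[i];                        // even phase
        if (i + 1 < width) *dph++ = sp[i + 1]; // odd phase
      }
    }

    // Caller-side phase selection, mirroring the patch: when 'even' is false,
    // the odd-phase samples are the low-pass ones, so the pointers swap.
    //   double* dpl = even ? ldst : hdst;
    //   double* dph = even ? hdst : ldst;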
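Reviewer note, not part of the patch: the width == 1 branches pair up across analysis and synthesis: a lone odd-phase (high-pass) sample is stored doubled (<< 1) by rev_horz_ana and halved (>> 1) by rev_horz_syn, so a single-sample line still round-trips losslessly. A tiny check of that round trip:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      int64_t src = 37;
      int64_t h   = src << 1; // analysis, width == 1, even == false
      int64_t dst = h >> 1;   // synthesis, width == 1, even == false
      assert(dst == src);     // the doubling and halving cancel exactly
      return 0;
    }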