From f28a90fce49edf94d44865add3299650058ebe6d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 13 Apr 2024 17:57:45 +1000 Subject: [PATCH] Wasm completed -- not tested yet. --- src/apps/ojph_compress/CMakeLists.txt | 2 +- src/apps/ojph_expand/CMakeLists.txt | 2 +- src/core/CMakeLists.txt | 2 +- src/core/transform/ojph_transform.cpp | 18 +- src/core/transform/ojph_transform_avx512.cpp | 10 +- src/core/transform/ojph_transform_wasm.cpp | 957 +++++++++++-------- tests/CMakeLists.txt | 16 +- 7 files changed, 596 insertions(+), 411 deletions(-) diff --git a/src/apps/ojph_compress/CMakeLists.txt b/src/apps/ojph_compress/CMakeLists.txt index bbb77abc..dadcca9b 100644 --- a/src/apps/ojph_compress/CMakeLists.txt +++ b/src/apps/ojph_compress/CMakeLists.txt @@ -17,7 +17,7 @@ source_group("others" FILES ${OJPH_IMG_IO}) source_group("common" FILES ${OJPH_IMG_IO_H}) if(EMSCRIPTEN) - add_compile_options(-std=c++11 -O3 -fexceptions -DOJPH_DISABLE_INTEL_SIMD) + add_compile_options(-std=c++11 -O3 -fexceptions) add_executable(ojph_compress ${SOURCES}) add_executable(ojph_compress_simd ${SOURCES} ${OJPH_IMG_IO_SSE4}) target_compile_options(ojph_compress_simd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128 -msse4.1) diff --git a/src/apps/ojph_expand/CMakeLists.txt b/src/apps/ojph_expand/CMakeLists.txt index c0ac185e..d4b65523 100644 --- a/src/apps/ojph_expand/CMakeLists.txt +++ b/src/apps/ojph_expand/CMakeLists.txt @@ -17,7 +17,7 @@ source_group("others" FILES ${OJPH_IMG_IO}) source_group("common" FILES ${OJPH_IMG_IO_H}) if(EMSCRIPTEN) - add_compile_options(-std=c++11 -O3 -fexceptions -DOJPH_DISABLE_INTEL_SIMD) + add_compile_options(-std=c++11 -O3 -fexceptions) add_executable(ojph_expand ${SOURCES}) add_executable(ojph_expand_simd ${SOURCES} ${OJPH_IMG_IO_SSE4}) target_compile_options(ojph_expand_simd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128 -msse4.1) diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 19123a2e..40fffa48 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -33,7 +33,7 @@ source_group("others" FILES ${OTHERS}) source_group("transform" FILES ${TRANSFORM}) if(EMSCRIPTEN) - add_compile_options(-std=c++11 -O3 -fexceptions -DOJPH_DISABLE_INTEL_SIMD) + add_compile_options(-std=c++11 -O3 -fexceptions) add_library(openjph ${SOURCES}) add_library(openjphsimd ${SOURCES} ${CODESTREAM_WASM} ${CODING_WASM} ${TRANSFORM_WASM}) target_include_directories(openjph PUBLIC common) diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp index 83eed644..0dc5f95c 100644 --- a/src/core/transform/ojph_transform.cpp +++ b/src/core/transform/ojph_transform.cpp @@ -162,16 +162,14 @@ namespace ojph { #endif // !OJPH_DISABLE_INTEL_SIMD #else // OJPH_ENABLE_WASM_SIMD - rev_vert_ana_step = wasm_rev_vert_ana_step; - rev_horz_ana = wasm_rev_horz_ana; - rev_vert_syn_step = wasm_rev_vert_syn_step; - rev_horz_syn = wasm_rev_horz_syn; - - irv_vert_ana_step = wasm_irv_vert_ana_step; - irv_horz_ana = wasm_irv_horz_ana; - irv_vert_syn_step = wasm_irv_vert_syn_step; - irv_horz_syn = wasm_irv_horz_syn; - irv_vert_times_K = wasm_irv_vert_times_K; + rev_vert_step = wasm_rev_vert_step; + rev_horz_ana = wasm_rev_horz_ana; + rev_horz_syn = wasm_rev_horz_syn; + + irv_vert_step = wasm_irv_vert_step; + irv_vert_times_K = wasm_irv_vert_times_K; + irv_horz_ana = wasm_irv_horz_ana; + irv_horz_syn = wasm_irv_horz_syn; #endif // !OJPH_ENABLE_WASM_SIMD wavelet_transform_functions_initialized = true; diff --git a/src/core/transform/ojph_transform_avx512.cpp 
b/src/core/transform/ojph_transform_avx512.cpp index 02edca60..504aa870 100644 --- a/src/core/transform/ojph_transform_avx512.cpp +++ b/src/core/transform/ojph_transform_avx512.cpp @@ -2,9 +2,9 @@ // This software is released under the 2-Clause BSD license, included // below. // -// Copyright (c) 2019, Aous Naman -// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia -// Copyright (c) 2019, The University of New South Wales, Australia +// Copyright (c) 2019-2024, Aous Naman +// Copyright (c) 2019-2024, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019-2024, The University of New South Wales, Australia // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are @@ -30,9 +30,9 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //***************************************************************************/ // This file is part of the OpenJPH software implementation. -// File: ojph_transform_avx2.cpp +// File: ojph_transform_avx512.cpp // Author: Aous Naman -// Date: 28 August 2019 +// Date: 13 April 2024 //***************************************************************************/ #include diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp index 8f48e352..7b9ffb10 100644 --- a/src/core/transform/ojph_transform_wasm.cpp +++ b/src/core/transform/ojph_transform_wasm.cpp @@ -41,6 +41,9 @@ #include "ojph_defs.h" #include "ojph_arch.h" #include "ojph_mem.h" +#include "ojph_params.h" +#include "../codestream/ojph_params_local.h" + #include "ojph_transform.h" #include "ojph_transform_local.h" @@ -48,473 +51,645 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void wasm_rev_vert_wvlt_fwd_predict(const line_buf* line_src1, - const line_buf* line_src2, - line_buf *line_dst, ui32 repeat) - { - si32 *dst = line_dst->i32; - const si32 *src1 = line_src1->i32, *src2 = line_src2->i32; - - for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4) - { - v128_t s1 = wasm_v128_load(src1); - v128_t s2 = wasm_v128_load(src2); - v128_t d = wasm_v128_load(dst); - s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1); - d = wasm_i32x4_sub(d, s1); - wasm_v128_store(dst, d); - } - } - - ////////////////////////////////////////////////////////////////////////// - void wasm_rev_vert_wvlt_fwd_update(const line_buf* line_src1, - const line_buf* line_src2, - line_buf *line_dst, ui32 repeat) + void wasm_deinterleave(float* dpl, float* dph, float* sp, + int width, bool even) { - si32 *dst = line_dst->i32; - const si32 *src1 = line_src1->i32, *src2 = line_src2->i32; - - v128_t offset = wasm_i32x4_splat(2); - for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4) - { - v128_t s1 = wasm_v128_load(src1); - s1 = wasm_i32x4_add(s1, offset); - v128_t s2 = wasm_v128_load(src2); - s2 = wasm_i32x4_add(s2, s1); - v128_t d = wasm_v128_load(dst); - d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2)); - wasm_v128_store(dst, d); - } + if (even) + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 4); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } + else + for (; width > 0; 
width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 4); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + wasm_v128_store(dpl, d); + wasm_v128_store(dph, c); + } } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, - line_buf *line_hdst, ui32 width, bool even) + void wasm_interleave(float* dp, float* spl, float* sph, + int width, bool even) { - if (width > 1) - { - si32 *src = line_src->i32; - si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32; - - const ui32 L_width = (width + (even ? 1 : 0)) >> 1; - const ui32 H_width = (width + (even ? 0 : 1)) >> 1; - - // extension - src[-1] = src[1]; - src[width] = src[width-2]; - // predict - const si32* sp = src + (even ? 1 : 0); - si32 *dph = hdst; - for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4) - { //this is doing twice the work it needs to do - //it can be definitely written better - v128_t s1 = wasm_v128_load(sp - 1); - v128_t s2 = wasm_v128_load(sp + 1); - v128_t d = wasm_v128_load(sp); - s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1); - v128_t d1 = wasm_i32x4_sub(d, s1); - sp += 4; - s1 = wasm_v128_load(sp - 1); - s2 = wasm_v128_load(sp + 1); - d = wasm_v128_load(sp); - s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1); - v128_t d2 = wasm_i32x4_sub(d, s1); - sp += 4; - d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6); - wasm_v128_store(dph, d); - } - - // extension - hdst[-1] = hdst[0]; - hdst[H_width] = hdst[H_width-1]; - // update - sp = src + (even ? 0 : 1); - const si32* sph = hdst + (even ? 
0 : 1); - si32 *dpl = ldst; - v128_t offset = wasm_i32x4_splat(2); - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4) + if (even) + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) { - v128_t s1 = wasm_v128_load(sph - 1); - s1 = wasm_i32x4_add(s1, offset); - v128_t s2 = wasm_v128_load(sph); - s2 = wasm_i32x4_add(s2, s1); - v128_t d1 = wasm_v128_load(sp); - v128_t d2 = wasm_v128_load(sp + 4); - v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6); - d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2)); - wasm_v128_store(dpl, d); + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); + v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(a, b); + // v128_t d = _mm_unpackhi_ps(a, b); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 4, d); } - } else - { - if (even) - line_ldst->i32[0] = line_src->i32[0]; - else - line_hdst->i32[0] = line_src->i32[0] << 1; - } + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i32x4_shuffle(b, a, 0, 4 + 0, 1, 4 + 1); + v128_t d = wasm_i32x4_shuffle(b, a, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(b, a); + // v128_t d = _mm_unpackhi_ps(b, a); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 4, d); + } } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_vert_wvlt_bwd_predict(const line_buf *line_src1, - const line_buf *line_src2, - line_buf *line_dst, ui32 repeat) + static inline void wasm_multiply_const(float* p, float f, int width) { - si32 *dst = line_dst->i32; - const si32 *src1 = line_src1->i32, *src2 = line_src2->i32; - - for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4) + v128_t factor = wasm_f32x4_splat(f); + for (; width > 0; width -= 4, p += 4) { - v128_t s1 = wasm_v128_load(src1); - v128_t s2 = wasm_v128_load(src2); - v128_t d = wasm_v128_load(dst); - s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1); - d = wasm_i32x4_add(d, s1); - wasm_v128_store(dst, d); + v128_t s = wasm_v128_load(p); + wasm_v128_store(p, wasm_f32x4_mul(factor, s)); } } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_vert_wvlt_bwd_update(const line_buf *line_src1, - const line_buf *line_src2, - line_buf *line_dst, ui32 repeat) + void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) { - si32 *dst = line_dst->i32; - const si32 *src1 = line_src1->i32, *src2 = line_src2->i32; - - v128_t offset = wasm_i32x4_splat(2); - for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4) + float a = s->irv.Aatk; + if (synthesis) + a = -a; + + v128_t factor = wasm_f32x4_splat(a); + + float* dst = aug->f32; + const float* src1 = sig->f32, * src2 = other->f32; + int i = (int)repeat; + for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) { v128_t s1 = wasm_v128_load(src1); - s1 = wasm_i32x4_add(s1, offset); v128_t s2 = wasm_v128_load(src2); - s2 = wasm_i32x4_add(s2, s1); - v128_t d = wasm_v128_load(dst); - d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2)); + v128_t d = wasm_v128_load(dst); + d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2))); wasm_v128_store(dst, d); } } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, - line_buf *line_hsrc, ui32 width, 
bool even) + void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat) + { + wasm_multiply_const(aug->f32, K, (int)repeat); + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32; - si32 *dst = line_dst->i32; - - const ui32 L_width = (width + (even ? 1 : 0)) >> 1; - const ui32 H_width = (width + (even ? 0 : 1)) >> 1; + // split src into ldst and hdst + wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); - // extension - hsrc[-1] = hsrc[0]; - hsrc[H_width] = hsrc[H_width-1]; - //inverse update - const si32 *sph = hsrc + (even ? 0 : 1); - si32 *spl = lsrc; - v128_t offset = wasm_i32x4_splat(2); - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4) + // the actual horizontal transform + float* hp = hdst->f32, * lp = ldst->f32; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) { - v128_t s1 = wasm_v128_load(sph - 1); - s1 = wasm_i32x4_add(s1, offset); - v128_t s2 = wasm_v128_load(sph); - s2 = wasm_i32x4_add(s2, s1); - v128_t d = wasm_v128_load(spl); - d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2)); - wasm_v128_store(spl, d); + const lifting_step* s = atk->get_step(j - 1); + const float a = s->irv.Aatk; + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const float* sp = lp; + float* dp = hp; + int i = (int)h_width; + v128_t f = wasm_f32x4_splat(a); + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = wasm_v128_load(sp); + v128_t n = wasm_v128_load(sp + 1); + v128_t p = wasm_v128_load(dp); + p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n))); + wasm_v128_store(dp, p); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = wasm_v128_load(sp); + v128_t n = wasm_v128_load(sp - 1); + v128_t p = wasm_v128_load(dp); + p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n))); + wasm_v128_store(dp, p); + } + } + + // swap buffers + float* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; } - // extension - lsrc[-1] = lsrc[0]; - lsrc[L_width] = lsrc[L_width - 1]; - // inverse predict and combine - si32 *dp = dst + (even ? 0 : -1); - spl = lsrc + (even ? 0 : -1); - sph = hsrc; - ui32 width = L_width + (even ? 
0 : 1); - for (ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8) - { - v128_t s1 = wasm_v128_load(spl); - v128_t s2 = wasm_v128_load(spl + 1); - v128_t d = wasm_v128_load(sph); - s2 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1); - d = wasm_i32x4_add(d, s2); - wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5)); - wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7)); + { // multiply by K or 1/K + float K = atk->get_K(); + float K_inv = 1.0f / K; + wasm_multiply_const(lp, K_inv, (int)l_width); + wasm_multiply_const(hp, K, (int)h_width); } } - else - { + else { if (even) - line_dst->i32[0] = line_lsrc->i32[0]; + ldst->f32[0] = src->f32[0]; else - line_dst->i32[0] = line_hsrc->i32[0] >> 1; + hdst->f32[0] = src->f32[0] * 2.0f; } } ////////////////////////////////////////////////////////////////////////// - void wasm_irrev_vert_wvlt_step(const line_buf *line_src1, - const line_buf *line_src2, - line_buf *line_dst, int step_num, - ui32 repeat) + void wasm_irv_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { - float *dst = line_dst->f32; - const float *src1 = line_src1->f32, *src2 = line_src2->f32; - - v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[step_num]); - for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4) + if (width > 1) { - v128_t s1 = wasm_v128_load(src1); - v128_t s2 = wasm_v128_load(src2); - v128_t d = wasm_v128_load(dst); - d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2))); - wasm_v128_store(dst, d); + bool ev = even; + float* oth = hsrc->f32, * aug = lsrc->f32; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + + { // multiply by K or 1/K + float K = atk->get_K(); + float K_inv = 1.0f / K; + wasm_multiply_const(aug, K, (int)aug_width); + wasm_multiply_const(oth, K_inv, (int)oth_width); + } + + // the actual horizontal transform + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const float a = s->irv.Aatk; + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const float* sp = oth; + float* dp = aug; + int i = (int)aug_width; + v128_t f = wasm_f32x4_splat(a); + if (ev) + { + for ( ; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = wasm_v128_load(sp); + v128_t n = wasm_v128_load(sp - 1); + v128_t p = wasm_v128_load(dp); + p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n))); + wasm_v128_store(dp, p); + } + } + else + { + for ( ; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t m = wasm_v128_load(sp); + v128_t n = wasm_v128_load(sp + 1); + v128_t p = wasm_v128_load(dp); + p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n))); + wasm_v128_store(dp, p); + } + } + + // swap buffers + float* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + } + else { + if (even) + dst->f32[0] = lsrc->f32[0]; + else + dst->f32[0] = hsrc->f32[0] * 0.5f; } } ///////////////////////////////////////////////////////////////////////// - void wasm_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst, - bool L_analysis_or_H_synthesis, ui32 repeat) + void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, 
bool synthesis)
    {
      const si32 a = s->rev.Aatk;
      const si32 b = s->rev.Batk;
      const ui32 e = s->rev.Eatk;
      v128_t vb = wasm_i32x4_splat(b);

      si32* dst = aug->i32;
      const si32* src1 = sig->i32, * src2 = other->i32;
      // The general definition of the wavelet in Part 2 is slightly
      // different from that in Part 1, although the two are mathematically
      // equivalent. Here, we identify the simpler form from Part 1 and
      // employ it.
      if (a == 1)
      { // 5/3 update and any case with a == 1
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_add(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else if (a == -1 && b == 1 && e == 1)
      { // 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t w = wasm_i32x4_shr(t, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else if (a == -1)
      { // any case with a == -1, which is not 5/3 predict
        int i = (int)repeat;
        if (synthesis)
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_sub(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
        else
          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
          {
            v128_t s1 = wasm_v128_load((v128_t*)src1);
            v128_t s2 = wasm_v128_load((v128_t*)src2);
            v128_t d = wasm_v128_load((v128_t*)dst);
            v128_t t = wasm_i32x4_add(s1, s2);
            v128_t v = wasm_i32x4_sub(vb, t);
            v128_t w = wasm_i32x4_shr(v, e);
            d = wasm_i32x4_add(d, w);
            wasm_v128_store((v128_t*)dst, d);
          }
      }
      else { // general case
        // 32bit multiplication is not supported in sse2; we need sse4.1,
        // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
        // keeping the LSBs
        if (synthesis)
          for (ui32 i = repeat; i > 0; --i)
            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
        else
for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; } } ///////////////////////////////////////////////////////////////////////// - void wasm_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, - line_buf *line_hdst, ui32 width, - bool even) + void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - float *src = line_src->f32; - float *ldst = line_ldst->f32, *hdst = line_hdst->f32; - - const ui32 L_width = (width + (even ? 1 : 0)) >> 1; - const ui32 H_width = (width + (even ? 0 : 1)) >> 1; - - //extension - src[-1] = src[1]; - src[width] = src[width-2]; - // predict - const float* sp = src + (even ? 1 : 0); - float *dph = hdst; - v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[0]); - for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4) - { //this is doing twice the work it needs to do - //it can be definitely written better - v128_t s1 = wasm_v128_load(sp - 1); - v128_t s2 = wasm_v128_load(sp + 1); - v128_t d = wasm_v128_load(sp); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - v128_t d1 = wasm_f32x4_add(d, s1); - sp += 4; - s1 = wasm_v128_load(sp - 1); - s2 = wasm_v128_load(sp + 1); - d = wasm_v128_load(sp); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - v128_t d2 = wasm_f32x4_add(d, s1); - sp += 4; - d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6); - wasm_v128_store(dph, d); - } + // combine both lsrc and hsrc into dst + wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); - // extension - hdst[-1] = hdst[0]; - hdst[H_width] = hdst[H_width-1]; - // update - factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[1]); - sp = src + (even ? 0 : 1); - const float* sph = hdst + (even ? 0 : 1); - float *dpl = ldst; - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4) + si32* hp = hdst->i32, * lp = ldst->i32; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) { - v128_t s1 = wasm_v128_load(sph - 1); - v128_t s2 = wasm_v128_load(sph); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - v128_t d1 = wasm_v128_load(sp); - v128_t d2 = wasm_v128_load(sp + 4); - v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6); - d = wasm_f32x4_add(d, s1); - wasm_v128_store(dpl, d); - } + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui32 e = s->rev.Eatk; + v128_t vb = wasm_i32x4_splat(b); - //extension - ldst[-1] = ldst[0]; - ldst[L_width] = ldst[L_width-1]; - //predict - factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[2]); - const float* spl = ldst + (even ? 
1 : 0); - dph = hdst; - for (ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4) - { - v128_t s1 = wasm_v128_load(spl - 1); - v128_t s2 = wasm_v128_load(spl); - v128_t d = wasm_v128_load(dph); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - d = wasm_f32x4_add(d, s1); - wasm_v128_store(dph, d); - } + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si32* sp = lp; + si32* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_add(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_add(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t w = wasm_i32x4_shr(t, e); + d = wasm_i32x4_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t w = wasm_i32x4_shr(t, e); + d = wasm_i32x4_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_sub(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_sub(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else { + // general case + // 32bit multiplication is not supported in sse2; we need sse4.1, + // where we can use _mm_mullo_epi32, which multiplies + // 32bit x 32bit, keeping the LSBs + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } - // extension - hdst[-1] = hdst[0]; - hdst[H_width] = hdst[H_width-1]; - // update - factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[3]); - sph = hdst + (even ? 
0 : 1); - dpl = ldst; - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4) - { - v128_t s1 = wasm_v128_load(sph - 1); - v128_t s2 = wasm_v128_load(sph); - v128_t d = wasm_v128_load(dpl); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - d = wasm_f32x4_add(d, s1); - wasm_v128_store(dpl, d); - } - - //multipliers - float *dp = ldst; - factor = wasm_f32x4_splat(LIFTING_FACTORS::K_inv); - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4) - { - v128_t d = wasm_v128_load(dp); - wasm_v128_store(dp, wasm_f32x4_mul(factor, d)); - } - dp = hdst; - factor = wasm_f32x4_splat(LIFTING_FACTORS::K); - for (int i = (H_width + 3) >> 2; i > 0; --i, dp+=4) - { - v128_t d = wasm_v128_load(dp); - wasm_v128_store(dp, wasm_f32x4_mul(factor, d)); + // swap buffers + si32* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; } } - else - { + else { if (even) - line_ldst->f32[0] = line_src->f32[0]; + ldst->i32[0] = src->i32[0]; else - line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0]; + hdst->i32[0] = src->i32[0] << 1; } } - - ///////////////////////////////////////////////////////////////////////// - void wasm_irrev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, - line_buf *line_hsrc, ui32 width, - bool even) + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { - float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32; - float *dst = line_dst->f32; - - const ui32 L_width = (width + (even ? 1 : 0)) >> 1; - const ui32 H_width = (width + (even ? 0 : 1)) >> 1; - - //multipliers - float *dp = lsrc; - v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::K); - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4) - { - v128_t d = wasm_v128_load(dp); - wasm_v128_store(dp, wasm_f32x4_mul(factor, d)); - } - dp = hsrc; - factor = wasm_f32x4_splat(LIFTING_FACTORS::K_inv); - for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4) - { - v128_t d = wasm_v128_load(dp); - wasm_v128_store(dp, wasm_f32x4_mul(factor, d)); - } - - //extension - hsrc[-1] = hsrc[0]; - hsrc[H_width] = hsrc[H_width-1]; - //inverse update - factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[7]); - const float *sph = hsrc + (even ? 0 : 1); - float *dpl = lsrc; - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4) + bool ev = even; + si32* oth = hsrc->i32, * aug = lsrc->i32; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) { - v128_t s1 = wasm_v128_load(sph - 1); - v128_t s2 = wasm_v128_load(sph); - v128_t d = wasm_v128_load(dpl); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - d = wasm_f32x4_add(d, s1); - wasm_v128_store(dpl, d); - } + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui32 e = s->rev.Eatk; + v128_t vb = wasm_i32x4_splat(b); - //extension - lsrc[-1] = lsrc[0]; - lsrc[L_width] = lsrc[L_width-1]; - //inverse perdict - factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[6]); - const float *spl = lsrc + (even ? 
0 : -1); - float *dph = hsrc; - for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4) - { - v128_t s1 = wasm_v128_load(spl); - v128_t s2 = wasm_v128_load(spl + 1); - v128_t d = wasm_v128_load(dph); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - d = wasm_f32x4_add(d, s1); - wasm_v128_store(dph, d); - } + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si32* sp = oth; + si32* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_add(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_add(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t w = wasm_i32x4_shr(t, e); + d = wasm_i32x4_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t w = wasm_i32x4_shr(t, e); + d = wasm_i32x4_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_sub(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i32x4_add(s1, s2); + v128_t v = wasm_i32x4_sub(vb, t); + v128_t w = wasm_i32x4_shr(v, e); + d = wasm_i32x4_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else { + // general case + // 32bit multiplication is not supported in sse2; we need sse4.1, + // where we can use _mm_mullo_epi32, which multiplies + // 32bit x 32bit, keeping the LSBs + if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } - //extension - hsrc[-1] = hsrc[0]; - hsrc[H_width] = hsrc[H_width-1]; - //inverse update - factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[5]); - sph = hsrc + (even ? 
0 : 1); - dpl = lsrc; - for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4) - { - v128_t s1 = wasm_v128_load(sph - 1); - v128_t s2 = wasm_v128_load(sph); - v128_t d = wasm_v128_load(dpl); - s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - d = wasm_f32x4_add(d, s1); - wasm_v128_store(dpl, d); + // swap buffers + si32* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; } - //extension - lsrc[-1] = lsrc[0]; - lsrc[L_width] = lsrc[L_width-1]; - //inverse perdict and combine - factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[4]); - dp = dst + (even ? 0 : -1); - spl = lsrc + (even ? 0 : -1); - sph = hsrc; - ui32 width = L_width + (even ? 0 : 1); - for (ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8) - { - v128_t s1 = wasm_v128_load(spl); - v128_t s2 = wasm_v128_load(spl + 1); - v128_t d = wasm_v128_load(sph); - s2 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)); - d = wasm_f32x4_add(d, s2); - wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5)); - wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7)); - } + // combine both lsrc and hsrc into dst + wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); } - else - { + else { if (even) - line_dst->f32[0] = line_lsrc->f32[0]; + dst->i32[0] = lsrc->i32[0]; else - line_dst->f32[0] = line_hsrc->f32[0] * 0.5f; + dst->i32[0] = hsrc->i32[0] >> 1; } } - - } -} + + } // !local +} // !ojph diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 48c8f67d..000409ff 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -69,10 +69,22 @@ else() COMMAND ${CMAKE_COMMAND} -E copy "$" "./" COMMAND ${CMAKE_COMMAND} -E copy "$" "./" ) + if(EMSCRIPTEN) + add_custom_command(TARGET test_executables POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "$" "./" + COMMAND ${CMAKE_COMMAND} -E copy "$" "./" + ) + add_custom_command(TARGET test_executables POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "$/ojph_expand.wasm" "./" + COMMAND ${CMAKE_COMMAND} -E copy "$/ojph_compress.wasm" "./" + COMMAND ${CMAKE_COMMAND} -E copy "$/ojph_expand_simd.wasm" "./" + COMMAND ${CMAKE_COMMAND} -E copy "$/ojph_compress_simd.wasm" "./" + ) + endif(EMSCRIPTEN) if(MSYS) add_custom_command(TARGET test_executables POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "../bin/msys-gtest.dll" "./" COMMAND ${CMAKE_COMMAND} -E copy "../bin/msys-gtest_main.dll" "./" ) - endif() -endif() + endif(MSYS) +endif(MSVC)
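Since the commit message notes the WASM paths are not tested yet, a scalar reference of the two new vertical lifting steps may help when writing those tests: every SIMD branch in wasm_rev_vert_step and wasm_irv_vert_step is expected to reproduce exactly these values. The sketch below is not part of the patch; it restates the step arithmetic over plain buffers, with a, b and e standing for s->rev.Aatk, s->rev.Batk and s->rev.Eatk (or s->irv.Aatk in the irreversible case), matching the scalar general-case fallback already present in the patch.

#include <cstdint>
#include <cstddef>

// Reference for one reversible (integer) vertical lifting step:
//   analysis:  aug[i] += (b + a * (sig[i] + other[i])) >> e
//   synthesis: aug[i] -= (b + a * (sig[i] + other[i])) >> e
// The shift is an arithmetic shift of a possibly negative value, as in the
// patch's own scalar fallback. wasm_rev_vert_step only special-cases
// a == 1 and a == -1; each branch should match this loop bit-for-bit.
static void rev_vert_step_ref(int32_t a, int32_t b, uint32_t e,
                              const int32_t* sig, const int32_t* other,
                              int32_t* aug, size_t repeat, bool synthesis)
{
  for (size_t i = 0; i < repeat; ++i)
  {
    int32_t v = (b + a * (sig[i] + other[i])) >> e;
    aug[i] = synthesis ? aug[i] - v : aug[i] + v;
  }
}

// Reference for one irreversible (float) vertical lifting step; synthesis
// simply negates the step gain, as wasm_irv_vert_step does.
static void irv_vert_step_ref(float a, const float* sig, const float* other,
                              float* aug, size_t repeat, bool synthesis)
{
  float g = synthesis ? -a : a;
  for (size_t i = 0; i < repeat; ++i)
    aug[i] += g * (sig[i] + other[i]);
}

A test could fill sig, other and aug with random data, run the reference and the WASM versions on copies of the same lines, and compare the results, including lengths that are not a multiple of 4, since the SIMD loops step in groups of 4 and may touch up to three elements beyond repeat.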