diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1c4fbe494..9ebe1ec42 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,9 +2,52 @@
 This is a list of notable changes to Hyperscan, in reverse chronological order.
 
+## [4.5.0] 2017-06-09
+- New API feature: approximate matching using the "edit distance" extended
+  parameter. This allows the user to request all matches that are a given edit
+  distance from an exact match for a pattern.
+- Initial support for Intel(R) Advanced Vector Extensions 512 (Intel(R)
+  AVX-512), disabled by default. To enable it, pass `-DBUILD_AVX512=1` to
+  `cmake`.
+- Major compile time improvements in many subsystems, reducing compile time
+  significantly for many large pattern sets.
+- Internal reworking of literal matchers to operate on literals of at
+  most eight characters, with subsequent confirmation done in the Rose
+  interpreter. This reduces complexity and bytecode size and improves
+  performance for many pattern sets.
+- Improve performance of the FDR literal matcher front end.
+- Improve bucket assignment and other heuristics governing the FDR literal
+  matcher.
+- Improve optimisation passes that take advantage of extended parameter
+  constraints (`min_offset`, etc).
+- Introduce further lookaround specialisations to improve scanning performance.
+- Optimise Rose interpreter construction to reduce the length of programs
+  generated in some situations.
+- Remove the old "Rose" pattern decomposition analysis pass in favour of the
+  new "Violet" pass introduced in Hyperscan 4.3.0.
+- In streaming mode, allow exhaustion (where the stream can no longer produce
+  matches) to be detected in more situations, improving scanning performance.
+- Improve parsing of control verbs (such as `(*UTF8)`) that can only occur at
+  the beginning of the pattern. Combinations of supported verbs in any order
+  are now permitted.
+- Update version of PCRE used by testing tools as a syntax and semantic
+  reference to PCRE 8.40.
+- Tuning support for Intel(R) microarchitecture code names Skylake, Skylake
+  Server, Goldmont.
+- CMake: when building a native build with a version of GCC that doesn't
+  recognise the host CPU, tune for the microarch selected by
+  `-march=native`.
+- CMake: don't fail if SQLite (which is only required to build the `hsbench`
+  tool) is not present.
+- CMake: detect libc++ directly and use that to inform the Boost version
+  requirement.
+- Bugfix for issue #51: make the fat runtime build wrapper less fragile.
+- Bugfix for issues #46, #52: use `sqlite3_errmsg()` to allow SQLite 3.6.x to
+  be used. Thanks to @EaseTheWorld for the PR.
+
 ## [4.4.1] 2017-02-28
 - Bugfixes to fix issues where stale data was being referenced in scratch
-  memory. In particular this may have resulted in hs_close_stream()
+  memory. In particular this may have resulted in `hs_close_stream()`
   referencing data from other previously scanned streams. This may have
   resulted in incorrect matches being reported.
@@ -142,9 +185,7 @@ This is a list of notable changes to Hyperscan, in reverse chronological order.
   supplied with a NULL scratch pointer if no matches are required. This is in
   line with the behaviour of `hs_close_stream()`.
 - Disallow bounded repeats with a very large minimum repeat but no maximum,
-  i.e. {
-  N,
-} for very large N.
+  i.e. {N,} for very large N.
 - Reduce compile memory usage in literal set expansion for some large cases.
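To make the 4.5.0 approximate matching entry above concrete, here is a minimal sketch of the new extended parameter in use. The `HS_EXT_FLAG_EDIT_DISTANCE` flag and `edit_distance` field are the additions from this release; `hs_compile_ext_multi()` and the scratch/scan calls are the existing public API, and the pattern and corpus below are illustrative only:

```c
#include <stdio.h>
#include <string.h>
#include <hs/hs.h>

/* Report each match produced by approximate matching. */
static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx) {
    printf("match for pattern %u ending at offset %llu\n", id, to);
    return 0; /* continue scanning */
}

int main(void) {
    /* Request matches within Levenshtein distance 1 of /foobar/. */
    hs_expr_ext_t ext;
    memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_EDIT_DISTANCE;
    ext.edit_distance = 1;

    const char *expr = "foobar";
    const unsigned int flags = 0;
    const unsigned int id = 1;
    const hs_expr_ext_t *ext_ptr = &ext;

    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;
    if (hs_compile_ext_multi(&expr, &flags, &id, &ext_ptr, 1, HS_MODE_BLOCK,
                             NULL, &db, &err) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", err->message);
        hs_free_compile_error(err);
        return 1;
    }

    hs_scratch_t *scratch = NULL;
    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) {
        hs_free_database(db);
        return 1;
    }

    const char data[] = "xxx fo0bar xxx"; /* one substitution away */
    hs_scan(db, data, sizeof(data) - 1, 0, scratch, on_match, NULL);

    hs_free_scratch(scratch);
    hs_free_database(db);
    return 0;
}
```

Scanning `"xxx fo0bar xxx"` reports a match for pattern 1, since `fo0bar` is a single substitution away from an exact match for `foobar`.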
## [4.0.0] 2015-10-20 diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ede52b45..7f452696a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,20 +1,22 @@ cmake_minimum_required (VERSION 2.8.11) -project (Hyperscan C CXX) +project (hyperscan C CXX) set (HS_MAJOR_VERSION 4) -set (HS_MINOR_VERSION 4) -set (HS_PATCH_VERSION 1) +set (HS_MINOR_VERSION 5) +set (HS_PATCH_VERSION 0) set (HS_VERSION ${HS_MAJOR_VERSION}.${HS_MINOR_VERSION}.${HS_PATCH_VERSION}) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) include(CheckCCompilerFlag) include(CheckCXXCompilerFlag) +include(CheckCXXSymbolExists) INCLUDE (CheckFunctionExists) INCLUDE (CheckIncludeFiles) INCLUDE (CheckIncludeFileCXX) INCLUDE (CheckLibraryExists) INCLUDE (CheckSymbolExists) include (CMakeDependentOption) +include (GNUInstallDirs) include (${CMAKE_MODULE_PATH}/platform.cmake) include (${CMAKE_MODULE_PATH}/ragel.cmake) @@ -36,6 +38,7 @@ endif() set(BINDIR "${PROJECT_BINARY_DIR}/bin") set(LIBDIR "${PROJECT_BINARY_DIR}/lib") +set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}) # First for the generic no-config case set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${BINDIR}") @@ -59,31 +62,6 @@ include_directories(${PROJECT_SOURCE_DIR}/src) include_directories(${PROJECT_BINARY_DIR}) include_directories(SYSTEM include) -set(BOOST_USE_STATIC_LIBS OFF) -set(BOOST_USE_MULTITHREADED OFF) -set(BOOST_USE_STATIC_RUNTIME OFF) -if (CMAKE_SYSTEM_NAME MATCHES "Darwin" - OR (CMAKE_SYSTEM_NAME MATCHES "FreeBSD" - AND CMAKE_C_COMPILER_ID MATCHES "Clang")) - # we need a more recent boost for libc++ used by clang on OSX and FreeBSD - set(BOOST_MINVERSION 1.61.0) -else () - set(BOOST_MINVERSION 1.57.0) -endif () -set(BOOST_NO_BOOST_CMAKE ON) - -# first check for Boost installed on the system -find_package(Boost ${BOOST_MINVERSION}) -if(NOT Boost_FOUND) - # we might have boost in tree, so provide a hint and try again - message(STATUS "trying include dir for boost") - set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include") - find_package(Boost ${BOOST_MINVERSION}) - if(NOT Boost_FOUND) - message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. Either install system packages if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.") - endif() -endif() - include (${CMAKE_MODULE_PATH}/boost.cmake) # -- make this work? set(python_ADDITIONAL_VERSIONS 2.7 2.6) @@ -132,6 +110,12 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) endif() endif() +if (NOT BUILD_SHARED_LIBS) + # build static libs + set(BUILD_STATIC_LIBS ON) + mark_as_advanced(BUILD_STATIC_LIBS) +endif () + #for config if (OPTIMISE) set(HS_OPTIMIZE ON) @@ -141,6 +125,9 @@ CMAKE_DEPENDENT_OPTION(DUMP_SUPPORT "Dump code support; normally on, except in r CMAKE_DEPENDENT_OPTION(DISABLE_ASSERTS "Disable assert(); Asserts are enabled in debug builds, disabled in release builds" OFF "NOT RELEASE_BUILD" ON) +option(BUILD_AVX512 "Experimental: support avx512 in the fat runtime" + OFF) + option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC to be set in project" OFF) # TODO: per platform config files? 
@@ -148,16 +135,21 @@ option(WINDOWS_ICC "Use Intel C++ Compiler on Windows, default off, requires ICC # TODO: windows generator on cmake always uses msvc, even if we plan to build with icc if(MSVC OR MSVC_IDE) message(STATUS "Building for Windows") + if (MSVC_VERSION LESS 1700) message(FATAL_ERROR "The project requires C++11 features.") else() if (WINDOWS_ICC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /QxHost /wd4267 /Qdiag-disable:remark") + set(ARCH_C_FLAGS "/QxHost") + set(ARCH_CXX_FLAGS "/QxHost") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O3 /Qstd=c99 /Qrestrict /wd4267 /Qdiag-disable:remark") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /Qstd=c++11 /Qrestrict /QxHost /wd4267 /wd4800 /Qdiag-disable:remark -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") else() - #TODO: don't hardcode arch - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /arch:AVX /wd4267") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /arch:AVX /wd4244 /wd4267 /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") + # todo: change these as required + set(ARCH_C_FLAGS "/arch:AVX2") + set(ARCH_CXX_FLAGS "/arch:AVX2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /O2 /wd4244 /wd4267") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /O2 /wd4244 /wd4267 /wd4800 -DBOOST_DETAIL_NO_CONTAINER_FWD -D_SCL_SECURE_NO_WARNINGS") endif() string(REPLACE "/RTC1" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") string(REPLACE "/RTC1" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") @@ -166,32 +158,58 @@ if(MSVC OR MSVC_IDE) set(CMAKE_C_FLAGS_DEBUG "/DNDEBUG ${CMAKE_C_FLAGS_DEBUG}") set(CMAKE_CXX_FLAGS_DEBUG "/DNDEBUG ${CMAKE_CXX_FLAGS_DEBUG}") endif () + + # flags only used to build hs libs + set(HS_C_FLAGS "/Gv") + set(HS_CXX_FLAGS "/Gv") endif() else() - # compiler version checks TODO: test more compilers - if (CMAKE_COMPILER_IS_GNUCXX) - set (GNUCXX_MINVER "4.8.1") - exec_program(${CMAKE_CXX_COMPILER} - ARGS ${CMAKE_CXX_COMPILER_ARG1} --version - OUTPUT_VARIABLE _GXX_OUTPUT) - # is the following too fragile? - string(REGEX REPLACE ".* ([0-9]\\.[0-9](\\.[0-9])?)( |\n).*" "\\1" - GNUCXX_VERSION "${_GXX_OUTPUT}") - message(STATUS "g++ version ${GNUCXX_VERSION}") - if (GNUCXX_VERSION VERSION_LESS ${GNUCXX_MINVER}) - message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support") - endif() - unset(_GXX_OUTPUT) - endif() - # remove CMake's idea of optimisation foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) string(REGEX REPLACE "-O[^ ]*" "" CMAKE_C_FLAGS_${CONFIG} "${CMAKE_C_FLAGS_${CONFIG}}") string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") endforeach () + if (CMAKE_COMPILER_IS_GNUCC) + message(STATUS "gcc version ${CMAKE_C_COMPILER_VERSION}") + # If gcc doesn't recognise the host cpu, then mtune=native becomes + # generic, which isn't very good in some cases. march=native looks at + # cpuid info and then chooses the best microarch it can (and replaces + # the flag), so use that for tune. 
+ + # arg1 might exist if using ccache + string (STRIP "${CMAKE_C_COMPILER_ARG1}" CC_ARG1) + set (EXEC_ARGS ${CC_ARG1} -c -Q --help=target -march=native -mtune=native) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_VARIABLE _GCC_OUTPUT) + string(REGEX REPLACE ".*march=[ \t]*([^ \n]*)[ \n].*" "\\1" + GNUCC_ARCH "${_GCC_OUTPUT}") + + # test the parsed flag + set (EXEC_ARGS ${CC_ARG1} -E - -mtune=${GNUCC_ARCH}) + execute_process(COMMAND ${CMAKE_C_COMPILER} ${EXEC_ARGS} + OUTPUT_QUIET ERROR_QUIET + INPUT_FILE /dev/null + RESULT_VARIABLE GNUCC_TUNE_TEST) + if (NOT GNUCC_TUNE_TEST EQUAL 0) + message(SEND_ERROR "Something went wrong determining gcc tune: -mtune=${GNUCC_ARCH} not valid") + endif() + set(TUNE_FLAG ${GNUCC_ARCH}) + else () + set(TUNE_FLAG native) + endif() + + # compiler version checks TODO: test more compilers + if (CMAKE_COMPILER_IS_GNUCXX) + set(GNUCXX_MINVER "4.8.1") + message(STATUS "g++ version ${CMAKE_CXX_COMPILER_VERSION}") + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS GNUCXX_MINVER) + message(FATAL_ERROR "A minimum of g++ ${GNUCXX_MINVER} is required for C++11 support") + endif() + endif() + if(OPTIMISE) set(OPT_C_FLAG "-O3") set(OPT_CXX_FLAG "-O2") @@ -216,12 +234,12 @@ else() set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -DNDEBUG") endif() - if (NOT CMAKE_C_FLAGS MATCHES .*march.*) - set(ARCH_C_FLAGS "${ARCH_C_FLAGS} -march=native -mtune=native") + if (NOT CMAKE_C_FLAGS MATCHES .*march.* AND NOT CMAKE_C_FLAGS MATCHES .*mtune.*) + set(ARCH_C_FLAGS "-march=native -mtune=${TUNE_FLAG}") endif() - if (NOT CMAKE_CXX_FLAGS MATCHES .*march.*) - set(ARCH_CXX_FLAGS "${ARCH_CXX_FLAGS} -march=native -mtune=native") + if (NOT CMAKE_CXX_FLAGS MATCHES .*march.* AND NOT CMAKE_CXX_FLAGS MATCHES .*mtune.*) + set(ARCH_CXX_FLAGS "-march=native -mtune=${TUNE_FLAG}") endif() if(CMAKE_COMPILER_IS_GNUCC) @@ -244,6 +262,11 @@ else() set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -Wno-abi") endif () + if (CMAKE_C_COMPILER_ID MATCHES "Intel") + set(SKYLAKE_FLAG "-xCORE-AVX512") + else () + set(SKYLAKE_FLAG "-march=skylake-avx512") + endif () endif() CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) @@ -259,6 +282,9 @@ CHECK_FUNCTION_EXISTS(_aligned_malloc HAVE__ALIGNED_MALLOC) CHECK_C_COMPILER_FLAG(-fvisibility=hidden HAS_C_HIDDEN) CHECK_CXX_COMPILER_FLAG(-fvisibility=hidden HAS_CXX_HIDDEN) +# are we using libc++ +CHECK_CXX_SYMBOL_EXISTS(_LIBCPP_VERSION ciso646 HAVE_LIBCPP) + if (RELEASE_BUILD) if (HAS_C_HIDDEN) set(EXTRA_C_FLAGS "${EXTRA_C_FLAGS} -fvisibility=hidden") @@ -294,13 +320,10 @@ endif () include (${CMAKE_MODULE_PATH}/arch.cmake) -if (NOT FAT_RUNTIME AND NOT HAVE_SSSE3) - message(FATAL_ERROR "A minimum of SSSE3 compiler support is required") -endif () - # testing a builtin takes a little more work CHECK_C_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CC_BUILTIN_ASSUME_ALIGNED) CHECK_CXX_SOURCE_COMPILES("void *aa_test(void *x) { return __builtin_assume_aligned(x, 16);}\nint main(void) { return 0; }" HAVE_CXX_BUILTIN_ASSUME_ALIGNED) +CHECK_C_SOURCE_COMPILES("int main(void) { __builtin_constant_p(0); }" HAVE__BUILTIN_CONSTANT_P) if (NOT WIN32) set(C_FLAGS_TO_CHECK @@ -404,13 +427,13 @@ endif() endif() if (NOT FAT_RUNTIME) -message(STATUS "Building for current host CPU") -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") + message(STATUS "Building for current host CPU: ${ARCH_C_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ARCH_C_FLAGS}") + 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") else() -message(STATUS "Building runtime for multiple microarchitectures") -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + message(STATUS "Building runtime for multiple microarchitectures") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() add_subdirectory(util) @@ -435,19 +458,18 @@ if (NOT WIN32) configure_file(libhs.pc.in libhs.pc @ONLY) # only replace @ quoted vars install(FILES ${CMAKE_BINARY_DIR}/libhs.pc - DESTINATION "${CMAKE_INSTALL_PREFIX}/lib/pkgconfig") + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") endif() # only set these after all tests are done if (NOT FAT_RUNTIME) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${HS_C_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS} ${HS_CXX_FLAGS}") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") endif() - if(NOT WIN32) set(RAGEL_C_FLAGS "-Wno-unused") endif() @@ -459,13 +481,20 @@ set_source_files_properties( ragelmaker(src/parser/Parser.rl) +set_source_files_properties( + ${CMAKE_BINARY_DIR}/src/parser/control_verbs.cpp + PROPERTIES + COMPILE_FLAGS "${RAGEL_C_FLAGS}") + +ragelmaker(src/parser/control_verbs.rl) + SET(hs_HEADERS src/hs.h src/hs_common.h src/hs_compile.h src/hs_runtime.h ) -install(FILES ${hs_HEADERS} DESTINATION include/hs) +install(FILES ${hs_HEADERS} DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/hs") set (hs_exec_common_SRCS src/alloc.c @@ -541,25 +570,6 @@ set (hs_exec_SRCS src/nfa/mpv.h src/nfa/mpv.c src/nfa/mpv_internal.h - src/nfa/multiaccel_common.h - src/nfa/multiaccel_doubleshift.h - src/nfa/multiaccel_doubleshiftgrab.h - src/nfa/multiaccel_long.h - src/nfa/multiaccel_longgrab.h - src/nfa/multiaccel_shift.h - src/nfa/multiaccel_shiftgrab.h - src/nfa/multishufti.c - src/nfa/multishufti_avx2.h - src/nfa/multishufti_sse.h - src/nfa/multishufti.h - src/nfa/multitruffle.c - src/nfa/multitruffle_avx2.h - src/nfa/multitruffle_sse.h - src/nfa/multitruffle.h - src/nfa/multivermicelli.c - src/nfa/multivermicelli.h - src/nfa/multivermicelli_sse.h - src/nfa/multivermicelli_avx2.h src/nfa/nfa_api.h src/nfa/nfa_api_dispatch.c src/nfa/nfa_internal.h @@ -573,13 +583,11 @@ set (hs_exec_SRCS src/nfa/sheng_impl.h src/nfa/sheng_impl4.h src/nfa/sheng_internal.h - src/nfa/shufti_common.h src/nfa/shufti.c src/nfa/shufti.h src/nfa/tamarama.c src/nfa/tamarama.h src/nfa/tamarama_internal.h - src/nfa/truffle_common.h src/nfa/truffle.c src/nfa/truffle.h src/nfa/vermicelli.h @@ -662,6 +670,7 @@ SET (hs_SRCS src/compiler/compiler.h src/compiler/error.cpp src/compiler/error.h + src/compiler/expression_info.h src/fdr/engine_description.cpp src/fdr/engine_description.h src/fdr/fdr_compile.cpp @@ -719,8 +728,6 @@ SET (hs_SRCS src/nfa/mpv_internal.h src/nfa/mpvcompile.cpp src/nfa/mpvcompile.h - src/nfa/multiaccel_compilehelper.cpp - src/nfa/multiaccel_compilehelper.h src/nfa/nfa_api.h src/nfa/nfa_api_queue.h src/nfa/nfa_api_util.h @@ -775,6 +782,8 @@ SET (hs_SRCS src/nfagraph/ng_extparam.h src/nfagraph/ng_fixed_width.cpp src/nfagraph/ng_fixed_width.h + src/nfagraph/ng_fuzzy.cpp + src/nfagraph/ng_fuzzy.h src/nfagraph/ng_haig.cpp src/nfagraph/ng_haig.h src/nfagraph/ng_holder.cpp @@ -820,8 +829,6 @@ SET (hs_SRCS src/nfagraph/ng_restructuring.h src/nfagraph/ng_revacc.cpp src/nfagraph/ng_revacc.h - 
src/nfagraph/ng_rose.cpp - src/nfagraph/ng_rose.h src/nfagraph/ng_sep.cpp src/nfagraph/ng_sep.h src/nfagraph/ng_small_literal_set.cpp @@ -893,6 +900,8 @@ SET (hs_SRCS src/parser/buildstate.h src/parser/check_refs.cpp src/parser/check_refs.h + src/parser/control_verbs.cpp + src/parser/control_verbs.h src/parser/parse_error.cpp src/parser/parse_error.h src/parser/parser_util.cpp @@ -928,6 +937,8 @@ SET (hs_SRCS src/rose/rose_build_compile.cpp src/rose/rose_build_convert.cpp src/rose/rose_build_convert.h + src/rose/rose_build_dedupe.cpp + src/rose/rose_build_engine_blob.cpp src/rose/rose_build_engine_blob.h src/rose/rose_build_exclusive.cpp src/rose/rose_build_exclusive.h @@ -936,6 +947,10 @@ SET (hs_SRCS src/rose/rose_build_impl.h src/rose/rose_build_infix.cpp src/rose/rose_build_infix.h + src/rose/rose_build_instructions.cpp + src/rose/rose_build_instructions.h + src/rose/rose_build_lit_accel.cpp + src/rose/rose_build_lit_accel.h src/rose/rose_build_long_lit.cpp src/rose/rose_build_long_lit.h src/rose/rose_build_lookaround.cpp @@ -947,6 +962,7 @@ SET (hs_SRCS src/rose/rose_build_misc.cpp src/rose/rose_build_program.cpp src/rose/rose_build_program.h + src/rose/rose_build_resources.h src/rose/rose_build_role_aliasing.cpp src/rose/rose_build_scatter.cpp src/rose/rose_build_scatter.h @@ -982,8 +998,12 @@ SET (hs_SRCS src/util/fatbit_build.h src/util/graph.h src/util/hash.h + src/util/hash_dynamic_bitset.h + src/util/math.h src/util/multibit_build.cpp src/util/multibit_build.h + src/util/noncopyable.h + src/util/operators.h src/util/order_check.h src/util/partial_store.h src/util/partitioned_set.h @@ -993,6 +1013,7 @@ SET (hs_SRCS src/util/report_manager.cpp src/util/report_manager.h src/util/simd_utils.h + src/util/small_vector.h src/util/target_info.cpp src/util/target_info.h src/util/ue2_containers.h @@ -1048,8 +1069,6 @@ set(hs_dump_SRCS src/rose/rose_build_dump.h src/rose/rose_in_dump.cpp src/rose/rose_in_dump.h - src/rose/rose_dump.cpp - src/rose/rose_dump.h src/util/dump_charclass.cpp src/util/dump_charclass.h src/util/dump_util.cpp @@ -1074,10 +1093,14 @@ if (NOT FAT_RUNTIME) set(hs_exec_SRCS ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) endif() - add_library(hs_exec OBJECT ${hs_exec_SRCS}) + if (BUILD_STATIC_LIBS) + add_library(hs_exec OBJECT ${hs_exec_SRCS}) + + add_library(hs_runtime STATIC src/hs_version.c src/hs_valid_platform.c $) + set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) - add_library(hs_runtime STATIC src/hs_version.c src/hs_valid_platform.c $) - set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) + add_library(hs STATIC ${hs_SRCS} src/hs_valid_platform.c $) + endif (BUILD_STATIC_LIBS) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) add_library(hs_exec_shared OBJECT ${hs_exec_SRCS}) @@ -1085,51 +1108,98 @@ if (NOT FAT_RUNTIME) endif() else (FAT_RUNTIME) - set(BUILD_WRAPPER "${PROJECT_SOURCE_DIR}/cmake/build_wrapper.sh") - add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS}) - set_target_properties(hs_exec_core2 PROPERTIES - COMPILE_FLAGS "-march=core2" - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - - add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS}) - set_target_properties(hs_exec_corei7 PROPERTIES - COMPILE_FLAGS "-march=corei7" - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" - ) - add_library(hs_exec_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) - set_target_properties(hs_exec_avx2 PROPERTIES - COMPILE_FLAGS "-march=core-avx2" - RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 
${CMAKE_MODULE_PATH}/keep.syms.in" - ) - - add_library(hs_exec_common OBJECT - ${hs_exec_common_SRCS} - src/dispatcher.c - ) + set(BUILD_WRAPPER "${PROJECT_SOURCE_DIR}/cmake/build_wrapper.sh") + if (NOT BUILD_AVX512) + set (DISPATCHER_DEFINE "-DDISABLE_AVX512_DISPATCH") + endif (NOT BUILD_AVX512) set_source_files_properties(src/dispatcher.c PROPERTIES - COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function") + COMPILE_FLAGS "-Wno-unused-parameter -Wno-unused-function ${DISPATCHER_DEFINE}") + + if (BUILD_STATIC_LIBS) + add_library(hs_exec_core2 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_core2 PROPERTIES + COMPILE_FLAGS "-march=core2" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + + add_library(hs_exec_corei7 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_corei7 PROPERTIES + COMPILE_FLAGS "-march=corei7" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + + add_library(hs_exec_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_avx2 PROPERTIES + COMPILE_FLAGS "-march=core-avx2" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + if (BUILD_AVX512) + add_library(hs_exec_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_LIBS $) + set_target_properties(hs_exec_avx512 PROPERTIES + COMPILE_FLAGS "${SKYLAKE_FLAG}" + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512) + + add_library(hs_exec_common OBJECT + ${hs_exec_common_SRCS} + src/dispatcher.c + ) + + # hs_version.c is added explicitly to avoid some build systems that refuse to + # create a lib without any src (I'm looking at you Xcode) + + add_library(hs_runtime STATIC src/hs_version.c + $ + ${RUNTIME_LIBS}) + set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) + + # we want the static lib for testing + add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c + ${hs_SRCS} + $ + ${RUNTIME_LIBS}) + + endif (BUILD_STATIC_LIBS) if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) + # build shared libs add_library(hs_exec_shared_core2 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_core2 PROPERTIES COMPILE_FLAGS "-march=core2" POSITION_INDEPENDENT_CODE TRUE RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} core2 ${CMAKE_MODULE_PATH}/keep.syms.in" ) add_library(hs_exec_shared_corei7 OBJECT ${hs_exec_SRCS}) + list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_corei7 PROPERTIES COMPILE_FLAGS "-march=corei7" POSITION_INDEPENDENT_CODE TRUE RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} corei7 ${CMAKE_MODULE_PATH}/keep.syms.in" ) add_library(hs_exec_shared_avx2 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_SHLIBS $) set_target_properties(hs_exec_shared_avx2 PROPERTIES COMPILE_FLAGS "-march=core-avx2" POSITION_INDEPENDENT_CODE TRUE RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx2 ${CMAKE_MODULE_PATH}/keep.syms.in" ) + + if (BUILD_AVX512) + add_library(hs_exec_shared_avx512 OBJECT ${hs_exec_SRCS} ${hs_exec_avx2_SRCS}) + list(APPEND RUNTIME_SHLIBS $) + set_target_properties(hs_exec_shared_avx512 PROPERTIES + COMPILE_FLAGS "${SKYLAKE_FLAG}" + POSITION_INDEPENDENT_CODE TRUE + RULE_LAUNCH_COMPILE "${BUILD_WRAPPER} avx512 ${CMAKE_MODULE_PATH}/keep.syms.in" + ) + endif (BUILD_AVX512) add_library(hs_exec_common_shared OBJECT ${hs_exec_common_SRCS} src/dispatcher.c @@ 
-1140,31 +1210,21 @@ else (FAT_RUNTIME) endif() # SHARED -# hs_version.c is added explicitly to avoid some build systems that refuse to -# create a lib without any src (I'm looking at you Xcode) - - add_library(hs_runtime STATIC src/hs_version.c - $ $ - $ $) endif (NOT FAT_RUNTIME) - -set_target_properties(hs_runtime PROPERTIES LINKER_LANGUAGE C) if (NOT BUILD_SHARED_LIBS) - install(TARGETS hs_runtime DESTINATION lib) + install(TARGETS hs_runtime DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) if (NOT FAT_RUNTIME) - add_library(hs_runtime_shared SHARED src/hs_version.c src/hs_valid_platform.c -$) - else() + add_library(hs_runtime_shared SHARED src/hs_version.c + src/hs_valid_platform.c $) + else() add_library(hs_runtime_shared SHARED src/hs_version.c src/hs_valid_platform.c $ - $ - $ - $) + ${RUNTIME_SHLIBS}) endif() set_target_properties(hs_runtime_shared PROPERTIES VERSION ${LIB_VERSION} @@ -1173,24 +1233,17 @@ $) MACOSX_RPATH ON LINKER_LANGUAGE C) install(TARGETS hs_runtime_shared - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib - LIBRARY DESTINATION lib) + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() -if (NOT FAT_RUNTIME) - add_library(hs STATIC ${hs_SRCS} src/hs_valid_platform.c $) -else() - # we want the static lib for testing - add_library(hs STATIC src/hs_version.c src/hs_valid_platform.c - ${hs_SRCS} $ $ - $ $) -endif() - -add_dependencies(hs ragel_Parser) +if (BUILD_STATIC_LIBS) + add_dependencies(hs ragel_Parser) +endif () if (NOT BUILD_SHARED_LIBS) -install(TARGETS hs DESTINATION lib) + install(TARGETS hs DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) @@ -1200,9 +1253,7 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) else() add_library(hs_shared SHARED src/hs_version.c src/hs_valid_platform.c ${hs_SRCS} $ - $ - $ - $) + ${RUNTIME_SHLIBS}) endif() add_dependencies(hs_shared ragel_Parser) @@ -1212,11 +1263,18 @@ if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) SOVERSION ${LIB_SOVERSION} MACOSX_RPATH ON) install(TARGETS hs_shared - RUNTIME DESTINATION bin - ARCHIVE DESTINATION lib - LIBRARY DESTINATION lib) + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() +# used by tools and other targets +if (NOT BUILD_STATIC_LIBS) + # use shared lib without having to change all the targets + add_library(hs ALIAS hs_shared) +endif () + + if(NOT WIN32) add_subdirectory(examples) endif() diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e98fbf227..0519b2e5a 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -10,8 +10,24 @@ else () message (FATAL_ERROR "No intrinsics header found") endif () +if (BUILD_AVX512) + CHECK_C_COMPILER_FLAG(${SKYLAKE_FLAG} HAS_ARCH_SKYLAKE) + if (NOT HAS_ARCH_SKYLAKE) + message (FATAL_ERROR "AVX512 not supported by compiler") + endif () +endif () -set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}") +if (FAT_RUNTIME) + # test the highest level microarch to make sure everything works + if (BUILD_AVX512) + set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${SKYLAKE_FLAG}") + else () + set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} -march=core-avx2") + endif () +else (NOT FAT_RUNTIME) + # if not fat runtime, then test given cflags + set (CMAKE_REQUIRED_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_C_FLAGS} ${ARCH_C_FLAGS}") +endif () 
# ensure we have the minimum of SSSE3 - call a SSSE3 intrinsic
CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
@@ -31,5 +47,38 @@ int main(){
     (void)_mm256_xor_si256(z, z);
 }" HAVE_AVX2)
 
+# and now for AVX512
+CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+#if !defined(__AVX512BW__)
+#error no avx512bw
+#endif
+
+int main(){
+    __m512i z = _mm512_setzero_si512();
+    (void)_mm512_abs_epi8(z);
+}" HAVE_AVX512)
+
+if (FAT_RUNTIME)
+    if (NOT HAVE_SSSE3)
+        message(FATAL_ERROR "SSSE3 support required to build fat runtime")
+    endif ()
+    if (NOT HAVE_AVX2)
+        message(FATAL_ERROR "AVX2 support required to build fat runtime")
+    endif ()
+    if (BUILD_AVX512 AND NOT HAVE_AVX512)
+        message(FATAL_ERROR "AVX512 support requested but not supported")
+    endif ()
+else (NOT FAT_RUNTIME)
+    if (NOT HAVE_AVX2)
+        message(STATUS "Building without AVX2 support")
+    endif ()
+    if (NOT HAVE_AVX512)
+        message(STATUS "Building without AVX512 support")
+    endif ()
+    if (NOT HAVE_SSSE3)
+        message(FATAL_ERROR "A minimum of SSSE3 compiler support is required")
+    endif ()
+endif ()
+
 unset (CMAKE_REQUIRED_FLAGS)
 unset (INTRIN_INC_H)
diff --git a/cmake/backtrace.cmake b/cmake/backtrace.cmake
index b8ad79f63..5a446e894 100644
--- a/cmake/backtrace.cmake
+++ b/cmake/backtrace.cmake
@@ -45,10 +45,12 @@ if(HAVE_BACKTRACE)
     if(HAS_RDYNAMIC)
         list(INSERT BACKTRACE_LDFLAGS 0 -rdynamic)
     endif()
-    # cmake scope fun
-    set(HAVE_BACKTRACE ${HAVE_BACKTRACE} PARENT_SCOPE)
 else()
     set(BACKTRACE_CFLAGS "")
     set(BACKTRACE_LDFLAGS "")
 endif()
+# cmake scope fun
+set(HAVE_BACKTRACE ${HAVE_BACKTRACE} CACHE BOOL INTERNAL)
+set(BACKTRACE_CFLAGS ${BACKTRACE_CFLAGS} CACHE STRING INTERNAL)
+set(BACKTRACE_LDFLAGS ${BACKTRACE_LDFLAGS} CACHE STRING INTERNAL)
diff --git a/cmake/boost.cmake b/cmake/boost.cmake
index 3d513deb6..44b4e8ba6 100644
--- a/cmake/boost.cmake
+++ b/cmake/boost.cmake
@@ -1,3 +1,31 @@
+# Various checks related to Boost
+
+set(BOOST_USE_STATIC_LIBS OFF)
+set(BOOST_USE_MULTITHREADED OFF)
+set(BOOST_USE_STATIC_RUNTIME OFF)
+if (HAVE_LIBCPP)
+    # we need a more recent boost for libc++
+    set(BOOST_MINVERSION 1.61.0)
+else ()
+    set(BOOST_MINVERSION 1.57.0)
+endif ()
+set(BOOST_NO_BOOST_CMAKE ON)
+
+unset(Boost_INCLUDE_DIR CACHE)
+# we might have boost in tree, so provide a hint and try again
+set(BOOST_INCLUDEDIR "${PROJECT_SOURCE_DIR}/include")
+find_package(Boost ${BOOST_MINVERSION} QUIET)
+if(NOT Boost_FOUND)
+    # otherwise check for Boost installed on the system
+    unset(BOOST_INCLUDEDIR)
+    find_package(Boost ${BOOST_MINVERSION} QUIET)
+    if(NOT Boost_FOUND)
+        message(FATAL_ERROR "Boost ${BOOST_MINVERSION} or later not found. 
Either install system packages if available, extract Boost headers to ${CMAKE_SOURCE_DIR}/include, or set the CMake BOOST_ROOT variable.") + endif() +endif() + +message(STATUS "Boost version: ${Boost_MAJOR_VERSION}.${Boost_MINOR_VERSION}.${Boost_SUBMINOR_VERSION}") + # Boost 1.62 has a bug that we've patched around, check if it is required if (Boost_VERSION EQUAL 106200) set (CMAKE_REQUIRED_INCLUDES ${BOOST_INCLUDEDIR} "${PROJECT_SOURCE_DIR}/include") @@ -38,4 +66,7 @@ ${BOOST_REV_TEST}" BOOST_REVGRAPH_PATCH) endif() unset (CMAKE_REQUIRED_INCLUDES) +else () + unset(BOOST_REVGRAPH_OK CACHE) + unset(BOOST_REVGRAPH_PATCH CACHE) endif () # Boost 1.62.0 diff --git a/cmake/build_wrapper.sh b/cmake/build_wrapper.sh index 5baf209b3..70392229c 100755 --- a/cmake/build_wrapper.sh +++ b/cmake/build_wrapper.sh @@ -1,27 +1,28 @@ #!/bin/sh -e # This is used for renaming symbols for the fat runtime, don't call directly # TODO: make this a lot less fragile! +cleanup () { + rm -f ${SYMSFILE} ${KEEPSYMS} +} + PREFIX=$1 KEEPSYMS_IN=$2 shift 2 -BUILD=$@ -OUT=$(echo $BUILD | sed 's/.* -o \(.*\.o\).*/\1/') -SYMSFILE=/tmp/${PREFIX}_rename.syms.$$ -KEEPSYMS=/tmp/keep.syms.$$ -# grab the command without the target obj or src file flags -# we don't just call gcc directly as there may be flags modifying the arch -CC_CMD=$(echo $BUILD | sed 's/ -o .*\.o//;s/ -c //;s/ .[^ ]*\.c//;') -# find me a libc -LIBC_SO=$(${CC_CMD} --print-file-name=libc.so.6) +# $@ contains the actual build command +OUT=$(echo "$@" | sed 's/.* -o \(.*\.o\).*/\1/') +trap cleanup INT QUIT EXIT +SYMSFILE=$(mktemp --tmpdir ${PREFIX}_rename.syms.XXXXX) +KEEPSYMS=$(mktemp --tmpdir keep.syms.XXXXX) +# find the libc used by gcc +LIBC_SO=$("$@" --print-file-name=libc.so.6) cp ${KEEPSYMS_IN} ${KEEPSYMS} # get all symbols from libc and turn them into patterns nm -f p -g -D ${LIBC_SO} | sed -s 's/\([^ ]*\).*/^\1$/' >> ${KEEPSYMS} # build the object -${BUILD} +"$@" # rename the symbols in the object nm -f p -g ${OUT} | cut -f1 -d' ' | grep -v -f ${KEEPSYMS} | sed -e "s/\(.*\)/\1\ ${PREFIX}_\1/" >> ${SYMSFILE} if test -s ${SYMSFILE} then objcopy --redefine-syms=${SYMSFILE} ${OUT} fi -rm -f ${SYMSFILE} ${KEEPSYMS} diff --git a/cmake/config.h.in b/cmake/config.h.in index c7b577c22..9c250b4c7 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -1,5 +1,8 @@ /* used by cmake */ +#ifndef CONFIG_H_ +#define CONFIG_H_ + /* "Define if the build is 32 bit" */ #cmakedefine ARCH_32_BIT @@ -43,6 +46,8 @@ 0 if you don't. */ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP +#cmakedefine HAVE_PTHREAD_NP_H + /* Define to 1 if you have the `malloc_info' function. */ #cmakedefine HAVE_MALLOC_INFO @@ -76,6 +81,9 @@ /* Define to 1 if you have the `_aligned_malloc' function. 
*/
 #cmakedefine HAVE__ALIGNED_MALLOC
 
+/* Define if compiler has __builtin_constant_p */
+#cmakedefine HAVE__BUILTIN_CONSTANT_P
+
 /* Optimize, inline critical functions */
 #cmakedefine HS_OPTIMIZE
 
@@ -91,3 +99,5 @@
 
 /* define if reverse_graph requires patch for boost 1.62.0 */
 #cmakedefine BOOST_REVGRAPH_PATCH
+
+#endif /* CONFIG_H_ */
diff --git a/cmake/sqlite3.cmake b/cmake/sqlite3.cmake
index c07f1161e..cbe17c6d4 100644
--- a/cmake/sqlite3.cmake
+++ b/cmake/sqlite3.cmake
@@ -22,7 +22,7 @@ if (NOT SQLITE3_FOUND)
         set(SQLITE3_INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/sqlite3")
         set(SQLITE3_LDFLAGS sqlite3_static)
     else()
-        message(FATAL_ERROR " no sqlite3 in source tree")
+        message(STATUS " no sqlite3 in source tree")
     endif()
 endif()
diff --git a/doc/dev-reference/compilation.rst b/doc/dev-reference/compilation.rst
index 8f44c15c1..02b5c3f38 100644
--- a/doc/dev-reference/compilation.rst
+++ b/doc/dev-reference/compilation.rst
@@ -64,7 +64,7 @@ libpcre are supported. The use of unsupported constructs will result in
 compilation errors.
 
 The version of PCRE used to validate Hyperscan's interpretation of this syntax
-is 8.38.
+is 8.40.
 
 ====================
 Supported Constructs
@@ -171,6 +171,8 @@ The following regex constructs are not supported by Hyperscan:
 * Callouts and embedded code.
 * Atomic grouping and possessive quantifiers.
 
+.. _semantics:
+
 *********
 Semantics
 *********
@@ -284,16 +286,24 @@ which provides the following fields:
   expression should match successfully.
 * ``min_length``: The minimum match length (from start to end) required to
   successfully match this expression.
+* ``edit_distance``: Match this expression within a given Levenshtein distance.
 
-These parameters allow the set of matches produced by a pattern to be
-constrained at compile time, rather than relying on the application to process
-unwanted matches at runtime.
+These parameters either allow the set of matches produced by a pattern to be
+constrained at compile time (rather than relying on the application to process
+unwanted matches at runtime), or allow matching a pattern approximately (within
+a given edit distance) to produce more matches.
 
 For example, the pattern :regexp:`/foo.*bar/` when given a ``min_offset`` of
 10 and a ``max_offset`` of 15 will not produce matches when scanned against
 ``foobar`` or ``foo0123456789bar`` but will produce a match against the data
 streams ``foo0123bar`` or ``foo0123456bar``.
 
+Similarly, the pattern :regexp:`/foobar/` when given an ``edit_distance`` of 2
+will produce matches when scanned against ``foobar``, ``fooba``, ``fobr``,
+``fo_baz``, ``foooobar``, and anything else that lies within edit distance of 2
+(as defined by Levenshtein distance). For more details, see the
+:ref:`approximate_matching` section.
+
 =================
 Prefiltering Mode
 =================
@@ -375,3 +385,74 @@ An :c:type:`hs_platform_info_t` structure targeted at the current host can be
 built with the :c:func:`hs_populate_platform` function.
 
 See :ref:`api_constants` for the full list of CPU tuning and feature flags.
+
+.. _approximate_matching:
+
+********************
+Approximate matching
+********************
+
+Hyperscan provides an experimental approximate matching mode, which will match
+patterns within a given edit distance. The precise matching behavior is defined
+as follows:
+
+#. **Edit distance** is defined as Levenshtein distance. That is, there are
+   three possible edit types considered: insertion, removal and substitution.
+   A more formal description can be found on
+   `Wikipedia <https://en.wikipedia.org/wiki/Levenshtein_distance>`_.
+
+#. 
**Approximate matching** will match all *corpora* within a given edit
+   distance. That is, given a pattern, approximate matching will match anything
+   that can be edited to arrive at a corpus that exactly matches the original
+   pattern.
+
+#. **Matching semantics** are exactly the same as described in :ref:`semantics`.
+
+Here are a few examples of approximate matching:
+
+* Pattern :regexp:`/foo/` can match ``foo`` when using regular Hyperscan
+  matching behavior. With approximate matching within edit distance 2, the
+  pattern will produce matches when scanned against ``foo``, ``foooo``, ``f00``,
+  ``f``, and anything else that lies within edit distance 2 of matching corpora
+  for the original pattern (``foo`` in this case).
+
+* Pattern :regexp:`/foo(bar)+/` with edit distance 1 will match ``foobarbar``,
+  ``foobarb0r``, ``fooarbar``, ``foobarba``, ``f0obarbar``, ``fobarbar`` and
+  anything else that lies within edit distance 1 of matching corpora for the
+  original pattern (``foobarbar`` in this case).
+
+* Pattern :regexp:`/foob?ar/` with edit distance 2 will match ``fooar``,
+  ``foo``, ``fabar``, ``oar`` and anything else that lies within edit distance 2
+  of matching corpora for the original pattern (``fooar`` in this case).
+
+Currently, there are trade-offs and limitations that come with approximate
+matching support. Here they are, in a nutshell:
+
+* Reduced pattern support:
+
+  * For many patterns, approximate matching is complex and can result in
+    Hyperscan failing to compile a pattern with a "Pattern too large" error,
+    even if the pattern is supported in normal operation.
+  * Additionally, some patterns cannot be approximately matched because they
+    reduce to so-called "vacuous" patterns (patterns that match everything). For
+    example, pattern :regexp:`/foo/` with edit distance 3, if implemented,
+    would reduce to matching zero-length buffers. Such patterns will result in a
+    "Pattern cannot be approximately matched" compile error.
+  * Finally, due to the inherent complexities of defining matching behavior,
+    approximate matching implements a reduced subset of regular expression
+    syntax. Approximate matching does not support UTF-8 (and other
+    multibyte character encodings), or word boundaries (that is, ``\b``, ``\B``
+    and other equivalent constructs). Patterns containing unsupported constructs
+    will result in a "Pattern cannot be approximately matched" compile error.
+  * When using approximate matching in conjunction with SOM, all of the
+    restrictions of SOM also apply. See :ref:`som` for more
+    details.
+* Increased stream state/byte code size requirements: due to approximate
+  matching byte code being inherently larger and more complex than exact
+  matching, the corresponding requirements also increase.
+* Performance overhead: similarly, there is generally a performance cost
+  associated with approximate matching, both due to increased matching
+  complexity, and due to the fact that it will produce more matches.
+
+Approximate matching is always disabled by default, and can be enabled on a
+per-pattern basis by using an extended parameter described in :ref:`extparam`.
diff --git a/doc/dev-reference/conf.py.in b/doc/dev-reference/conf.py.in
index 9f089883e..2daab3696 100644
--- a/doc/dev-reference/conf.py.in
+++ b/doc/dev-reference/conf.py.in
@@ -44,7 +44,7 @@ master_doc = 'index'
 
 # General information about the project.
project = u'Hyperscan'
-copyright = u'2015-2016, Intel Corporation'
+copyright = u'2015-2017, Intel Corporation'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
diff --git a/doc/dev-reference/copyright.rst b/doc/dev-reference/copyright.rst
index 737b160f5..9464382e6 100644
--- a/doc/dev-reference/copyright.rst
+++ b/doc/dev-reference/copyright.rst
@@ -30,4 +30,4 @@ and/or other countries.
 
 \*Other names and brands may be claimed as the property of others.
 
-Copyright |copy| 2015-2016, Intel Corporation. All rights reserved.
+Copyright |copy| 2015-2017, Intel Corporation. All rights reserved.
diff --git a/doc/dev-reference/getting_started.rst b/doc/dev-reference/getting_started.rst
index 1794f3e9a..1d44705b4 100644
--- a/doc/dev-reference/getting_started.rst
+++ b/doc/dev-reference/getting_started.rst
@@ -254,18 +254,32 @@ the current platform is supported by Hyperscan. As of this release, the
 variants of the runtime that are built, and the CPU capability that is
 required, are the following:
 
-+----------+-------------------------------+---------------------+
-| Variant  | CPU Feature Flag(s) Required  | gcc arch flag       |
-+==========+===============================+=====================+
-| Core 2   | ``SSSE3``                     | ``-march=core2``    |
-+----------+-------------------------------+---------------------+
-| Core i7  | ``SSE4_2`` and ``POPCNT``     | ``-march=corei7``   |
-+----------+-------------------------------+---------------------+
-| AVX 2    | ``AVX2``                      | ``-march=avx2``     |
-+----------+-------------------------------+---------------------+
-
-As this requires compiler, libc, and binutils support, at this time the fat
-runtime will only be enabled for Linux builds where the compiler supports the
++----------+-------------------------------+---------------------------+
+| Variant  | CPU Feature Flag(s) Required  | gcc arch flag             |
++==========+===============================+===========================+
+| Core 2   | ``SSSE3``                     | ``-march=core2``          |
++----------+-------------------------------+---------------------------+
+| Core i7  | ``SSE4_2`` and ``POPCNT``     | ``-march=corei7``         |
++----------+-------------------------------+---------------------------+
+| AVX 2    | ``AVX2``                      | ``-march=core-avx2``      |
++----------+-------------------------------+---------------------------+
+| AVX 512  | ``AVX512BW`` (see note below) | ``-march=skylake-avx512`` |
++----------+-------------------------------+---------------------------+
+
+.. note::
+
+   Hyperscan v4.5 adds support for AVX-512 instructions - in particular the
+   ``AVX-512BW`` instruction set that was introduced on Intel "Skylake" Xeon
+   processors - however the AVX-512 runtime variant is **not** enabled by
+   default in fat runtime builds as not all toolchains support AVX-512
+   instruction sets. To build an AVX-512 runtime, the CMake variable
+   ``BUILD_AVX512`` must be enabled manually during configuration. For
+   example: ::
+
+       cmake -DBUILD_AVX512=on <...>
+
+As the fat runtime requires compiler, libc, and binutils support, at this time
+it will only be enabled for Linux builds where the compiler supports the
 `indirect function "ifunc" function attribute `_.
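The table and note above describe CPU requirements that the fat runtime resolves automatically at dispatch time. An application can also verify host support itself before compiling any database; a minimal sketch using the public `hs_valid_platform()` call (the same check backing the `hs_valid_platform.c` source added to the library targets in the build changes above):

```c
#include <stdio.h>
#include <hs/hs.h>

int main(void) {
    /* HS_SUCCESS means this CPU provides the minimum (SSSE3) support the
     * library requires; any other value means scanning would fail here. */
    if (hs_valid_platform() != HS_SUCCESS) {
        fprintf(stderr, "this host's CPU is not supported by Hyperscan\n");
        return 1;
    }
    printf("Hyperscan is supported on this host\n");
    return 0;
}
```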
diff --git a/doc/dev-reference/index.rst b/doc/dev-reference/index.rst index df4f89161..32f188dd4 100644 --- a/doc/dev-reference/index.rst +++ b/doc/dev-reference/index.rst @@ -17,5 +17,6 @@ Hyperscan |version| Developer's Reference Guide runtime serialization performance + tools api_constants api_files diff --git a/doc/dev-reference/intro.rst b/doc/dev-reference/intro.rst index 5f0cc113d..58879aef1 100644 --- a/doc/dev-reference/intro.rst +++ b/doc/dev-reference/intro.rst @@ -70,6 +70,13 @@ For a given database, Hyperscan provides several guarantees: See :ref:`runtime` for more detail. +***** +Tools +***** + +Some utilities for testing and benchmarking Hyperscan are included with the +library. See :ref:`tools` for more information. + ************ Example Code ************ diff --git a/doc/dev-reference/performance.rst b/doc/dev-reference/performance.rst index 8cc0b6758..23781bd61 100644 --- a/doc/dev-reference/performance.rst +++ b/doc/dev-reference/performance.rst @@ -333,3 +333,13 @@ Similarly, the :c:member:`hs_expr_ext::min_length` extended parameter can be used to specify a lower bound on the length of the matches for a pattern. Using this facility may be more lightweight in some circumstances than using the SOM flag and post-confirming match length in the calling application. + +******************** +Approximate matching +******************** + +.. tip:: Approximate matching is an experimental feature. + +There is generally a performance impact associated with approximate matching due +to the reduced specificity of the matches. This impact may vary significantly +depending on the pattern and edit distance. diff --git a/doc/dev-reference/tools.rst b/doc/dev-reference/tools.rst new file mode 100644 index 000000000..d2e7a06e0 --- /dev/null +++ b/doc/dev-reference/tools.rst @@ -0,0 +1,116 @@ +.. _tools: + +##### +Tools +##### + +This section describes the set of utilities included with the Hyperscan library. + +******************** +Benchmarker: hsbench +******************** + +The ``hsbench`` tool provides an easy way to measure Hyperscan's performance +for a particular set of patterns and corpus of data to be scanned. + +Patterns are supplied in the format described below in +:ref:`tools_pattern_format`, while the corpus must be provided in the form of a +`corpus database`: this is a simple SQLite database format intended to allow for +easy control of how a corpus is broken into blocks and streams. + +.. note:: A group of Python scripts for constructing corpora databases from + various input types, such as PCAP network traffic captures or text files, can + be found in the Hyperscan source tree in ``tools/hsbench/scripts``. 
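``hsbench`` is the supported way to produce the throughput figures described in this section, but the quantities it reports (bytes scanned, repeat count, elapsed time) can be illustrated with a toy harness against the public scanning API. A rough single-pattern sketch, not a substitute for ``hsbench``; the pattern, the synthetic zero-filled corpus, and the POSIX `clock_gettime()` timer are illustrative assumptions:

```c
#include <stdio.h>
#include <time.h>
#include <hs/hs.h>

/* Count matches; the benchmark itself only cares about elapsed time. */
static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx) {
    ++*(unsigned long long *)ctx;
    return 0;
}

int main(void) {
    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;
    if (hs_compile("hatstand.*teakettle", HS_FLAG_DOTALL, HS_MODE_BLOCK,
                   NULL, &db, &err) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", err->message);
        hs_free_compile_error(err);
        return 1;
    }
    hs_scratch_t *scratch = NULL;
    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) {
        hs_free_database(db);
        return 1;
    }

    static char corpus[1 << 20]; /* 1 MB of zero bytes as a stand-in corpus */
    const int repeats = 20;      /* mirrors hsbench's default repeat count */
    unsigned long long matches = 0;

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < repeats; i++) {
        hs_scan(db, corpus, sizeof(corpus), 0, scratch, on_match, &matches);
    }
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    double mbit = (double)sizeof(corpus) * repeats * 8 / secs / 1e6;
    printf("%llu matches, %.2f Mbit/sec\n", matches, mbit);

    hs_free_scratch(scratch);
    hs_free_database(db);
    return 0;
}
```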
+
+Running hsbench
+===============
+
+Given a file full of patterns specified with ``-e`` and a corpus database
+specified with ``-c``, ``hsbench`` will perform a single-threaded benchmark and
+produce output like this::
+
+    $ hsbench -e /tmp/patterns -c /tmp/corpus.db
+
+    Signatures:        /tmp/patterns
+    Hyperscan info:    Version: 4.3.1 Features: AVX2 Mode: STREAM
+    Expression count:  200
+    Bytecode size:     342,540 bytes
+    Database CRC:      0x6cd6b67c
+    Stream state size: 252 bytes
+    Scratch size:      18,406 bytes
+    Compile time:      0.153 seconds
+    Peak heap usage:   78,073,856 bytes
+
+    Time spent scanning:  0.600 seconds
+    Corpus size:          72,138,183 bytes (63,946 blocks in 8,891 streams)
+    Scan matches:         81 (0.001 matches/kilobyte)
+    Overall block rate:   2,132,004.45 blocks/sec
+    Overall throughput:   19,241.10 Mbit/sec
+
+By default, the corpus is scanned twenty times, and the overall performance
+reported is computed based on the total number of bytes scanned in the time it
+takes to perform all twenty scans. The number of repeats can be changed with the
+``-n`` argument, and the results of each scan will be displayed if the
+``--per-scan`` argument is specified.
+
+To benchmark Hyperscan on more than one core, you can supply a list of cores
+with the ``-T`` argument, which will instruct ``hsbench`` to start one
+benchmark thread per core given and compute the throughput from the time taken
+to complete all of them.
+
+.. tip:: For single-threaded benchmarks on multi-processor systems, we recommend
+   using a utility like ``taskset`` to lock the hsbench process to one core and
+   minimize jitter due to the operating system's scheduler.
+
+.. _tools_pattern_format:
+
+**************
+Pattern Format
+**************
+
+All of the Hyperscan tools accept patterns in the same format, read from plain
+text files with one pattern per line. Each line looks like this:
+
+* ``<id>:/<regex>/<flags>``
+
+For example::
+
+  1:/hatstand.*teakettle/s
+  2:/(hatstand|teakettle)/iH
+  3:/^.{10,20}hatstand/m
+
+The integer ID is the value that will be reported when a match is found by
+Hyperscan and must be unique.
+
+The pattern itself is a regular expression in PCRE syntax; see
+:ref:`compilation` for more information on supported features.
+
+The flags are single characters that map to Hyperscan flags as follows:
+
+========= ================================= ===========
+Character API Flag                          Description
+========= ================================= ===========
+``i``     :c:member:`HS_FLAG_CASELESS`      Case-insensitive matching
+``s``     :c:member:`HS_FLAG_DOTALL`        Dot (``.``) will match newlines
+``m``     :c:member:`HS_FLAG_MULTILINE`     Multi-line anchoring
+``H``     :c:member:`HS_FLAG_SINGLEMATCH`   Report match ID at most once
+``V``     :c:member:`HS_FLAG_ALLOWEMPTY`    Allow patterns that can match against empty buffers
+``8``     :c:member:`HS_FLAG_UTF8`          UTF-8 mode
+``W``     :c:member:`HS_FLAG_UCP`           Unicode property support
+``P``     :c:member:`HS_FLAG_PREFILTER`     Prefiltering mode
+``L``     :c:member:`HS_FLAG_SOM_LEFTMOST`  Leftmost start of match reporting
+========= ================================= ===========
+
+In addition to the set of flags above, :ref:`extparam` can be supplied
+for each pattern. These are supplied after the flags as ``key=value`` pairs
+between braces, separated by commas. For example::
+
+  1:/hatstand.*teakettle/s{min_offset=50,max_offset=100}
+
+All Hyperscan tools will accept a pattern file (or a directory containing
+pattern files) with the ``-e`` argument. If no further arguments constraining
+the pattern set are given, all patterns in those files are used.
+ +To select a subset of the patterns, a single ID can be supplied with the ``-z`` +argument, or a file containing a set of IDs can be supplied with the ``-s`` +argument. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b52bbdfaf..c252c9ace 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -22,3 +22,6 @@ set_source_files_properties(patbench.cc PROPERTIES COMPILE_FLAGS "-Wall -Wno-unused-parameter") target_link_libraries(patbench hs pcap) endif() + +install(FILES simplegrep.c pcapscan.cc patbench.cc README.md + DESTINATION ${CMAKE_INSTALL_DOCDIR}/examples) diff --git a/src/alloc.c b/src/alloc.c index aa7638e77..e27649bce 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -67,7 +67,7 @@ hs_free_t normalise_free(hs_free_t f) { } HS_PUBLIC_API -hs_error_t hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { +hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { hs_set_database_allocator(allocfunc, freefunc); hs_set_misc_allocator(allocfunc, freefunc); hs_set_stream_allocator(allocfunc, freefunc); @@ -77,7 +77,8 @@ hs_error_t hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { } HS_PUBLIC_API -hs_error_t hs_set_database_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { +hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { hs_database_alloc = normalise_alloc(allocfunc); hs_database_free = normalise_free(freefunc); @@ -85,7 +86,8 @@ hs_error_t hs_set_database_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { } HS_PUBLIC_API -hs_error_t hs_set_misc_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { +hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { hs_misc_alloc = normalise_alloc(allocfunc); hs_misc_free = normalise_free(freefunc); @@ -93,7 +95,8 @@ hs_error_t hs_set_misc_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { } HS_PUBLIC_API -hs_error_t hs_set_scratch_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { +hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { hs_scratch_alloc = normalise_alloc(allocfunc); hs_scratch_free = normalise_free(freefunc); @@ -101,7 +104,8 @@ hs_error_t hs_set_scratch_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { } HS_PUBLIC_API -hs_error_t hs_set_stream_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { +hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { hs_stream_alloc = normalise_alloc(allocfunc); hs_stream_free = normalise_free(freefunc); diff --git a/src/compiler/asserts.cpp b/src/compiler/asserts.cpp index be836b06d..444422260 100644 --- a/src/compiler/asserts.cpp +++ b/src/compiler/asserts.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,8 @@ * word-to-word and word-to-nonword) are dropped. 
*/ #include "asserts.h" + +#include "compiler/compiler.h" #include "nfagraph/ng.h" #include "nfagraph/ng_prune.h" #include "nfagraph/ng_redundancy.h" @@ -115,8 +117,8 @@ u32 conjunct(u32 flags1, u32 flags2) { typedef map, NFAEdge> edge_cache_t; static -void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache, - u32 &assert_edge_count) { +void replaceAssertVertex(NGHolder &g, NFAVertex t, const ExpressionInfo &expr, + edge_cache_t &edge_cache, u32 &assert_edge_count) { DEBUG_PRINTF("replacing assert vertex %zu\n", g[t].index); const u32 flags = g[t].assert_flags; @@ -178,8 +180,7 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache, edge_cache.emplace(cache_key, e); g[e].assert_flags = flags; if (++assert_edge_count > MAX_ASSERT_EDGES) { - throw CompileError(g.expressionIndex, - "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } } else { NFAEdge e = ecit->second; @@ -200,21 +201,23 @@ void replaceAssertVertex(NGWrapper &g, NFAVertex t, edge_cache_t &edge_cache, } static -void setReportId(ReportManager &rm, NGWrapper &g, NFAVertex v, s32 adj) { +void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr, + NFAVertex v, s32 adj) { // Don't try and set the report ID of a special vertex. assert(!is_special(v, g)); // There should be no reports set already. assert(g[v].reports.empty()); - Report r = rm.getBasicInternalReport(g, adj); + Report r = rm.getBasicInternalReport(expr, adj); g[v].reports.insert(rm.getInternalId(r)); DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj); } static -void checkForMultilineStart(ReportManager &rm, NGWrapper &g) { +void checkForMultilineStart(ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr) { vector dead; for (auto v : adjacent_vertices_range(g.start, g)) { if (!(g[v].assert_flags & POS_FLAG_MULTILINE_START)) { @@ -238,7 +241,7 @@ void checkForMultilineStart(ReportManager &rm, NGWrapper &g) { for (const auto &e : dead) { NFAVertex dummy = add_vertex(g); g[dummy].char_reach.setall(); - setReportId(rm, g, dummy, -1); + setReportId(rm, g, expr, dummy, -1); add_edge(source(e, g), dummy, g[e], g); add_edge(dummy, g.accept, g); } @@ -263,7 +266,8 @@ bool hasAssertVertices(const NGHolder &g) { * Remove the horrors that are the temporary assert vertices which arise from * our construction method. Allows the rest of our code base to live in * blissful ignorance of their existence. 
*/ -void removeAssertVertices(ReportManager &rm, NGWrapper &g) { +void removeAssertVertices(ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr) { size_t num = 0; DEBUG_PRINTF("before: graph has %zu vertices\n", num_vertices(g)); @@ -285,12 +289,12 @@ void removeAssertVertices(ReportManager &rm, NGWrapper &g) { for (auto v : vertices_range(g)) { if (g[v].assert_flags & WORDBOUNDARY_FLAGS) { - replaceAssertVertex(g, v, edge_cache, assert_edge_count); + replaceAssertVertex(g, v, expr, edge_cache, assert_edge_count); num++; } } - checkForMultilineStart(rm, g); + checkForMultilineStart(rm, g, expr); if (num) { DEBUG_PRINTF("resolved %zu assert vertices\n", num); diff --git a/src/compiler/asserts.h b/src/compiler/asserts.h index b9ec80c7c..b4d64c6c9 100644 --- a/src/compiler/asserts.h +++ b/src/compiler/asserts.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,8 +35,9 @@ namespace ue2 { +class ExpressionInfo; class ReportManager; -class NGWrapper; +class NGHolder; /** \brief Convert temporary assert vertices (from construction method) to * edge-based flags. @@ -44,7 +45,8 @@ class NGWrapper; * Remove the horrors that are the temporary assert vertices which arise from * our construction method. Allows the rest of our code base to live in * blissful ignorance of their existence. */ -void removeAssertVertices(ReportManager &rm, NGWrapper &g); +void removeAssertVertices(ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr); } // namespace ue2 diff --git a/src/compiler/compiler.cpp b/src/compiler/compiler.cpp index 4a4afc64e..cce89e408 100644 --- a/src/compiler/compiler.cpp +++ b/src/compiler/compiler.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -55,9 +55,8 @@ #include "parser/unsupported.h" #include "parser/utf8_validate.h" #include "rose/rose_build.h" -#include "rose/rose_build_dump.h" #include "som/slot_manager_dump.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/compile_error.h" #include "util/target_info.h" #include "util/verify_types.h" @@ -74,12 +73,12 @@ using namespace std; namespace ue2 { - static void validateExt(const hs_expr_ext &ext) { static const unsigned long long ALL_EXT_FLAGS = HS_EXT_FLAG_MIN_OFFSET | HS_EXT_FLAG_MAX_OFFSET | - HS_EXT_FLAG_MIN_LENGTH; + HS_EXT_FLAG_MIN_LENGTH | + HS_EXT_FLAG_EDIT_DISTANCE; if (ext.flags & ~ALL_EXT_FLAGS) { throw CompileError("Invalid hs_expr_ext flag set."); } @@ -100,25 +99,18 @@ void validateExt(const hs_expr_ext &ext) { } ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, - unsigned flags, ReportID actionId, + unsigned flags, ReportID report, const hs_expr_ext *ext) - : utf8(false), - allow_vacuous(flags & HS_FLAG_ALLOWEMPTY), - highlander(flags & HS_FLAG_SINGLEMATCH), - prefilter(flags & HS_FLAG_PREFILTER), - som(SOM_NONE), - index(index_in), - id(actionId), - min_offset(0), - max_offset(MAX_OFFSET), - min_length(0) { + : expr(index_in, flags & HS_FLAG_ALLOWEMPTY, flags & HS_FLAG_SINGLEMATCH, + false, flags & HS_FLAG_PREFILTER, SOM_NONE, report, 0, MAX_OFFSET, + 0, 0) { ParseMode mode(flags); component = parse(expression, mode); - utf8 = 
mode.utf8; /* utf8 may be set by parse() */ + expr.utf8 = mode.utf8; /* utf8 may be set by parse() */ - if (utf8 && !isValidUtf8(expression)) { + if (expr.utf8 && !isValidUtf8(expression)) { throw ParseError("Expression is not valid UTF-8."); } @@ -146,7 +138,7 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, // Set SOM type. if (flags & HS_FLAG_SOM_LEFTMOST) { - som = SOM_LEFT; + expr.som = SOM_LEFT; } // Set extended parameters, if we have them. @@ -155,26 +147,29 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, validateExt(*ext); if (ext->flags & HS_EXT_FLAG_MIN_OFFSET) { - min_offset = ext->min_offset; + expr.min_offset = ext->min_offset; } if (ext->flags & HS_EXT_FLAG_MAX_OFFSET) { - max_offset = ext->max_offset; + expr.max_offset = ext->max_offset; } if (ext->flags & HS_EXT_FLAG_MIN_LENGTH) { - min_length = ext->min_length; + expr.min_length = ext->min_length; + } + if (ext->flags & HS_EXT_FLAG_EDIT_DISTANCE) { + expr.edit_distance = ext->edit_distance; } } // These are validated in validateExt, so an error will already have been // thrown if these conditions don't hold. - assert(max_offset >= min_offset); - assert(max_offset >= min_length); + assert(expr.max_offset >= expr.min_offset); + assert(expr.max_offset >= expr.min_length); // Since prefiltering and SOM aren't supported together, we must squash any // min_length constraint as well. - if (flags & HS_FLAG_PREFILTER && min_length) { + if (flags & HS_FLAG_PREFILTER && expr.min_length) { DEBUG_PRINTF("prefiltering mode: squashing min_length constraint\n"); - min_length = 0; + expr.min_length = 0; } } @@ -183,25 +178,25 @@ ParsedExpression::ParsedExpression(unsigned index_in, const char *expression, * \brief Dumps the parse tree to screen in debug mode and to disk in dump * mode. */ -void dumpExpression(UNUSED const ParsedExpression &expr, +void dumpExpression(UNUSED const ParsedExpression &pe, UNUSED const char *stage, UNUSED const Grey &grey) { #if defined(DEBUG) - DEBUG_PRINTF("===== Rule ID: %u (internalID: %u) =====\n", expr.id, - expr.index); + DEBUG_PRINTF("===== Rule ID: %u (expression index: %u) =====\n", + pe.expr.report, pe.expr.index); ostringstream debug_tree; - dumpTree(debug_tree, expr.component.get()); + dumpTree(debug_tree, pe.component.get()); printf("%s\n", debug_tree.str().c_str()); #endif // DEBUG #if defined(DUMP_SUPPORT) if (grey.dumpFlags & Grey::DUMP_PARSE) { stringstream ss; - ss << grey.dumpPath << "Expr_" << expr.index << "_componenttree_" + ss << grey.dumpPath << "Expr_" << pe.expr.index << "_componenttree_" << stage << ".txt"; ofstream out(ss.str().c_str()); - out << "Component Tree for " << expr.id << endl; - dumpTree(out, expr.component.get()); - if (expr.utf8) { + out << "Component Tree for " << pe.expr.report << endl; + dumpTree(out, pe.component.get()); + if (pe.expr.utf8) { out << "UTF8 mode" << endl; } } @@ -211,13 +206,13 @@ void dumpExpression(UNUSED const ParsedExpression &expr, /** \brief Run Component tree optimisations on \a expr. 
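The HS_EXT_FLAG_EDIT_DISTANCE plumbing above is driven from the public compile API. A minimal sketch of how a caller requests approximate matching — hs_compile_ext_multi() and the hs_expr_ext fields are the public API; the pattern, distance and ID values here are purely illustrative:

```cpp
#include <cstring>
#include <hs/hs.h>

// Sketch: compile one pattern with an edit-distance extended parameter.
// The edit_distance value travels through ParsedExpression into the new
// ExpressionInfo, as in the constructor above.
int compile_approx(hs_database_t **db) {
    hs_expr_ext_t ext;
    std::memset(&ext, 0, sizeof(ext));
    ext.flags = HS_EXT_FLAG_EDIT_DISTANCE;
    ext.edit_distance = 2; // report matches within edit distance 2

    const char *pattern = "foobar";
    const unsigned flags = 0;
    const unsigned id = 1;
    const hs_expr_ext_t *ext_ptr = &ext;

    hs_compile_error_t *err = nullptr;
    if (hs_compile_ext_multi(&pattern, &flags, &id, &ext_ptr, 1,
                             HS_MODE_BLOCK, nullptr, db, &err) != HS_SUCCESS) {
        hs_free_compile_error(err);
        return -1;
    }
    return 0;
}
```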
*/ static -void optimise(ParsedExpression &expr) { - if (expr.min_length || expr.som) { +void optimise(ParsedExpression &pe) { + if (pe.expr.min_length || pe.expr.som) { return; } DEBUG_PRINTF("optimising\n"); - expr.component->optimise(true /* root is connected to sds */); + pe.component->optimise(true /* root is connected to sds */); } void addExpression(NG &ng, unsigned index, const char *expression, @@ -234,34 +229,34 @@ void addExpression(NG &ng, unsigned index, const char *expression, // Do per-expression processing: errors here will result in an exception // being thrown up to our caller - ParsedExpression expr(index, expression, flags, id, ext); - dumpExpression(expr, "orig", cc.grey); + ParsedExpression pe(index, expression, flags, id, ext); + dumpExpression(pe, "orig", cc.grey); // Apply prefiltering transformations if desired. - if (expr.prefilter) { - prefilterTree(expr.component, ParseMode(flags)); - dumpExpression(expr, "prefiltered", cc.grey); + if (pe.expr.prefilter) { + prefilterTree(pe.component, ParseMode(flags)); + dumpExpression(pe, "prefiltered", cc.grey); } // Expressions containing zero-width assertions and other extended pcre // types aren't supported yet. This call will throw a ParseError exception // if the component tree contains such a construct. - checkUnsupported(*expr.component); + checkUnsupported(*pe.component); - expr.component->checkEmbeddedStartAnchor(true); - expr.component->checkEmbeddedEndAnchor(true); + pe.component->checkEmbeddedStartAnchor(true); + pe.component->checkEmbeddedEndAnchor(true); if (cc.grey.optimiseComponentTree) { - optimise(expr); - dumpExpression(expr, "opt", cc.grey); + optimise(pe); + dumpExpression(pe, "opt", cc.grey); } DEBUG_PRINTF("component=%p, nfaId=%u, reportId=%u\n", - expr.component.get(), expr.index, expr.id); + pe.component.get(), pe.expr.index, pe.expr.report); // You can only use the SOM flags if you've also specified an SOM // precision mode. - if (expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) { + if (pe.expr.som != SOM_NONE && cc.streaming && !ng.ssm.somPrecision()) { throw CompileError("To use a SOM expression flag in streaming mode, " "an SOM precision mode (e.g. " "HS_MODE_SOM_HORIZON_LARGE) must be specified."); @@ -269,32 +264,31 @@ void addExpression(NG &ng, unsigned index, const char *expression, // If this expression is a literal, we can feed it directly to Rose rather // than building the NFA graph. - if (shortcutLiteral(ng, expr)) { + if (shortcutLiteral(ng, pe)) { DEBUG_PRINTF("took literal short cut\n"); return; } - unique_ptr<NGWrapper> g = buildWrapper(ng.rm, cc, expr); - - if (!g) { + auto built_expr = buildGraph(ng.rm, cc, pe); + if (!built_expr.g) { DEBUG_PRINTF("NFA build failed on ID %u, but no exception was " - "thrown.\n", expr.id); + "thrown.\n", pe.expr.report); throw CompileError("Internal error."); } - if (!expr.allow_vacuous && matches_everywhere(*g)) { + if (!pe.expr.allow_vacuous && matches_everywhere(*built_expr.g)) { throw CompileError("Pattern matches empty buffer; use " "HS_FLAG_ALLOWEMPTY to enable support."); } - if (!ng.addGraph(*g)) { - DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", expr.id); + if (!ng.addGraph(built_expr.expr, std::move(built_expr.g))) { + DEBUG_PRINTF("NFA addGraph failed on ID %u.\n", pe.expr.report); throw CompileError("Error compiling expression."); } } static -aligned_unique_ptr<RoseEngine> generateRoseEngine(NG &ng) { +bytecode_ptr<RoseEngine> generateRoseEngine(NG &ng) { const u32 minWidth = ng.minWidth.is_finite() ?
verify_u32(ng.minWidth) : ROSE_BOUND_INF; auto rose = ng.rose->buildRose(minWidth); @@ -305,7 +299,6 @@ aligned_unique_ptr<RoseEngine> generateRoseEngine(NG &ng) { return nullptr; } - dumpRose(*ng.rose, rose.get(), ng.cc.grey); dumpReportManager(ng.rm, ng.cc.grey); dumpSomSlotManager(ng.ssm, ng.cc.grey); dumpSmallWrite(rose.get(), ng.cc.grey); @@ -320,6 +313,9 @@ platform_t target_to_platform(const target_t &target_info) { if (!target_info.has_avx2()) { p |= HS_PLATFORM_NOAVX2; } + if (!target_info.has_avx512()) { + p |= HS_PLATFORM_NOAVX512; + } return p; } @@ -369,7 +365,7 @@ struct hs_database *build(NG &ng, unsigned int *length) { if (!rose) { throw CompileError("Unable to generate bytecode."); } - *length = roseSize(rose.get()); + *length = rose.size(); if (!*length) { DEBUG_PRINTF("RoseEngine has zero length\n"); assert(0); @@ -450,41 +446,42 @@ bool isSupported(const Component &c) { } #endif -unique_ptr<NGWrapper> buildWrapper(ReportManager &rm, const CompileContext &cc, - const ParsedExpression &expr) { - assert(isSupported(*expr.component)); +BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc, + const ParsedExpression &pe) { + assert(isSupported(*pe.component)); - const unique_ptr<NFABuilder> builder = makeNFABuilder(rm, cc, expr); + const auto builder = makeNFABuilder(rm, cc, pe); assert(builder); // Set up START and ACCEPT states; retrieve the special states - const auto bs = makeGlushkovBuildState(*builder, expr.prefilter); + const auto bs = makeGlushkovBuildState(*builder, pe.expr.prefilter); // Map position IDs to characters/components - expr.component->notePositions(*bs); + pe.component->notePositions(*bs); // Wire the start dotstar state to the firsts - connectInitialStates(*bs, expr); + connectInitialStates(*bs, pe); DEBUG_PRINTF("wire up body of expr\n"); // Build the rest of the FOLLOW set vector<PositionInfo> initials = {builder->getStartDotStar(), builder->getStart()}; - expr.component->buildFollowSet(*bs, initials); + pe.component->buildFollowSet(*bs, initials); // Wire the lasts to the accept state - connectFinalStates(*bs, expr); + connectFinalStates(*bs, pe); // Create our edges bs->buildEdges(); - auto g = builder->getGraph(); - assert(g); + BuiltExpression built_expr = builder->getGraph(); + assert(built_expr.g); - dumpDotWrapper(*g, "00_before_asserts", cc.grey); - removeAssertVertices(rm, *g); + dumpDotWrapper(*built_expr.g, built_expr.expr, "00_before_asserts", + cc.grey); + removeAssertVertices(rm, *built_expr.g, built_expr.expr); - return g; + return built_expr; } } // namespace ue2 diff --git a/src/compiler/compiler.h b/src/compiler/compiler.h index 1d7d6536d..60d7ca33c 100644 --- a/src/compiler/compiler.h +++ b/src/compiler/compiler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,11 +35,11 @@ #include "ue2common.h" #include "database.h" +#include "compiler/expression_info.h" #include "parser/Component.h" -#include "som/som.h" +#include "util/noncopyable.h" #include <memory> -#include <boost/core/noncopyable.hpp> struct hs_database; struct hs_expr_ext; @@ -50,34 +50,32 @@ struct CompileContext; struct Grey; struct target_t; class NG; +class NGHolder; class ReportManager; -class NGWrapper; -/** Class gathering together the pieces of a parsed expression. - * Note: Owns the provided component.
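With buildRose() now handing back a bytecode_ptr, the engine's size travels with the allocation itself, which is what lets build() ask rose.size() above instead of the old roseSize() query. A minimal sketch of the shape such a pointer needs — this is not the real util/bytecode_ptr.h, just the idea, under C++17:

```cpp
#include <cstddef>
#include <cstring>
#include <new>

// Sketch: an owning pointer to engine bytecode that remembers its own size
// and alignment, so callers no longer need a separate size query.
template <typename T>
class bytecode_ptr_sketch {
public:
    bytecode_ptr_sketch(std::size_t bytes_in, std::size_t align_in)
        : bytes(bytes_in), align(align_in),
          ptr(static_cast<T *>(::operator new(bytes_in,
                                              std::align_val_t(align_in)))) {
        std::memset(ptr, 0, bytes); // zero-filled, like make_zeroed_bytecode_ptr
    }
    ~bytecode_ptr_sketch() { ::operator delete(ptr, std::align_val_t(align)); }
    bytecode_ptr_sketch(const bytecode_ptr_sketch &) = delete;
    bytecode_ptr_sketch &operator=(const bytecode_ptr_sketch &) = delete;

    T *get() const { return ptr; }
    std::size_t size() const { return bytes; } // replaces roseSize(rose.get())

private:
    std::size_t bytes;
    std::size_t align;
    T *ptr;
};
```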
- */ -class ParsedExpression : boost::noncopyable { +/** \brief Class gathering together the pieces of a parsed expression. */ +class ParsedExpression : noncopyable { public: ParsedExpression(unsigned index, const char *expression, unsigned flags, - ReportID actionId, const hs_expr_ext *ext = nullptr); - - bool utf8; //!< UTF-8 mode flag specified + ReportID report, const hs_expr_ext *ext = nullptr); - /** \brief root node of parsed component tree. */ - std::unique_ptr<Component> component; + /** \brief Expression information (from flags, extparam etc) */ + ExpressionInfo expr; - const bool allow_vacuous; //!< HS_FLAG_ALLOWEMPTY specified - const bool highlander; //!< HS_FLAG_SINGLEMATCH specified - const bool prefilter; //!< HS_FLAG_PREFILTER specified - som_type som; //!< chosen SOM mode, or SOM_NONE + /** \brief Root node of parsed component tree. */ + std::unique_ptr<Component> component; +}; - /** \brief index in expressions array passed to \ref hs_compile_multi */ - const unsigned index; +/** + * \brief Class gathering together the pieces of an expression that has been + * built into an NFA graph. + */ +struct BuiltExpression { + /** \brief Expression information (from flags, extparam etc) */ + ExpressionInfo expr; - const ReportID id; //!< user-specified pattern ID - u64a min_offset; //!< 0 if not used - u64a max_offset; //!< MAX_OFFSET if not used - u64a min_length; //!< 0 if not used + /** \brief Built Glushkov NFA graph. */ + std::unique_ptr<NGHolder> g; }; /** @@ -94,12 +92,12 @@ class ParsedExpression : boost::noncopyable { * @param ext * Struct containing extra parameters for this expression, or NULL if * none. - * @param actionId + * @param report * The identifier to associate with the expression; returned by engine on * match. */ void addExpression(NG &ng, unsigned index, const char *expression, - unsigned flags, const hs_expr_ext *ext, ReportID actionId); + unsigned flags, const hs_expr_ext *ext, ReportID report); /** * Build a Hyperscan database out of the expressions we've been given. A @@ -127,9 +125,8 @@ struct hs_database *build(NG &ng, unsigned int *length); * @return * nullptr on error. */ -std::unique_ptr<NGWrapper> buildWrapper(ReportManager &rm, - const CompileContext &cc, - const ParsedExpression &expr); +BuiltExpression buildGraph(ReportManager &rm, const CompileContext &cc, + const ParsedExpression &expr); /** * Build a platform_t out of a target_t. diff --git a/src/compiler/expression_info.h b/src/compiler/expression_info.h new file mode 100644 index 000000000..7775f59e7 --- /dev/null +++ b/src/compiler/expression_info.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief ExpressionInfo class for storing the properties of an expression. + */ + +#ifndef COMPILER_EXPRESSION_INFO_H +#define COMPILER_EXPRESSION_INFO_H + +#include "ue2common.h" +#include "som/som.h" + +namespace ue2 { + +/** \brief Properties of an expression. */ +class ExpressionInfo { +public: + ExpressionInfo(unsigned int index_in, bool allow_vacuous_in, + bool highlander_in, bool utf8_in, bool prefilter_in, + som_type som_in, ReportID report_in, u64a min_offset_in, + u64a max_offset_in, u64a min_length_in, u32 edit_distance_in) + : index(index_in), report(report_in), allow_vacuous(allow_vacuous_in), + highlander(highlander_in), utf8(utf8_in), prefilter(prefilter_in), + som(som_in), min_offset(min_offset_in), max_offset(max_offset_in), + min_length(min_length_in), edit_distance(edit_distance_in) {} + + /** + * \brief Index of the expression represented by this graph. + * + * Used: + * - down the track in error handling; + * - for identifying parts of an expression in highlander mode. + */ + unsigned int index; + + /** \brief Report ID specified by the user. */ + ReportID report; + + /** \brief Vacuous pattern is allowed. (HS_FLAG_ALLOWEMPTY) */ + bool allow_vacuous; + + /** \brief "Highlander" (single match) pattern. (HS_FLAG_SINGLEMATCH) */ + bool highlander; + + /** \brief UTF-8 pattern. (HS_FLAG_UTF8) */ + bool utf8; + + /** \brief Prefiltering pattern. (HS_FLAG_PREFILTER) */ + bool prefilter; + + /** \brief Start-of-match type requested, or SOM_NONE. */ + som_type som; + + /** \brief Minimum match offset extended parameter. 0 if not used. */ + u64a min_offset; + + /** + * \brief Maximum match offset extended parameter. + * MAX_OFFSET if not used. + */ + u64a max_offset; + + /** \brief Minimum match length extended parameter. 0 if not used. */ + u64a min_length; + + /** + * \brief Approximate matching edit distance extended parameter. + * 0 if not used. 
+ */ + u32 edit_distance; +}; + +} + +#endif // COMPILER_EXPRESSION_INFO_H diff --git a/src/crc32.c b/src/crc32.c index b85acc7f5..1dae47b4e 100644 --- a/src/crc32.c +++ b/src/crc32.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,14 +29,10 @@ #include "crc32.h" #include "config.h" #include "ue2common.h" +#include "util/arch.h" +#include "util/intrinsics.h" -#if defined(HAVE_C_X86INTRIN_H) -#include <x86intrin.h> -#elif defined(HAVE_C_INTRIN_H) -#include <intrin.h> -#endif - -#ifndef __SSE4_2__ /*** *** What follows is derived from Intel's Slicing-by-8 CRC32 impl, which is BSD @@ -582,7 +578,7 @@ u32 crc32c_sb8_64_bit(u32 running_crc, const unsigned char* p_buf, return crc; } -#else // __SSE4_2__ +#else // HAVE_SSE42 #ifdef ARCH_64_BIT #define CRC_WORD 8 @@ -638,7 +634,7 @@ u32 crc32c_sse42(u32 running_crc, const unsigned char* p_buf, // Externally visible function u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) { -#ifdef __SSE4_2__ +#if defined(HAVE_SSE42) u32 crc = crc32c_sse42(inCrc32, (const unsigned char *)buf, bufLen); #else u32 crc = crc32c_sb8_64_bit(inCrc32, (const unsigned char *)buf, bufLen); diff --git a/src/database.c b/src/database.c index 61eb021fa..dc03bf1fb 100644 --- a/src/database.c +++ b/src/database.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,7 +49,7 @@ int db_correctly_aligned(const void *db) { } HS_PUBLIC_API -hs_error_t hs_free_database(hs_database_t *db) { +hs_error_t HS_CDECL hs_free_database(hs_database_t *db) { if (db && db->magic != HS_DB_MAGIC) { return HS_INVALID; } @@ -59,8 +59,8 @@ hs_error_t hs_free_database(hs_database_t *db) { } HS_PUBLIC_API -hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes, - size_t *serialized_length) { +hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, + size_t *serialized_length) { if (!db || !bytes || !serialized_length) { return HS_INVALID; } @@ -114,7 +114,8 @@ hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes, static hs_error_t db_check_platform(const u64a p) { if (p != hs_current_platform - && p != hs_current_platform_no_avx2) { + && p != hs_current_platform_no_avx2 + && p != hs_current_platform_no_avx512) { return HS_DB_PLATFORM_ERROR; } // passed all checks @@ -195,8 +196,9 @@ void db_copy_bytecode(const char *serialized, hs_database_t *db) { } HS_PUBLIC_API -hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length, - hs_database_t *db) { +hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes, + const size_t length, + hs_database_t *db) { if (!bytes || !db) { return HS_INVALID; } @@ -237,8 +239,9 @@ hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length, } HS_PUBLIC_API -hs_error_t hs_deserialize_database(const char *bytes, const size_t length, - hs_database_t **db) { +hs_error_t HS_CDECL hs_deserialize_database(const char *bytes, + const size_t length, + hs_database_t **db) { if (!bytes || !db) { return HS_INVALID; } @@ -286,7 +289,7 @@ hs_error_t hs_deserialize_database(const char *bytes, const size_t length, } HS_PUBLIC_API -hs_error_t 
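The HAVE_SSE42 path selected in the crc32.c hunk above uses the hardware CRC32C instruction. A scalar sketch of that approach — the shipping crc32c_sse42() adds alignment handling and a CRC_WORD-sized main loop; build with -msse4.2:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <nmmintrin.h> // SSE4.2 CRC32 intrinsics

// Fold 8 bytes per hardware CRC32C instruction, then finish byte-wise.
static uint32_t crc32c_hw_sketch(uint32_t crc, const unsigned char *buf,
                                 size_t len) {
    uint64_t c = crc;
    while (len >= sizeof(uint64_t)) {
        uint64_t v;
        std::memcpy(&v, buf, sizeof(v));
        c = _mm_crc32_u64(c, v);
        buf += sizeof(uint64_t);
        len -= sizeof(uint64_t);
    }
    while (len--) {
        c = _mm_crc32_u8(static_cast<uint32_t>(c), *buf++);
    }
    return static_cast<uint32_t>(c);
}
```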
hs_database_size(const hs_database_t *db, size_t *size) { +hs_error_t HS_CDECL hs_database_size(const hs_database_t *db, size_t *size) { if (!size) { return HS_INVALID; } @@ -301,8 +304,9 @@ hs_error_t hs_database_size(const hs_database_t *db, size_t *size) { } HS_PUBLIC_API -hs_error_t hs_serialized_database_size(const char *bytes, const size_t length, - size_t *size) { +hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes, + const size_t length, + size_t *size) { // Decode and check the header hs_database_t header; hs_error_t ret = db_decode_header(&bytes, length, &header); @@ -366,7 +370,9 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat, u8 minor = (version >> 16) & 0xff; u8 major = (version >> 24) & 0xff; - const char *avx2 = (plat & HS_PLATFORM_NOAVX2) ? "NOAVX2" : " AVX2"; + const char *features = (plat & HS_PLATFORM_NOAVX512) + ? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2" + : "AVX512"; const char *mode = NULL; @@ -395,7 +401,7 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat, // that don't have snprintf but have a workalike. int p_len = SNPRINTF_COMPAT( buf, len, "Version: %u.%u.%u Features: %s Mode: %s", - major, minor, release, avx2, mode); + major, minor, release, features, mode); if (p_len < 0) { DEBUG_PRINTF("snprintf output error, returned %d\n", p_len); hs_misc_free(buf); @@ -414,8 +420,8 @@ hs_error_t print_database_string(char **s, u32 version, const platform_t plat, } HS_PUBLIC_API -hs_error_t hs_serialized_database_info(const char *bytes, size_t length, - char **info) { +hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes, + size_t length, char **info) { if (!info) { return HS_INVALID; } @@ -434,7 +440,7 @@ hs_error_t hs_serialized_database_info(const char *bytes, size_t length, } HS_PUBLIC_API -hs_error_t hs_database_info(const hs_database_t *db, char **info) { +hs_error_t HS_CDECL hs_database_info(const hs_database_t *db, char **info) { if (!info) { return HS_INVALID; } diff --git a/src/database.h b/src/database.h index 399513fc2..5715ed677 100644 --- a/src/database.h +++ b/src/database.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,7 @@ extern "C" #include "hs_compile.h" // for HS_MODE_ flags #include "hs_version.h" #include "ue2common.h" +#include "util/arch.h" #define HS_DB_VERSION HS_VERSION_32BIT #define HS_DB_MAGIC (0xdbdbdbdbU) @@ -53,14 +54,18 @@ extern "C" #define HS_PLATFORM_CPU_MASK 0x3F #define HS_PLATFORM_NOAVX2 (4<<13) +#define HS_PLATFORM_NOAVX512 (8<<13) /** \brief Platform features bitmask. 
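The feature string in print_database_string() above is now derived from the widest ISA downwards, since a database built for AVX512 implies AVX2 support as well. The nested conditional, unrolled for clarity (using the constants defined in database.h):

```cpp
#include <cstdint>
#include <cstdio>

#define HS_PLATFORM_NOAVX2 (4 << 13)
#define HS_PLATFORM_NOAVX512 (8 << 13)

// Absence bits are tested widest-first: no NOAVX512 bit means AVX512,
// otherwise no NOAVX2 bit means AVX2, otherwise no SIMD feature string.
static const char *features(uint64_t plat) {
    if (!(plat & HS_PLATFORM_NOAVX512)) {
        return "AVX512";
    }
    if (!(plat & HS_PLATFORM_NOAVX2)) {
        return "AVX2";
    }
    return "";
}

int main() {
    std::printf("%s\n", features(0));                    // "AVX512"
    std::printf("%s\n", features(HS_PLATFORM_NOAVX512)); // "AVX2"
    std::printf("%s\n", features(HS_PLATFORM_NOAVX512 |
                                 HS_PLATFORM_NOAVX2));   // ""
    return 0;
}
```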
*/ typedef u64a platform_t; static UNUSED const platform_t hs_current_platform = { -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) HS_PLATFORM_NOAVX2 | +#endif +#if !defined(HAVE_AVX512) + HS_PLATFORM_NOAVX512 | #endif 0, }; @@ -68,6 +73,13 @@ const platform_t hs_current_platform = { static UNUSED const platform_t hs_current_platform_no_avx2 = { HS_PLATFORM_NOAVX2 | + HS_PLATFORM_NOAVX512 | + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx512 = { + HS_PLATFORM_NOAVX512 | 0, }; diff --git a/src/dispatcher.c b/src/dispatcher.c index fb2f4f02a..5ae46b56f 100644 --- a/src/dispatcher.c +++ b/src/dispatcher.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,8 +33,14 @@ #include "util/cpuid_flags.h" #include "util/join.h" +#if defined(DISABLE_AVX512_DISPATCH) +#define avx512_ disabled_ +#define check_avx512() (0) +#endif + #define CREATE_DISPATCH(RTYPE, NAME, ...) \ /* create defns */ \ + RTYPE JOIN(avx512_, NAME)(__VA_ARGS__); \ RTYPE JOIN(avx2_, NAME)(__VA_ARGS__); \ RTYPE JOIN(corei7_, NAME)(__VA_ARGS__); \ RTYPE JOIN(core2_, NAME)(__VA_ARGS__); \ @@ -46,6 +52,9 @@ \ /* resolver */ \ static void(*JOIN(resolve_, NAME)(void)) { \ + if (check_avx512()) { \ + return JOIN(avx512_, NAME); \ + } \ if (check_avx2()) { \ return JOIN(avx2_, NAME); \ } \ diff --git a/src/fdr/engine_description.h b/src/fdr/engine_description.h index 09b161796..b545e6474 100644 --- a/src/fdr/engine_description.h +++ b/src/fdr/engine_description.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,29 +38,19 @@ class EngineDescription { u32 id; target_t code_target; // the target that we built this code for u32 numBuckets; - u32 confirmPullBackDistance; - u32 confirmTopLevelSplit; public: EngineDescription(u32 id_in, const target_t &code_target_in, - u32 numBuckets_in, u32 confirmPullBackDistance_in, - u32 confirmTopLevelSplit_in) - : id(id_in), code_target(code_target_in), numBuckets(numBuckets_in), - confirmPullBackDistance(confirmPullBackDistance_in), - confirmTopLevelSplit(confirmTopLevelSplit_in) {} + u32 numBuckets_in) + : id(id_in), code_target(code_target_in), numBuckets(numBuckets_in) {} virtual ~EngineDescription(); u32 getID() const { return id; } u32 getNumBuckets() const { return numBuckets; } - u32 getConfirmPullBackDistance() const { return confirmPullBackDistance; } - u32 getConfirmTopLevelSplit() const { return confirmTopLevelSplit; } - void setConfirmTopLevelSplit(u32 split) { confirmTopLevelSplit = split; } bool isValidOnTarget(const target_t &target_in) const; virtual u32 getDefaultFloodSuffixLength() const = 0; - - virtual bool typicallyHoldsOneCharLits() const { return true; } }; /** Returns a target given a CPU feature set value. 
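The dispatcher.c change above extends the existing resolve-on-first-call scheme with an AVX512 branch; when DISABLE_AVX512_DISPATCH is defined, check_avx512() is compiled to (0) and that branch folds away. Stripped of the CREATE_DISPATCH/JOIN macros, the pattern looks roughly like this — function names and return values here are hypothetical stand-ins:

```cpp
#include <cstdio>

// Hypothetical stand-ins for the per-ISA builds of one API entry point.
static int avx512_scan(const char *, int) { return 3; }
static int avx2_scan(const char *, int) { return 2; }
static int core2_scan(const char *, int) { return 1; }

// Stand-ins for the CPUID-based feature tests from util/cpuid_flags.h.
static bool check_avx512() { return false; }
static bool check_avx2() { return true; }

using scan_fn = int (*)(const char *, int);

// Resolver: most capable ISA wins, which is why the new check_avx512()
// test sits ahead of the existing check_avx2() one.
static scan_fn resolve_scan() {
    if (check_avx512()) {
        return avx512_scan;
    }
    if (check_avx2()) {
        return avx2_scan;
    }
    return core2_scan;
}

int main() {
    scan_fn scan = resolve_scan();
    std::printf("selected implementation: %d\n", scan("data", 4));
    return 0;
}
```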
*/ diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 23416c707..92e75aaa8 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,9 @@ #include "flood_runtime.h" #include "teddy.h" #include "teddy_internal.h" +#include "util/arch.h" #include "util/simd_utils.h" +#include "util/uniform_ops.h" /** \brief number of bytes processed in each iteration */ #define ITER_BYTES 16 @@ -51,7 +53,7 @@ * * The incoming buffer is to split in multiple zones to ensure two properties: * 1: that we can read 8? bytes behind to generate a hash safely - * 2: that we can read the byte after the current byte (domain > 8) + * 2: that we can read the 3 bytes after the current byte (domain > 8) */ struct zone { /** \brief copied buffer, used only when it is a boundary zone. */ @@ -116,20 +118,34 @@ const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } }; +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn(const u32 a, const u8 *b) { + u64a r; +#if defined(HAVE_BMI) && !defined(NO_ASM) + __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); +#else + r = unaligned_load_u32(b) & ~a; +#endif + return r; +} + /* generates an initial state mask based on the last byte-ish of history rather * than being all accepting. If there is no history to consider, the state is * generated based on the minimum length of each bucket in order to prevent * confirms. */ static really_inline -m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft, +m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, const struct zone *z) { m128 s; if (len_history) { /* +1: the zones ensure that we can read the byte at z->end */ u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); tmp &= fdr->domainMask; - s = *((const m128 *)ft + tmp); + s = load_m128_from_u64a(ft + tmp); s = rshiftbyte_m128(s, 1); } else { s = fdr->start; @@ -138,51 +154,30 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u8 *ft, } static really_inline -void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, - u64a domain_mask_adjusted, const u8 *ft, u64a *conf0, - u64a *conf8, m128 *s) { +void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ - - u64a current_data_0; - u64a current_data_8; - - current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr); - u64a v7 = (lv_u16(itPtr + 7, start_ptr, end_ptr + 1) << 1) & - domain_mask_adjusted; - u64a v0 = (current_data_0 << 1) & domain_mask_adjusted; - u64a v1 = (current_data_0 >> 7) & domain_mask_adjusted; - u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted; - u64a v3 = (current_data_0 >> 23) & domain_mask_adjusted; - u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted; - u64a v5 = (current_data_0 >> 39) & domain_mask_adjusted; - u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted; - current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr); - u64a v15 = (lv_u16(itPtr + 15, start_ptr, end_ptr + 1) << 1) & - domain_mask_adjusted; - u64a v8 = 
(current_data_8 << 1) & domain_mask_adjusted; - u64a v9 = (current_data_8 >> 7) & domain_mask_adjusted; - u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted; - u64a v11 = (current_data_8 >> 23) & domain_mask_adjusted; - u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted; - u64a v13 = (current_data_8 >> 39) & domain_mask_adjusted; - u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted; - - m128 st0 = *(const m128 *)(ft + v0*8); - m128 st1 = *(const m128 *)(ft + v1*8); - m128 st2 = *(const m128 *)(ft + v2*8); - m128 st3 = *(const m128 *)(ft + v3*8); - m128 st4 = *(const m128 *)(ft + v4*8); - m128 st5 = *(const m128 *)(ft + v5*8); - m128 st6 = *(const m128 *)(ft + v6*8); - m128 st7 = *(const m128 *)(ft + v7*8); - m128 st8 = *(const m128 *)(ft + v8*8); - m128 st9 = *(const m128 *)(ft + v9*8); - m128 st10 = *(const m128 *)(ft + v10*8); - m128 st11 = *(const m128 *)(ft + v11*8); - m128 st12 = *(const m128 *)(ft + v12*8); - m128 st13 = *(const m128 *)(ft + v13*8); - m128 st14 = *(const m128 *)(ft + v14*8); - m128 st15 = *(const m128 *)(ft + v15*8); + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach1 = andn(domain_mask_flipped, itPtr + 1); + u64a reach2 = andn(domain_mask_flipped, itPtr + 2); + u64a reach3 = andn(domain_mask_flipped, itPtr + 3); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st3 = load_m128_from_u64a(ft + reach3); + + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach5 = andn(domain_mask_flipped, itPtr + 5); + u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + u64a reach7 = andn(domain_mask_flipped, itPtr + 7); + + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st7 = load_m128_from_u64a(ft + reach7); st1 = lshiftbyte_m128(st1, 1); st2 = lshiftbyte_m128(st2, 2); @@ -191,6 +186,40 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, st5 = lshiftbyte_m128(st5, 5); st6 = lshiftbyte_m128(st6, 6); st7 = lshiftbyte_m128(st7, 7); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + *s = or128(*s, st0); + + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach9 = andn(domain_mask_flipped, itPtr + 9); + u64a reach10 = andn(domain_mask_flipped, itPtr + 10); + u64a reach11 = andn(domain_mask_flipped, itPtr + 11); + + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st11 = load_m128_from_u64a(ft + reach11); + + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a reach13 = andn(domain_mask_flipped, itPtr + 13); + u64a reach14 = andn(domain_mask_flipped, itPtr + 14); + u64a reach15 = andn(domain_mask_flipped, itPtr + 15); + + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st14 = load_m128_from_u64a(ft + reach14); + m128 st15 = load_m128_from_u64a(ft + reach15); + st9 = lshiftbyte_m128(st9, 1); st10 = lshiftbyte_m128(st10, 2); st11 = lshiftbyte_m128(st11, 3); @@ -199,100 +228,86 @@ void get_conf_stride_1(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, st14 = lshiftbyte_m128(st14, 6); 
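Each reach value above is one ANDN: with domain_mask_flipped = ~fdr->domainMask, andn() yields unaligned_load_u32(itPtr + i) & fdr->domainMask, i.e. the table index for the domain bits starting at that byte. A scalar model of the step — illustrative only; the real kernel does sixteen of these per iteration and ORs the fetched m128 states together:

```cpp
#include <cstdint>
#include <cstring>

// Scalar model of the new index computation: one unaligned 32-bit load
// masked down to the FDR "domain" bits, which the ANDN instruction
// computes in a single operation.
static inline uint64_t domain_index(uint32_t domain_mask_flipped,
                                    const uint8_t *p) {
    uint32_t v;
    std::memcpy(&v, p, sizeof(v));   // unaligned load, as unaligned_load_u32()
    return v & ~domain_mask_flipped; // == v & domainMask
}
```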
st15 = lshiftbyte_m128(st15, 7); - *s = or128(*s, st0); - *s = or128(*s, st1); - *s = or128(*s, st2); - *s = or128(*s, st3); - *s = or128(*s, st4); - *s = or128(*s, st5); - *s = or128(*s, st6); - *s = or128(*s, st7); - *conf0 = movq(*s); - *s = rshiftbyte_m128(*s, 8); - *conf0 ^= ~0ULL; - + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); *s = or128(*s, st8); - *s = or128(*s, st9); - *s = or128(*s, st10); - *s = or128(*s, st11); - *s = or128(*s, st12); - *s = or128(*s, st13); - *s = or128(*s, st14); - *s = or128(*s, st15); + *conf8 = movq(*s); *s = rshiftbyte_m128(*s, 8); *conf8 ^= ~0ULL; } static really_inline -void get_conf_stride_2(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, - u64a domain_mask_adjusted, const u8 *ft, u64a *conf0, - u64a *conf8, m128 *s) { - u64a current_data_0; - u64a current_data_8; - - current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr); - u64a v0 = (current_data_0 << 1) & domain_mask_adjusted; - u64a v2 = (current_data_0 >> 15) & domain_mask_adjusted; - u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted; - u64a v6 = (current_data_0 >> 47) & domain_mask_adjusted; - current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr); - u64a v8 = (current_data_8 << 1) & domain_mask_adjusted; - u64a v10 = (current_data_8 >> 15) & domain_mask_adjusted; - u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted; - u64a v14 = (current_data_8 >> 47) & domain_mask_adjusted; - - m128 st0 = *(const m128 *)(ft + v0*8); - m128 st2 = *(const m128 *)(ft + v2*8); - m128 st4 = *(const m128 *)(ft + v4*8); - m128 st6 = *(const m128 *)(ft + v6*8); - m128 st8 = *(const m128 *)(ft + v8*8); - m128 st10 = *(const m128 *)(ft + v10*8); - m128 st12 = *(const m128 *)(ft + v12*8); - m128 st14 = *(const m128 *)(ft + v14*8); +void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach2 = andn(domain_mask_flipped, itPtr + 2); + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st6 = load_m128_from_u64a(ft + reach6); + + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach10 = andn(domain_mask_flipped, itPtr + 10); + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a reach14 = andn(domain_mask_flipped, itPtr + 14); + + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st14 = load_m128_from_u64a(ft + reach14); st2 = lshiftbyte_m128(st2, 2); st4 = lshiftbyte_m128(st4, 4); st6 = lshiftbyte_m128(st6, 6); - st10 = lshiftbyte_m128(st10, 2); - st12 = lshiftbyte_m128(st12, 4); - st14 = lshiftbyte_m128(st14, 6); *s = or128(*s, st0); *s = or128(*s, st2); *s = or128(*s, st4); *s = or128(*s, st6); + *conf0 = movq(*s); *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; + st10 = lshiftbyte_m128(st10, 2); + st12 = lshiftbyte_m128(st12, 4); + st14 = lshiftbyte_m128(st14, 6); + *s = or128(*s, st8); *s = or128(*s, st10); *s = or128(*s, st12); *s = or128(*s, st14); + *conf8 = movq(*s); *s = rshiftbyte_m128(*s, 8); 
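The same hunks also replace the old serial chain of eight or128() calls with pairwise combining, cutting the dependency chain from seven sequential ORs to three. The shape of that reduction, in scalar form:

```cpp
#include <cstdint>

// Pairwise OR tree over eight state words: log2(8) = 3 dependent steps,
// where the removed code needed 7 (each OR waited on the previous one).
static inline uint64_t or_tree8(const uint64_t st[8]) {
    uint64_t a = st[0] | st[1];
    uint64_t b = st[2] | st[3];
    uint64_t c = st[4] | st[5];
    uint64_t d = st[6] | st[7];
    return (a | b) | (c | d);
}
```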
*conf8 ^= ~0ULL; } static really_inline -void get_conf_stride_4(const u8 *itPtr, const u8 *start_ptr, const u8 *end_ptr, - u64a domain_mask_adjusted, const u8 *ft, u64a *conf0, - u64a *conf8, m128 *s) { - u64a current_data_0; - u64a current_data_8; - - current_data_0 = lv_u64a(itPtr + 0, start_ptr, end_ptr); - u64a v0 = (current_data_0 << 1) & domain_mask_adjusted; - u64a v4 = (current_data_0 >> 31) & domain_mask_adjusted; - current_data_8 = lv_u64a(itPtr + 8, start_ptr, end_ptr); - u64a v8 = (current_data_8 << 1) & domain_mask_adjusted; - u64a v12 = (current_data_8 >> 31) & domain_mask_adjusted; - - m128 st0 = *(const m128 *)(ft + v0*8); - m128 st4 = *(const m128 *)(ft + v4*8); - m128 st8 = *(const m128 *)(ft + v8*8); - m128 st12 = *(const m128 *)(ft + v12*8); +void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st12 = load_m128_from_u64a(ft + reach12); st4 = lshiftbyte_m128(st4, 4); st12 = lshiftbyte_m128(st12, 4); @@ -315,7 +330,6 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, const u32 *confBase, const struct FDR_Runtime_Args *a, const u8 *ptr, u32 *last_match_id, struct zone *z) { const u8 bucket = 8; - const u8 pullback = 1; if (likely(!*conf)) { return; @@ -332,8 +346,7 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, u32 bit = findAndClearLSB_64(conf); u32 byte = bit / bucket + offset; u32 bitRem = bit % bucket; - u32 confSplit = *(ptr + byte); - u32 idx = confSplit * bucket + bitRem; + u32 idx = bitRem; u32 cf = confBase[idx]; if (!cf) { continue; @@ -343,18 +356,8 @@ void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, if (!(fdrc->groups & *control)) { continue; } - if (!fdrc->mult) { - u32 id = fdrc->nBitsOrSoleID; - if ((*last_match_id == id) && (fdrc->flags & NoRepeat)) { - continue; - } - *last_match_id = id; - *control = a->cb(ptr_main + byte - a->buf, ptr_main + byte - a->buf, - id, a->ctxt); - continue; - } - u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a)); - confWithBit(fdrc, a, ptr_main - a->buf + byte, pullback, control, + u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); + confWithBit(fdrc, a, ptr_main - a->buf + byte, control, last_match_id, confVal); } while (unlikely(!!*conf)); } @@ -496,6 +499,7 @@ void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin, /* copy the post-padding byte; this is required for domain > 8 due to * overhang */ + assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64); *z_end = 0; z->end = z_end; @@ -566,15 +570,19 @@ void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin, storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); + + assert(ZONE_START_BEGIN + copy_len + 3 < 64); } /** * \brief Create a zone for the end region. * * This function requires that there is > ITER_BYTES of data in the buffer to - * scan. The end zone, however, is only responsible for a scanning the <= - * ITER_BYTES rump of data. 
The end zone is required to handle a full ITER_BYTES - iteration as the main loop cannot handle the last byte of the buffer. + * scan. The end zone is responsible for scanning the <= ITER_BYTES rump of + * data, plus an optional extra full ITER_BYTES. The main zone cannot handle + * the last 3 bytes of the buffer. The end zone is required to handle an + * optional full ITER_BYTES block from the main zone when fewer than 3 bytes + * remain to scan. The main zone size is reduced by ITER_BYTES in this case. * * This zone ensures that the byte at z->end can be read by filling it with a * padding character. @@ -592,31 +600,45 @@ void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, ptrdiff_t z_len = end - begin; assert(z_len > 0); - assert(z_len <= ITER_BYTES); + size_t iter_bytes_second = 0; + size_t z_len_first = z_len; + if (z_len > ITER_BYTES) { + z_len_first = z_len - ITER_BYTES; + iter_bytes_second = ITER_BYTES; + } + z->shift = ITER_BYTES - z_len_first; - z->shift = ITER_BYTES - z_len; + const u8 *end_first = end - iter_bytes_second; + /* The amount of data we have to copy from main buffer for the + * first iteration. */ + size_t copy_len_first = MIN((size_t)(end_first - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + assert(copy_len_first >= 16); - /* The amount of data we have to copy from main buffer. */ - size_t copy_len = MIN((size_t)(end - buf), - ITER_BYTES + sizeof(CONF_TYPE)); - assert(copy_len >= 16); + size_t total_copy_len = copy_len_first + iter_bytes_second; + assert(total_copy_len + 3 < 64); /* copy the post-padding byte; this is required for domain > 8 due to * overhang */ - z->buf[copy_len] = 0; + z->buf[total_copy_len] = 0; /* set the start and end location of the zone buf * to be scanned */ - u8 *z_end = z->buf + copy_len; + u8 *z_end = z->buf + total_copy_len; z->end = z_end; - z->start = z_end - ITER_BYTES; + z->start = z_end - ITER_BYTES - iter_bytes_second; assert(z->start + z->shift == z_end - z_len); + u8 *z_end_first = z_end - iter_bytes_second; /* copy the first 8 bytes of the valid region */ - unaligned_store_u64a(z->buf, unaligned_load_u64a(end - copy_len)); + unaligned_store_u64a(z->buf, + unaligned_load_u64a(end_first - copy_len_first)); /* copy the last 16 bytes, may overlap with the previous 8 byte write */ - storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128))); + if (iter_bytes_second) { + storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + } z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); } @@ -651,13 +673,13 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, /* find maximum buffer location that the main zone can scan * - must be a multiple of ITER_BYTES, and - * - cannot contain the last byte (due to overhang) + * - cannot contain the last 3 bytes (due to the 3 bytes read past the + end of the buffer by the FDR main loop) */ - const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 1, ITER_BYTES); - assert(main_end >= ptr); + const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES); /* create a zone if multiple of ITER_BYTES are found */ - if (main_end != ptr) { + if (main_end > ptr) { createMainZone(flood, ptr, main_end, &zoneArr[numZone++]); ptr = main_end; } @@ -684,10 +706,10 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, return HWLM_TERMINATED; \ } \ } \ - __builtin_prefetch(itPtr + (ITER_BYTES*4)); \ + __builtin_prefetch(itPtr + ITER_BYTES); \ u64a conf0; \ u64a conf8; \ - 
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_adjusted, \ + get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ ft, &conf0, &conf8, &s); \ do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ &last_match_id, zz); \ @@ -705,10 +727,11 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, hwlm_group_t control) { u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; - u64a domain_mask_adjusted = fdr->domainMask << 1; + u32 domain_mask_flipped = ~fdr->domainMask; u8 stride = fdr->stride; - const u8 *ft = (const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR)); - const u32 *confBase = (const u32 *)(ft + fdr->tabSize); + const u64a *ft = + (const u64a *)((const u8 *)fdr + ROUNDUP_16(sizeof(struct FDR))); + const u32 *confBase = (const u32 *)((const u8 *)ft + fdr->tabSize); struct zone zones[ZONE_MAX]; assert(fdr->domain > 8 && fdr->domain < 16); @@ -761,7 +784,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, return HWLM_SUCCESS; } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) #define ONLY_AVX2(func) func #else #define ONLY_AVX2(func) NULL @@ -773,8 +796,8 @@ typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, static const FDRFUNCTYPE funcs[] = { fdr_engine_exec, - ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fast), - ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fast), + NULL, /* old: fast teddy */ + NULL, /* old: fast teddy */ ONLY_AVX2(fdr_exec_teddy_avx2_msks1_fat), ONLY_AVX2(fdr_exec_teddy_avx2_msks1_pck_fat), ONLY_AVX2(fdr_exec_teddy_avx2_msks2_fat), diff --git a/src/fdr/fdr_compile.cpp b/src/fdr/fdr_compile.cpp index 937513a85..c4ea50f27 100644 --- a/src/fdr/fdr_compile.cpp +++ b/src/fdr/fdr_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,8 +30,9 @@ * \brief FDR literal matcher: build API. 
*/ -#include "fdr_internal.h" #include "fdr_compile.h" + +#include "fdr_internal.h" #include "fdr_confirm.h" #include "fdr_compile_internal.h" #include "fdr_engine_description.h" @@ -40,9 +41,10 @@ #include "grey.h" #include "ue2common.h" #include "hwlm/hwlm_build.h" -#include "util/alloc.h" #include "util/compare.h" #include "util/dump_mask.h" +#include "util/math.h" +#include "util/noncopyable.h" #include "util/target_info.h" #include "util/ue2string.h" #include "util/verify_types.h" @@ -53,13 +55,15 @@ #include #include #include +#include #include #include +#include #include #include #include -#include +#include using namespace std; @@ -67,31 +71,31 @@ namespace ue2 { namespace { -class FDRCompiler : boost::noncopyable { +class FDRCompiler : noncopyable { private: const FDREngineDescription ŋ + const Grey &grey; vector tab; - const vector &lits; + vector lits; map > bucketToLits; bool make_small; u8 *tabIndexToMask(u32 indexInTable); - void assignStringToBucket(LiteralIndex l, BucketIndex b); void assignStringsToBuckets(); #ifdef DEBUG void dumpMasks(const u8 *defaultMask); #endif void setupTab(); - aligned_unique_ptr setupFDR(pair, size_t> &link); + bytecode_ptr setupFDR(); void createInitialState(FDR *fdr); public: - FDRCompiler(const vector &lits_in, - const FDREngineDescription &eng_in, bool make_small_in) - : eng(eng_in), tab(eng_in.getTabSizeBytes()), lits(lits_in), - make_small(make_small_in) {} + FDRCompiler(vector lits_in, const FDREngineDescription &eng_in, + bool make_small_in, const Grey &grey_in) + : eng(eng_in), grey(grey_in), tab(eng_in.getTabSizeBytes()), + lits(move(lits_in)), make_small(make_small_in) {} - aligned_unique_ptr build(pair, size_t> &link); + bytecode_ptr build(); }; u8 *FDRCompiler::tabIndexToMask(u32 indexInTable) { @@ -140,27 +144,25 @@ void FDRCompiler::createInitialState(FDR *fdr) { } } -aligned_unique_ptr -FDRCompiler::setupFDR(pair, size_t> &link) { +bytecode_ptr FDRCompiler::setupFDR() { size_t tabSize = eng.getTabSizeBytes(); - auto floodControlTmp = setupFDRFloodControl(lits, eng); - auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small); + auto floodControlTmp = setupFDRFloodControl(lits, eng, grey); + auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small); assert(ISALIGNED_16(tabSize)); - assert(ISALIGNED_16(confirmTmp.second)); - assert(ISALIGNED_16(floodControlTmp.second)); - assert(ISALIGNED_16(link.second)); + assert(ISALIGNED_16(confirmTmp.size())); + assert(ISALIGNED_16(floodControlTmp.size())); size_t headerSize = ROUNDUP_16(sizeof(FDR)); - size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.second + - floodControlTmp.second + link.second); + size_t size = ROUNDUP_16(headerSize + tabSize + confirmTmp.size() + + floodControlTmp.size()); DEBUG_PRINTF("sizes base=%zu tabSize=%zu confirm=%zu floodControl=%zu " "total=%zu\n", - headerSize, tabSize, confirmTmp.second, floodControlTmp.second, + headerSize, tabSize, confirmTmp.size(), floodControlTmp.size(), size); - aligned_unique_ptr fdr = aligned_zmalloc_unique(size); + auto fdr = make_zeroed_bytecode_ptr(size, 64); assert(fdr); // otherwise would have thrown std::bad_alloc fdr->size = size; @@ -169,16 +171,16 @@ FDRCompiler::setupFDR(pair, size_t> &link) { createInitialState(fdr.get()); u8 *fdr_base = (u8 *)fdr.get(); - u8 * ptr = fdr_base + ROUNDUP_16(sizeof(FDR)); + u8 *ptr = fdr_base + ROUNDUP_16(sizeof(FDR)); copy(tab.begin(), tab.end(), ptr); ptr += tabSize; - memcpy(ptr, confirmTmp.first.get(), confirmTmp.second); - ptr += confirmTmp.second; + 
memcpy(ptr, confirmTmp.get(), confirmTmp.size()); + ptr += confirmTmp.size(); fdr->floodOffset = verify_u32(ptr - fdr_base); - memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second); - ptr += floodControlTmp.second; + memcpy(ptr, floodControlTmp.get(), floodControlTmp.size()); + ptr += floodControlTmp.size(); /* we are allowing domains 9 to 15 only */ assert(eng.bits > 8 && eng.bits < 16); @@ -187,76 +189,124 @@ fdr->tabSize = (1 << eng.bits) * (eng.schemeWidth / 8); fdr->stride = eng.stride; - if (link.first) { - fdr->link = verify_u32(ptr - fdr_base); - memcpy(ptr, link.first.get(), link.second); - } else { - fdr->link = 0; - } - return fdr; } -void FDRCompiler::assignStringToBucket(LiteralIndex l, BucketIndex b) { - bucketToLits[b].push_back(l); +//#define DEBUG_ASSIGNMENT + +static +double getScoreUtil(u32 len, u32 count) { + return len == 0 ? numeric_limits<double>::max() + : our_pow(count, 1.05) * our_pow(len, -3.0); } -struct LitOrder { - explicit LitOrder(const vector<hwlmLiteral> &vl_) : vl(vl_) {} - bool operator()(const u32 &i1, const u32 &i2) const { - const string &i1s = vl[i1].s; - const string &i2s = vl[i2].s; +/** + * Returns true if the two given literals should be placed in the same chunk as + * they are identical except for a difference in caselessness. + */ +static +bool isEquivLit(const hwlmLiteral &a, const hwlmLiteral &b, + const hwlmLiteral *last_nocase_lit) { + const size_t a_len = a.s.size(); + const size_t b_len = b.s.size(); - size_t len1 = i1s.size(), len2 = i2s.size(); + if (a_len != b_len) { + return false; + } - if (len1 != len2) { - return len1 < len2; - } else { - auto p = std::mismatch(i1s.rbegin(), i1s.rend(), i2s.rbegin()); - if (p.first == i1s.rend()) { - return false; + bool nocase = last_nocase_lit && a_len == last_nocase_lit->s.size() && + !cmp(a.s.c_str(), last_nocase_lit->s.c_str(), a_len, true); + return !cmp(a.s.c_str(), b.s.c_str(), a.s.size(), nocase); +} + +struct Chunk { + Chunk(u32 first_id_in, u32 count_in, u32 length_in) + : first_id(first_id_in), count(count_in), length(length_in) {} + u32 first_id; //!< first id in this chunk + u32 count; //!< how many are in this chunk + u32 length; //!< how long things in the chunk are +}; + +static +vector<Chunk> assignChunks(const vector<hwlmLiteral> &lits, + const map<u32, u32> &lenCounts) { + const u32 CHUNK_MAX = 512; + const u32 MAX_CONSIDERED_LENGTH = 16; + + // TODO: detailed early stage literal analysis for v. small cases (actually + // look at lits) yes - after we factor this out and merge in the Teddy + // style of building we can look at this, although the teddy merge + // modelling is quite different. It's still probably adaptable to some + // extent for this class of problem. + + vector<Chunk> chunks; + chunks.reserve(CHUNK_MAX); + + const u32 maxPerChunk = lits.size() / + (CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1; + + u32 currentSize = 0; + u32 chunkStartID = 0; + const hwlmLiteral *last_nocase_lit = nullptr; + + for (u32 i = 0; i < lits.size() && chunks.size() < CHUNK_MAX - 1; i++) { + const auto &lit = lits[i]; + + DEBUG_PRINTF("i=%u, lit=%s%s\n", i, escapeString(lit.s).c_str(), + lit.nocase ? " (nocase)" : ""); + + // If this literal is identical to the last one (aside from differences + // in caselessness), keep going even if we will "overfill" a chunk; we + // don't want to split identical literals into different buckets.
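getScoreUtil() above prices a bucket: cost grows slightly super-linearly with the number of literals sharing it and falls with the cube of their guaranteed (shortest) length, so crowding short literals together is what the assignment tries hardest to avoid. A standalone illustration, with std::pow standing in for our_pow:

```cpp
#include <cmath>
#include <cstdio>
#include <limits>

// Mirror of the bucket cost heuristic: count^1.05 * len^-3, with empty
// literals priced as unplaceable.
static double score(unsigned len, unsigned count) {
    return len == 0 ? std::numeric_limits<double>::max()
                    : std::pow(count, 1.05) * std::pow(len, -3.0);
}

int main() {
    std::printf("len 2, 100 lits: %f\n", score(2, 100)); // ~15.7: expensive
    std::printf("len 8, 100 lits: %f\n", score(8, 100)); // ~0.25: far cheaper
    return 0;
}
```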
+ if (i != 0 && isEquivLit(lit, lits[i - 1], last_nocase_lit)) { + DEBUG_PRINTF("identical lit\n"); + goto next_literal; + } + + if ((currentSize < MAX_CONSIDERED_LENGTH && + (lit.s.size() != currentSize)) || + (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) { + currentSize = lit.s.size(); + if (!chunks.empty()) { + chunks.back().count = i - chunkStartID; } - return *p.first < *p.second; + chunkStartID = i; + chunks.emplace_back(i, 0, currentSize); + } +next_literal: + if (lit.nocase) { + last_nocase_lit = &lit; } } -private: - const vector<hwlmLiteral> &vl; -}; + assert(!chunks.empty()); + chunks.back().count = lits.size() - chunkStartID; + // close off chunks with an empty row + chunks.emplace_back(lits.size(), 0, 0); -static u64a getScoreUtil(u32 len, u32 count) { - if (len == 0) { - return (u64a)-1; +#ifdef DEBUG_ASSIGNMENT + for (size_t j = 0; j < chunks.size(); j++) { + const auto &chunk = chunks[j]; + printf("chunk %zu first_id=%u count=%u length=%u\n", j, chunk.first_id, + chunk.count, chunk.length); } - const u32 LEN_THRESH = 128; - const u32 elen = (len > LEN_THRESH) ? LEN_THRESH : len; - const u64a lenScore = - (LEN_THRESH * LEN_THRESH * LEN_THRESH) / (elen * elen * elen); - return count * lenScore; // deemphasize count - possibly more than needed - // this might be overkill in the other direction +#endif + + DEBUG_PRINTF("built %zu chunks (%zu lits)\n", chunks.size(), lits.size()); + assert(chunks.size() <= CHUNK_MAX); + return chunks; } -//#define DEBUG_ASSIGNMENT void FDRCompiler::assignStringsToBuckets() { - typedef u64a SCORE; // 'Score' type - const SCORE MAX_SCORE = (SCORE)-1; - const u32 CHUNK_MAX = 512; - const u32 BUCKET_MAX = 16; - typedef pair<SCORE, u32> SCORE_INDEX_PAIR; + const double MAX_SCORE = numeric_limits<double>::max(); - u32 ls = verify_u32(lits.size()); - assert(ls); // Shouldn't be called with no literals. + assert(!lits.empty()); // Shouldn't be called with no literals. - // make a vector that contains our literals as pointers or u32 LiteralIndex values - vector<LiteralIndex> vli; - vli.resize(ls); + // Count the number of literals for each length. map<u32, u32> lenCounts; - for (LiteralIndex l = 0; l < ls; l++) { - vli[l] = l; - lenCounts[lits[l].s.size()]++; + for (const auto &lit : lits) { + lenCounts[lit.s.size()]++; } - // sort vector by literal length + if tied on length, 'magic' criteria of some kind (tbd) - stable_sort(vli.begin(), vli.end(), LitOrder(lits)); #ifdef DEBUG_ASSIGNMENT for (const auto &m : lenCounts) { @@ -265,103 +315,94 @@ void FDRCompiler::assignStringsToBuckets() { printf("\n"); #endif - // TODO: detailed early stage literal analysis for v. small cases (actually look at lits) - // yes - after we factor this out and merge in the Teddy style of building we can look - // at this, although the teddy merge modelling is quite different.
It's still probably - adaptable to some extent for this class of problem - - u32 firstIds[CHUNK_MAX]; // how many are in this chunk (CHUNK_MAX - 1 contains 'last' bound) - u32 count[CHUNK_MAX]; // how many are in this chunk - u32 length[CHUNK_MAX]; // how long things in the chunk are - - const u32 MAX_CONSIDERED_LENGTH = 16; - u32 currentChunk = 0; - u32 currentSize = 0; - u32 chunkStartID = 0; - u32 maxPerChunk = ls/(CHUNK_MAX - MIN(MAX_CONSIDERED_LENGTH, lenCounts.size())) + 1; - - for (u32 i = 0; i < ls && currentChunk < CHUNK_MAX - 1; i++) { - LiteralIndex l = vli[i]; - if ((currentSize < MAX_CONSIDERED_LENGTH && (lits[l].s.size() != currentSize)) || - (currentSize != 1 && ((i - chunkStartID) >= maxPerChunk))) { - currentSize = lits[l].s.size(); - if (currentChunk) { - count[currentChunk - 1 ] = i - chunkStartID; - } - chunkStartID = firstIds[currentChunk] = i; - length[currentChunk] = currentSize; - currentChunk++; - } - } + // Sort literals by literal length. If tied on length, use lexicographic + // ordering (of the reversed literals). + stable_sort(lits.begin(), lits.end(), + [](const hwlmLiteral &a, const hwlmLiteral &b) { + if (a.s.size() != b.s.size()) { + return a.s.size() < b.s.size(); + } + auto p = mismatch(a.s.rbegin(), a.s.rend(), b.s.rbegin()); + if (p.first != a.s.rend()) { + return *p.first < *p.second; + } + // Sort caseless variants first. + return a.nocase > b.nocase; + }); - assert(currentChunk > 0); - count[currentChunk - 1] = ls - chunkStartID; - // close off chunks with an empty row - firstIds[currentChunk] = ls; - length[currentChunk] = 0; - count[currentChunk] = 0; - u32 nChunks = currentChunk + 1; + vector<Chunk> chunks = assignChunks(lits, lenCounts); -#ifdef DEBUG_ASSIGNMENT - for (u32 j = 0; j < nChunks; j++) { - printf("%d %d %d %d\n", j, firstIds[j], count[j], length[j]); - } -#endif + const u32 numChunks = chunks.size(); + const u32 numBuckets = eng.getNumBuckets(); - SCORE_INDEX_PAIR t[CHUNK_MAX][BUCKET_MAX]; // pair of score, index - u32 nb = eng.getNumBuckets(); + // 2D array of (score, chunk index) pairs, indexed by + // [chunk_index][bucket_index].
+ boost::multi_array, 2> t( + boost::extents[numChunks][numBuckets]); - for (u32 j = 0; j < nChunks; j++) { + for (u32 j = 0; j < numChunks; j++) { u32 cnt = 0; - for (u32 k = j; k < nChunks; ++k) { - cnt += count[k]; + for (u32 k = j; k < numChunks; ++k) { + cnt += chunks[k].count; } - t[j][0] = {getScoreUtil(length[j], cnt), 0}; + t[j][0] = {getScoreUtil(chunks[j].length, cnt), 0}; } - for (u32 i = 1; i < nb; i++) { - for (u32 j = 0; j < nChunks - 1; j++) { // don't process last, empty row - SCORE_INDEX_PAIR best = {MAX_SCORE, 0}; - u32 cnt = count[j]; - for (u32 k = j + 1; k < nChunks - 1; k++, cnt += count[k]) { - SCORE score = getScoreUtil(length[j], cnt); + for (u32 i = 1; i < numBuckets; i++) { + for (u32 j = 0; j < numChunks - 1; j++) { // don't do last, empty row + pair best = {MAX_SCORE, 0}; + u32 cnt = chunks[j].count; + for (u32 k = j + 1; k < numChunks - 1; k++) { + auto score = getScoreUtil(chunks[j].length, cnt); if (score > best.first) { - break; // if we're now worse locally than our best score, give up + break; // now worse locally than our best score, give up } score += t[k][i-1].first; if (score < best.first) { best = {score, k}; } + cnt += chunks[k].count; } t[j][i] = best; } - t[nChunks - 1][i] = {0,0}; // fill in empty final row for next iteration + t[numChunks - 1][i] = {0,0}; // fill in empty final row for next iter } #ifdef DEBUG_ASSIGNMENT - for (u32 j = 0; j < nChunks; j++) { - for (u32 i = 0; i < nb; i++) { - SCORE_INDEX_PAIR v = t[j][i]; - printf("<%7lld,%3d>", v.first, v.second); + for (u32 j = 0; j < numChunks; j++) { + printf("%03u: ", j); + for (u32 i = 0; i < numBuckets; i++) { + const auto &v = t[j][i]; + printf("<%0.3f,%3d> ", v.first, v.second); } printf("\n"); } #endif - // our best score is in best[0][N_BUCKETS-1] and we can follow the links + // our best score is in t[0][N_BUCKETS-1] and we can follow the links // to find where our buckets should start and what goes into them - for (u32 i = 0, n = nb; n && (i != nChunks - 1); n--) { + for (u32 i = 0, n = numBuckets; n && (i != numChunks - 1); n--) { u32 j = t[i][n - 1].second; if (j == 0) { - j = nChunks - 1; + j = numChunks - 1; } - // put chunks between i - j into bucket (NBUCKETS-1) - n -#ifdef DEBUG_ASSIGNMENT - printf("placing from %d to %d in bucket %d\n", firstIds[i], firstIds[j], - nb - n); -#endif - for (u32 k = firstIds[i]; k < firstIds[j]; k++) { - assignStringToBucket((LiteralIndex)vli[k], nb - n); + + // put chunks between i - j into bucket (numBuckets - n). 
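    // (Traceback sketch: t[i][n - 1].second holds the start of the next
    // group chosen when that minimum was computed, so hopping i -> j -> ...
    // walks the optimal chunk boundaries in bucket order; a stored 0 means
    // no further split was profitable, so everything up to the final empty
    // row falls into this last bucket.)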
+ u32 first_id = chunks[i].first_id; + u32 last_id = chunks[j].first_id; + assert(first_id < last_id); + u32 bucket = numBuckets - n; + UNUSED const auto &first_lit = lits[first_id]; + UNUSED const auto &last_lit = lits[last_id - 1]; + DEBUG_PRINTF("placing [%u-%u) in bucket %u (%u lits, len %zu-%zu, " + "score %0.4f)\n", + first_id, last_id, bucket, last_id - first_id, + first_lit.s.length(), last_lit.s.length(), + getScoreUtil(first_lit.s.length(), last_id - first_id)); + + auto &bucket_lits = bucketToLits[bucket]; + for (u32 k = first_id; k < last_id; k++) { + bucket_lits.push_back(k); } i = j; } @@ -487,49 +528,22 @@ void FDRCompiler::setupTab() { #endif } -aligned_unique_ptr -FDRCompiler::build(pair, size_t> &link) { +bytecode_ptr FDRCompiler::build() { assignStringsToBuckets(); setupTab(); - return setupFDR(link); + return setupFDR(); } } // namespace static -size_t maxMaskLen(const vector &lits) { - size_t rv = 0; - for (const auto &lit : lits) { - rv = max(rv, lit.msk.size()); - } - return rv; -} - -static -void setHistoryRequired(hwlmStreamingControl &stream_ctl, - const vector &lits) { - size_t max_mask_len = maxMaskLen(lits); - - // we want enough history to manage the longest literal and the longest - // mask. - stream_ctl.literal_history_required = max(maxLen(lits), max_mask_len) - 1; -} - -static -aligned_unique_ptr -fdrBuildTableInternal(const vector &lits, bool make_small, - const target_t &target, const Grey &grey, u32 hint, - hwlmStreamingControl *stream_control) { - pair, size_t> link(nullptr, 0); - - if (stream_control) { - setHistoryRequired(*stream_control, lits); - } - +bytecode_ptr fdrBuildTableInternal(const vector &lits, + bool make_small, const target_t &target, + const Grey &grey, u32 hint) { DEBUG_PRINTF("cpu has %s\n", target.has_avx2() ? "avx2" : "no-avx2"); if (grey.fdrAllowTeddy) { - auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, link); + auto fdr = teddyBuildTableHinted(lits, make_small, hint, target, grey); if (fdr) { DEBUG_PRINTF("build with teddy succeeded\n"); return fdr; @@ -538,10 +552,8 @@ fdrBuildTableInternal(const vector &lits, bool make_small, } } - const unique_ptr des = - (hint == HINT_INVALID) ? chooseEngine(target, lits, make_small) - : getFdrDescription(hint); - + auto des = (hint == HINT_INVALID) ? 
chooseEngine(target, lits, make_small) + : getFdrDescription(hint); if (!des) { return nullptr; } @@ -552,27 +564,23 @@ fdrBuildTableInternal(const vector &lits, bool make_small, des->stride = 1; } - FDRCompiler fc(lits, *des, make_small); - return fc.build(link); + FDRCompiler fc(lits, *des, make_small, grey); + return fc.build(); } -aligned_unique_ptr fdrBuildTable(const vector &lits, - bool make_small, const target_t &target, - const Grey &grey, - hwlmStreamingControl *stream_control) { - return fdrBuildTableInternal(lits, make_small, target, grey, HINT_INVALID, - stream_control); +bytecode_ptr fdrBuildTable(const vector &lits, + bool make_small, const target_t &target, + const Grey &grey) { + return fdrBuildTableInternal(lits, make_small, target, grey, HINT_INVALID); } #if !defined(RELEASE_BUILD) -aligned_unique_ptr -fdrBuildTableHinted(const vector &lits, bool make_small, u32 hint, - const target_t &target, const Grey &grey, - hwlmStreamingControl *stream_control) { - pair link(nullptr, 0); - return fdrBuildTableInternal(lits, make_small, target, grey, hint, - stream_control); +bytecode_ptr fdrBuildTableHinted(const vector &lits, + bool make_small, u32 hint, + const target_t &target, + const Grey &grey) { + return fdrBuildTableInternal(lits, make_small, target, grey, hint); } #endif diff --git a/src/fdr/fdr_compile.h b/src/fdr/fdr_compile.h index c12e00714..58047600f 100644 --- a/src/fdr/fdr_compile.h +++ b/src/fdr/fdr_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,7 @@ #define FDR_COMPILE_H #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include @@ -43,21 +43,18 @@ struct FDR; namespace ue2 { struct hwlmLiteral; -struct hwlmStreamingControl; struct Grey; struct target_t; -ue2::aligned_unique_ptr -fdrBuildTable(const std::vector &lits, bool make_small, - const target_t &target, const Grey &grey, - hwlmStreamingControl *stream_control = nullptr); +bytecode_ptr fdrBuildTable(const std::vector &lits, + bool make_small, const target_t &target, + const Grey &grey); #if !defined(RELEASE_BUILD) -ue2::aligned_unique_ptr -fdrBuildTableHinted(const std::vector &lits, bool make_small, - u32 hint, const target_t &target, const Grey &grey, - hwlmStreamingControl *stream_control = nullptr); +bytecode_ptr fdrBuildTableHinted(const std::vector &lits, + bool make_small, u32 hint, + const target_t &target, const Grey &grey); #endif diff --git a/src/fdr/fdr_compile_internal.h b/src/fdr/fdr_compile_internal.h index 48e2ed6f3..756fe8e70 100644 --- a/src/fdr/fdr_compile_internal.h +++ b/src/fdr/fdr_compile_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,7 +31,7 @@ #include "ue2common.h" #include "hwlm/hwlm_literal.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include #include @@ -55,21 +55,22 @@ typedef u32 PositionInBucket; // zero is 'we are matching right now!", class EngineDescription; class FDREngineDescription; struct hwlmStreamingControl; +struct Grey; -std::pair, size_t> setupFullMultiConfs( - const std::vector &lits, const EngineDescription &eng, - std::map> &bucketToLits, - bool 
make_small);
+bytecode_ptr<u8> setupFullConfs(const std::vector<hwlmLiteral> &lits,
+                                const EngineDescription &eng,
+                                std::map<BucketIndex, std::vector<LiteralIndex>> &bucketToLits,
+                                bool make_small);
 
 // all suffixes include an implicit max_bucket_width suffix to ensure that
 // we always read a full-scale flood "behind" us in terms of what's in our
 // state; if we don't have a flood that's long enough we won't be in the
 // right state yet to allow blindly advancing
-std::pair<aligned_unique_ptr<u8>, size_t>
-setupFDRFloodControl(const std::vector<hwlmLiteral> &lits,
-                     const EngineDescription &eng);
+bytecode_ptr<u8> setupFDRFloodControl(const std::vector<hwlmLiteral> &lits,
+                                      const EngineDescription &eng,
+                                      const Grey &grey);
 
-std::pair<aligned_unique_ptr<u8>, size_t>
+bytecode_ptr<u8>
 fdrBuildTableStreaming(const std::vector<hwlmLiteral> &lits,
                        hwlmStreamingControl &stream_control);
 
diff --git a/src/fdr/fdr_confirm_compile.cpp b/src/fdr/fdr_confirm_compile.cpp
index e77c46d1f..319141c4d 100644
--- a/src/fdr/fdr_confirm_compile.cpp
+++ b/src/fdr/fdr_confirm_compile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -45,10 +45,7 @@ using namespace std;
 
 namespace ue2 {
 
-using ConfSplitType = u8;
-using BucketSplitPair = pair<BucketIndex, ConfSplitType>;
-using BC2CONF = map<BucketSplitPair,
-                    pair<aligned_unique_ptr<FDRConfirm>, size_t>>;
+using BC2CONF = map<BucketIndex, bytecode_ptr<FDRConfirm>>;
 
 // return the number of bytes beyond a length threshold in all strings in lits
 static
@@ -150,9 +147,9 @@ void fillLitInfo(const vector<hwlmLiteral> &lits, vector<LitInfo> &tmpLitInfo,
 
 //#define FDR_CONFIRM_DUMP 1
 
-static pair<aligned_unique_ptr<FDRConfirm>, size_t>
-getFDRConfirm(const vector<hwlmLiteral> &lits, bool applyOneCharOpt,
-              bool make_small, bool make_confirm) {
+static
+bytecode_ptr<FDRConfirm> getFDRConfirm(const vector<hwlmLiteral> &lits,
+                                       bool make_small, bool make_confirm) {
     vector<LitInfo> tmpLitInfo(lits.size());
     CONF_TYPE andmsk;
     fillLitInfo(lits, tmpLitInfo, andmsk);
@@ -166,7 +163,7 @@
     if (make_small) {
         nBits = min(10U, lg2(lits.size()) + 1);
     } else {
-        nBits = min(13U, lg2(lits.size()) + 4);
+        nBits = lg2(lits.size()) + 4;
     }
 
     CONF_TYPE mult = (CONF_TYPE)0x0b4e0ef37bc32127ULL;
@@ -177,8 +174,7 @@
     u32 soleLitCmp = 0;
     u32 soleLitMsk = 0;
 
-    if ((applyOneCharOpt && lits.size() == 1 && lits[0].s.size() == 0 &&
-        lits[0].msk.empty()) || make_confirm == false) {
+    if (!make_confirm) {
         flags = FDRC_FLAG_NO_CONFIRM;
         if (lits[0].noruns) {
             flags |= NoRepeat; // messy - need to clean this up later as flags is sorta kinda obsoleted
@@ -288,7 +284,7 @@
            sizeof(LitInfo) * lits.size() + totalLitSize;
     size = ROUNDUP_N(size, alignof(FDRConfirm));
 
-    auto fdrc = aligned_zmalloc_unique<FDRConfirm>(size);
+    auto fdrc = make_zeroed_bytecode_ptr<FDRConfirm>(size);
     assert(fdrc); // otherwise would have thrown std::bad_alloc
 
     fdrc->andmsk = andmsk;
@@ -322,32 +318,15 @@
         LiteralIndex litIdx = *i;
 
         // Write LitInfo header.
-        u8 *oldPtr = ptr;
         LitInfo &finalLI = *(LitInfo *)ptr;
         finalLI = tmpLitInfo[litIdx];
 
         ptr += sizeof(LitInfo); // String starts directly after LitInfo.
-
-        // Write literal prefix (everything before the last N characters,
-        // as the last N are already confirmed).
- const string &t = lits[litIdx].s; - if (t.size() > sizeof(CONF_TYPE)) { - size_t prefix_len = t.size() - sizeof(CONF_TYPE); - memcpy(ptr, t.c_str(), prefix_len); - ptr += prefix_len; - } - - ptr = ROUNDUP_PTR(ptr, alignof(LitInfo)); + assert(lits[litIdx].s.size() <= sizeof(CONF_TYPE)); if (next(i) == e) { finalLI.next = 0; } else { - // our next field represents an adjustment on top of - // current address + the actual size of the literal - // so we track any rounding up done for alignment and - // add this in - that way we don't have to use bigger - // than a u8 (for now) - assert((size_t)(ptr - oldPtr) > t.size()); - finalLI.next = verify_u8(ptr - oldPtr - t.size()); + finalLI.next = 1; } } assert((size_t)(ptr - fdrc_base) <= size); @@ -358,19 +337,16 @@ getFDRConfirm(const vector &lits, bool applyOneCharOpt, size_t actual_size = ROUNDUP_N((size_t)(ptr - fdrc_base), alignof(FDRConfirm)); assert(actual_size <= size); + fdrc.shrink(actual_size); - return {move(fdrc), actual_size}; + return fdrc; } -static -u32 setupMultiConfirms(const vector &lits, - const EngineDescription &eng, BC2CONF &bc2Conf, - map > &bucketToLits, - bool make_small) { - u32 pullBack = eng.getConfirmPullBackDistance(); - u32 splitMask = eng.getConfirmTopLevelSplit() - 1; - bool splitHasCase = splitMask & 0x20; - +bytecode_ptr +setupFullConfs(const vector &lits, + const EngineDescription &eng, + map> &bucketToLits, + bool make_small) { bool makeConfirm = true; unique_ptr teddyDescr = getTeddyDescription(eng.getID()); @@ -378,101 +354,43 @@ u32 setupMultiConfirms(const vector &lits, makeConfirm = teddyDescr->needConfirm(lits); } + BC2CONF bc2Conf; u32 totalConfirmSize = 0; for (BucketIndex b = 0; b < eng.getNumBuckets(); b++) { if (!bucketToLits[b].empty()) { - vector> vl(eng.getConfirmTopLevelSplit()); + vector vl; for (const LiteralIndex &lit_idx : bucketToLits[b]) { - hwlmLiteral lit = lits[lit_idx]; // copy - // c is last char of this literal - u8 c = *(lit.s.rbegin()); - - bool suppressSplit = false; - if (pullBack) { - // make a shorter string to work over if we're pulling back - // getFDRConfirm doesn't know about that stuff - assert(lit.s.size() >= pullBack); - lit.s.resize(lit.s.size() - pullBack); - - u8 c_sub, c_sub_msk; - if (lit.msk.empty()) { - c_sub = 0; - c_sub_msk = 0; - } else { - c_sub = *(lit.cmp.rbegin()); - c_sub_msk = *(lit.msk.rbegin()); - size_t len = lit.msk.size() - - min(lit.msk.size(), (size_t)pullBack); - lit.msk.resize(len); - lit.cmp.resize(len); - } - - // if c_sub_msk is 0xff and lit.nocase - // resteer 'c' to an exact value and set suppressSplit - if ((c_sub_msk == 0xff) && (lit.nocase)) { - suppressSplit = true; - c = c_sub; - } - } - - if (!suppressSplit && splitHasCase && lit.nocase && - ourisalpha(c)) { - vl[(u8)(mytoupper(c) & splitMask)].push_back(lit); - vl[(u8)(mytolower(c) & splitMask)].push_back(lit); - } else { - vl[c & splitMask].push_back(lit); - } + vl.push_back(lits[lit_idx]); } - for (u32 c = 0; c < eng.getConfirmTopLevelSplit(); c++) { - if (vl[c].empty()) { - continue; - } - DEBUG_PRINTF("b %d c %02x sz %zu\n", b, c, vl[c].size()); - auto key = make_pair(b, c); - auto fc = getFDRConfirm(vl[c], eng.typicallyHoldsOneCharLits(), - make_small, makeConfirm); - totalConfirmSize += fc.second; - assert(bc2Conf.find(key) == end(bc2Conf)); - bc2Conf.emplace(key, move(fc)); - } + DEBUG_PRINTF("b %d sz %zu\n", b, vl.size()); + auto fc = getFDRConfirm(vl, make_small, makeConfirm); + totalConfirmSize += fc.size(); + bc2Conf.emplace(b, move(fc)); } } - return totalConfirmSize; -} 
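// (For context on what getFDRConfirm above builds: at scan time FDR's first
// stage hands back a 64-bit conf value; a multiplicative hash over the
// andmsk/mult/nBits fields set up above selects a LitInfo chain, and since
// every literal now fits in sizeof(CONF_TYPE) == 8 bytes, chain entries are
// fixed-size and can be walked with li++ rather than variable strides. A
// sketch of the hash, assuming the CONF_HASH_CALL form in fdr_confirm.h:
//
//   static inline u32 confSlot(CONF_TYPE v, CONF_TYPE andmsk,
//                              CONF_TYPE mult, u32 nBits) {
//       return (u32)(((v & andmsk) * mult) >> (64 - nBits));
//   }
//
// Sizing the table with nBits = lg2(lits.size()) + 4 gives roughly 16
// slots per literal, keeping the chains short.)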
- -pair, size_t> -setupFullMultiConfs(const vector &lits, - const EngineDescription &eng, - map> &bucketToLits, - bool make_small) { - BC2CONF bc2Conf; - u32 totalConfirmSize = setupMultiConfirms(lits, eng, bc2Conf, bucketToLits, - make_small); - u32 primarySwitch = eng.getConfirmTopLevelSplit(); u32 nBuckets = eng.getNumBuckets(); - u32 totalConfSwitchSize = primarySwitch * nBuckets * sizeof(u32); + u32 totalConfSwitchSize = nBuckets * sizeof(u32); u32 totalSize = ROUNDUP_16(totalConfSwitchSize + totalConfirmSize); - auto buf = aligned_zmalloc_unique(totalSize); + auto buf = make_zeroed_bytecode_ptr(totalSize, 16); assert(buf); // otherwise would have thrown std::bad_alloc u32 *confBase = (u32 *)buf.get(); u8 *ptr = buf.get() + totalConfSwitchSize; for (const auto &m : bc2Conf) { - const BucketIndex &b = m.first.first; - const u8 &c = m.first.second; - const pair, size_t> &p = m.second; + const BucketIndex &idx = m.first; + const bytecode_ptr &p = m.second; // confirm offset is relative to the base of this structure, now u32 confirm_offset = verify_u32(ptr - buf.get()); - memcpy(ptr, p.first.get(), p.second); - ptr += p.second; - u32 idx = c * nBuckets + b; + memcpy(ptr, p.get(), p.size()); + ptr += p.size(); confBase[idx] = confirm_offset; } - return {move(buf), totalSize}; + + return buf; } } // namespace ue2 diff --git a/src/fdr/fdr_confirm_runtime.h b/src/fdr/fdr_confirm_runtime.h index 87ade9fea..a0603c929 100644 --- a/src/fdr/fdr_confirm_runtime.h +++ b/src/fdr/fdr_confirm_runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,8 +40,8 @@ // the whole confirmation procedure static really_inline void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a, - size_t i, u32 pullBackAmount, hwlmcb_rv_t *control, - u32 *last_match, u64a conf_key) { + size_t i, hwlmcb_rv_t *control, u32 *last_match, + u64a conf_key) { assert(i < a->len); assert(ISALIGNED(fdrc)); @@ -68,13 +68,10 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a goto out; } - const u8 *loc = buf + i - li->size + 1 - pullBackAmount; + const u8 *loc = buf + i - li->size + 1; - u8 caseless = li->flags & Caseless; if (loc < buf) { u32 full_overhang = buf - loc; - - const u8 *history = a->buf_history; size_t len_history = a->len_history; // can't do a vectored confirm either if we don't have @@ -82,44 +79,15 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a if (full_overhang > len_history) { goto out; } - - // as for the regular case, no need to do a full confirm if - // we're a short literal - if (unlikely(li->size > sizeof(CONF_TYPE))) { - const u8 *s1 = (const u8 *)li + sizeof(*li); - const u8 *s2 = s1 + full_overhang; - const u8 *loc1 = history + len_history - full_overhang; - const u8 *loc2 = buf; - size_t size1 = MIN(full_overhang, li->size - sizeof(CONF_TYPE)); - size_t wind_size2_back = sizeof(CONF_TYPE) + full_overhang; - size_t size2 = wind_size2_back > li->size ? 
- 0 : li->size - wind_size2_back; - - if (cmpForward(loc1, s1, size1, caseless)) { - goto out; - } - if (cmpForward(loc2, s2, size2, caseless)) { - goto out; - } - } - } else { // NON-VECTORING PATH - - // if string < conf_type we don't need regular string cmp - if (unlikely(li->size > sizeof(CONF_TYPE))) { - const u8 *s = (const u8 *)li + sizeof(*li); - if (cmpForward(loc, s, li->size - sizeof(CONF_TYPE), - caseless)) { - goto out; - } - } } + assert(li->size <= sizeof(CONF_TYPE)); if (unlikely(!(li->groups & *control))) { goto out; } if (unlikely(li->flags & ComplexConfirm)) { - const u8 *loc2 = buf + i - li->extended_size + 1 - pullBackAmount; + const u8 *loc2 = buf + i - li->extended_size + 1; if (loc2 < buf) { u32 full_overhang = buf - loc2; size_t len_history = a->len_history; @@ -133,7 +101,7 @@ void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a *control = a->cb(loc - buf, i, li->id, a->ctxt); out: oldNext = li->next; // oldNext is either 0 or an 'adjust' value - li = (const struct LitInfo *)((const u8 *)li + oldNext + li->size); + li++; } while (oldNext); } @@ -148,7 +116,7 @@ void confWithBit1(const struct FDRConfirm *fdrc, assert(ISALIGNED(fdrc)); if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, 0, control, last_match, conf_key); + confWithBit(fdrc, a, i, control, last_match, conf_key); return; } else { u32 id = fdrc->nBitsOrSoleID; @@ -176,7 +144,7 @@ void confWithBitMany(const struct FDRConfirm *fdrc, } if (unlikely(fdrc->mult)) { - confWithBit(fdrc, a, i, 0, control, last_match, conf_key); + confWithBit(fdrc, a, i, control, last_match, conf_key); return; } else { const u32 id = fdrc->nBitsOrSoleID; diff --git a/src/fdr/fdr_engine_description.cpp b/src/fdr/fdr_engine_description.cpp index 5e923b08f..2f9ba420c 100644 --- a/src/fdr/fdr_engine_description.cpp +++ b/src/fdr/fdr_engine_description.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,8 +44,7 @@ namespace ue2 { FDREngineDescription::FDREngineDescription(const FDREngineDef &def) : EngineDescription(def.id, targetByArchFeatures(def.cpu_features), - def.numBuckets, def.confirmPullBackDistance, - def.confirmTopLevelSplit), + def.numBuckets), schemeWidth(def.schemeWidth), stride(0), bits(0) {} u32 FDREngineDescription::getDefaultFloodSuffixLength() const { @@ -55,7 +54,7 @@ u32 FDREngineDescription::getDefaultFloodSuffixLength() const { } void getFdrDescriptions(vector *out) { - static const FDREngineDef def = {0, 128, 8, 0, 1, 256}; + static const FDREngineDef def = {0, 64, 8, 0}; out->clear(); out->emplace_back(def); } diff --git a/src/fdr/fdr_engine_description.h b/src/fdr/fdr_engine_description.h index d4e70d4b1..09c5ce867 100644 --- a/src/fdr/fdr_engine_description.h +++ b/src/fdr/fdr_engine_description.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,8 +43,6 @@ struct FDREngineDef { u32 schemeWidth; u32 numBuckets; u64a cpu_features; - u32 confirmPullBackDistance; - u32 confirmTopLevelSplit; }; class FDREngineDescription : public EngineDescription { @@ -64,7 +62,6 @@ class FDREngineDescription : public EngineDescription { explicit 
FDREngineDescription(const FDREngineDef &def); u32 getDefaultFloodSuffixLength() const override; - bool typicallyHoldsOneCharLits() const override { return stride == 1; } }; std::unique_ptr diff --git a/src/fdr/fdr_internal.h b/src/fdr/fdr_internal.h index 3bf828377..a425d78c8 100644 --- a/src/fdr/fdr_internal.h +++ b/src/fdr/fdr_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -71,11 +71,6 @@ struct FDR { u32 maxStringLen; u32 floodOffset; - /** link is the relative offset of a secondary included FDR table for - * stream handling if we're a primary FDR table or the subsidiary tertiary - * structures (spillover strings and hash table) if we're a secondary - * structure. */ - u32 link; u8 stride; /* stride - how frequeuntly the data is consulted by the first * stage matcher */ u8 domain; /* number of bits used to index into main FDR table. This value diff --git a/src/fdr/flood_compile.cpp b/src/fdr/flood_compile.cpp index 62693c300..7dcc17d18 100644 --- a/src/fdr/flood_compile.cpp +++ b/src/fdr/flood_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ #include "fdr_confirm.h" #include "fdr_compile_internal.h" #include "fdr_engine_description.h" +#include "grey.h" #include "ue2common.h" #include "util/alloc.h" #include "util/bitutils.h" @@ -90,9 +91,9 @@ void addFlood(vector &tmpFlood, u8 c, const hwlmLiteral &lit, } } -pair, size_t> -setupFDRFloodControl(const vector &lits, - const EngineDescription &eng) { +bytecode_ptr setupFDRFloodControl(const vector &lits, + const EngineDescription &eng, + const Grey &grey) { vector tmpFlood(N_CHARS); u32 default_suffix = eng.getDefaultFloodSuffixLength(); @@ -187,6 +188,14 @@ setupFDRFloodControl(const vector &lits, } #endif + // If flood detection has been switched off in the grey box, we comply by + // setting idCount too high for all floods. 
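    // (Presumably the runtime refuses to fast-path any flood whose idCount
    // exceeds the limit it can service, so saturating every entry simply
    // makes floodDetect fall through to the ordinary scan loop: match
    // results are unchanged, only the flood shortcut is disabled.)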
+ if (!grey.fdrAllowFlood) { + for (auto &fl : tmpFlood) { + fl.idCount = FDR_FLOOD_MAX_IDS; + } + } + map flood2chars; for (u32 i = 0; i < N_CHARS; i++) { FDRFlood fl = tmpFlood[i]; @@ -198,7 +207,7 @@ setupFDRFloodControl(const vector &lits, size_t floodStructSize = sizeof(FDRFlood) * nDistinctFloods; size_t totalSize = ROUNDUP_16(floodHeaderSize + floodStructSize); - auto buf = aligned_zmalloc_unique(totalSize); + auto buf = make_zeroed_bytecode_ptr(totalSize, 16); assert(buf); // otherwise would have thrown std::bad_alloc u32 *floodHeader = (u32 *)buf.get(); @@ -218,7 +227,7 @@ setupFDRFloodControl(const vector &lits, DEBUG_PRINTF("made a flood structure with %zu + %zu = %zu\n", floodHeaderSize, floodStructSize, totalSize); - return {move(buf), totalSize}; + return buf; } } // namespace ue2 diff --git a/src/fdr/flood_runtime.h b/src/fdr/flood_runtime.h index 97723be54..d3f6b3b29 100644 --- a/src/fdr/flood_runtime.h +++ b/src/fdr/flood_runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -100,7 +100,7 @@ const u8 * floodDetect(const struct FDR * fdr, // tryFloodDetect is never put in places where unconditional // reads a short distance forward or backward here // TODO: rationale for this line needs to be rediscovered!! - size_t mainLoopLen = len > iterBytes ? len - iterBytes : 0; + size_t mainLoopLen = len > 2 * iterBytes ? len - 2 * iterBytes : 0; const u32 i = ptr - buf; u32 j = i; diff --git a/src/fdr/teddy.c b/src/fdr/teddy.c index e7a0fccde..a3f7cfaf4 100644 --- a/src/fdr/teddy.c +++ b/src/fdr/teddy.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -129,7 +129,8 @@ m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { m128 mask = set16x8(0xf); m128 lo = and128(val, mask); m128 hi = and128(rshift64_m128(val, 4), mask); - return and128(pshufb(maskBase[0*2], lo), pshufb(maskBase[0*2+1], hi)); + return and128(pshufb_m128(maskBase[0 * 2], lo), + pshufb_m128(maskBase[0 * 2 + 1], hi)); } static really_inline @@ -139,8 +140,8 @@ m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m1(maskBase, val); - m128 res_1 = and128(pshufb(maskBase[1*2], lo), - pshufb(maskBase[1*2+1], hi)); + m128 res_1 = and128(pshufb_m128(maskBase[1*2], lo), + pshufb_m128(maskBase[1*2+1], hi)); m128 res_shifted_1 = palignr(res_1, *old_1, 16-1); *old_1 = res_1; return and128(r, res_shifted_1); @@ -154,8 +155,8 @@ m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m2(maskBase, old_1, val); - m128 res_2 = and128(pshufb(maskBase[2*2], lo), - pshufb(maskBase[2*2+1], hi)); + m128 res_2 = and128(pshufb_m128(maskBase[2*2], lo), + pshufb_m128(maskBase[2*2+1], hi)); m128 res_shifted_2 = palignr(res_2, *old_2, 16-2); *old_2 = res_2; return and128(r, res_shifted_2); @@ -169,8 +170,8 @@ m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, m128 hi = and128(rshift64_m128(val, 4), mask); m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); - m128 res_3 = and128(pshufb(maskBase[3*2], lo), - 
pshufb(maskBase[3*2+1], hi)); + m128 res_3 = and128(pshufb_m128(maskBase[3*2], lo), + pshufb_m128(maskBase[3*2+1], hi)); m128 res_shifted_3 = palignr(res_3, *old_3, 16-3); *old_3 = res_3; return and128(r, res_shifted_3); diff --git a/src/fdr/teddy.h b/src/fdr/teddy.h index e2936723a..35756c530 100644 --- a/src/fdr/teddy.h +++ b/src/fdr/teddy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #define TEDDY_H_ #include "hwlm/hwlm.h" // for hwlm_group_t +#include "util/arch.h" struct FDR; // forward declaration from fdr_internal.h struct FDR_Runtime_Args; @@ -70,7 +71,7 @@ hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control); -#if defined(__AVX2__) +#if defined(HAVE_AVX2) hwlm_error_t fdr_exec_teddy_avx2_msks1_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, @@ -104,15 +105,6 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, const struct FDR_Runtime_Args *a, hwlm_group_t control); -hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); - -hwlm_error_t -fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control); - -#endif /* __AVX2__ */ +#endif /* HAVE_AVX2 */ #endif /* TEDDY_H_ */ diff --git a/src/fdr/teddy_avx2.c b/src/fdr/teddy_avx2.c index e4a836d47..299825cc4 100644 --- a/src/fdr/teddy_avx2.c +++ b/src/fdr/teddy_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,78 +35,10 @@ #include "teddy.h" #include "teddy_internal.h" #include "teddy_runtime_common.h" +#include "util/arch.h" #include "util/simd_utils.h" -#if defined(__AVX2__) - -static const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00}, - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff} -}; +#if defined(HAVE_AVX2) #ifdef ARCH_64_BIT #define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ @@ -199,22 +131,6 @@ do { \ } while (0); #endif -#define CONFIRM_FAST_TEDDY(var, offset, reason, conf_fn) \ -do { \ - if (unlikely(isnonzero256(var))) { \ - u32 arrCnt = 0; \ - m128 lo = cast256to128(var); \ - m128 hi = movdq_hi(var); \ - bit_array_fast_teddy(lo, bitArr, &arrCnt, offset); \ - bit_array_fast_teddy(hi, bitArr, &arrCnt, offset + 2); \ - for (u32 i = 0; i < arrCnt; i++) { \ - conf_fn(bitArr[i], confBase, reason, a, ptr, &control, \ - &last_match); \ - CHECK_HWLM_TERMINATE_MATCHING; \ - } \ - } \ -} while (0); - static really_inline m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, const u8 *buf_history, size_t len_history, @@ -226,193 +142,13 @@ m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, return ret; } -/* - * \brief Copy a block of [0,31] bytes efficiently. - * - * This function is a workaround intended to stop some compilers from - * synthesizing a memcpy function call out of the copy of a small number of - * bytes that we do in vectoredLoad128. - */ -static really_inline -void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { - switch (len) { - case 0: - break; - case 1: - *dst = *src; - break; - case 2: - unaligned_store_u16(dst, unaligned_load_u16(src)); - break; - case 3: - unaligned_store_u16(dst, unaligned_load_u16(src)); - dst[2] = src[2]; - break; - case 4: - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 5: - case 6: - case 7: - /* Perform copy with two overlapping 4-byte chunks. */ - unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); - unaligned_store_u32(dst, unaligned_load_u32(src)); - break; - case 8: - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - case 9: - case 10: - case 11: - case 12: - case 13: - case 14: - case 15: - /* Perform copy with two overlapping 8-byte chunks. */ - unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); - unaligned_store_u64a(dst, unaligned_load_u64a(src)); - break; - case 16: - storeu128(dst, loadu128(src)); - break; - default: - /* Perform copy with two overlapping 16-byte chunks. 
*/ - assert(len < 32); - storeu128(dst + len - 16, loadu128(src + len - 16)); - storeu128(dst, loadu128(src)); - break; - } -} - -static really_inline -m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const u8 *lo, const u8 *hi, - const u8 *buf_history, size_t len_history) { - union { - u8 val8[32]; - m256 val256; - } u; - - uintptr_t copy_start; - uintptr_t copy_len; - - if (ptr >= lo) { - uintptr_t avail = (uintptr_t)(hi - ptr); - if (avail >= 32) { - *p_mask = load256(p_mask_arr256[32] + 32); - return loadu256(ptr); - } - *p_mask = load256(p_mask_arr256[avail] + 32); - copy_start = 0; - copy_len = avail; - } else { - // need contains "how many chars to pull from history" - // calculate based on what we need, what we have in the buffer - // and only what we need to make primary confirm work - uintptr_t start = (uintptr_t)(lo - ptr); - uintptr_t i; - for (i = start; ptr + i < lo; i++) { - u.val8[i] = buf_history[len_history - (lo - (ptr + i))]; - } - uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); - *p_mask = loadu256(p_mask_arr256[end - start] + 32 - start); - copy_start = i; - copy_len = end - i; - } - - // Runt block from the buffer. - copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); - - return u.val256; -} - -static really_inline -void do_confWithBit1_fast_teddy(u16 bits, const u32 *confBase, - CautionReason reason, - const struct FDR_Runtime_Args *a, - const u8 *ptr, hwlmcb_rv_t *control, - u32 *last_match) { - u32 byte = bits / 8; - u32 cf = confBase[bits % 8]; - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit1(fdrc, a, ptr - a->buf + byte, control, last_match, confVal); -} - -static really_inline -void do_confWithBit_fast_teddy(u16 bits, const u32 *confBase, - CautionReason reason, - const struct FDR_Runtime_Args *a, const u8 *ptr, - hwlmcb_rv_t *control, u32 *last_match) { - u32 byte = bits / 8; - u32 bitRem = bits % 8; - u32 confSplit = *(ptr+byte) & 0x1f; - u32 idx = confSplit * 8 + bitRem; - u32 cf = confBase[idx]; - if (!cf) { - return; - } - const struct FDRConfirm *fdrc = (const struct FDRConfirm *) - ((const u8 *)confBase + cf); - if (!(fdrc->groups & *control)) { - return; - } - u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit(fdrc, a, ptr - a->buf + byte, 0, control, last_match, confVal); -} - -static really_inline -void bit_array_fast_teddy(m128 var, u16 *bitArr, u32 *arrCnt, u32 offset) { - if (unlikely(isnonzero128(var))) { -#ifdef ARCH_64_BIT - u64a part_0 = movq(var); - while (unlikely(part_0)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) + - 64 * (offset); - *arrCnt += 1; - } - u64a part_1 = movq(rshiftbyte_m128(var, 8)); - while (unlikely(part_1)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + - 64 * (offset + 1); - *arrCnt += 1; - } -#else - u32 part_0 = movd(var); - while (unlikely(part_0)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_0) + - 32 * (offset * 2); - *arrCnt += 1; - } - u32 part_1 = movd(rshiftbyte_m128(var, 4)); - while (unlikely(part_1)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_1) + - 32 * (offset * 2 + 1); - *arrCnt += 1; - } - u32 part_2 = movd(rshiftbyte_m128(var, 8)); - while (unlikely(part_2)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_2) + - 32 * (offset * 2 + 2); - *arrCnt += 1; - } - u32 part_3 = movd(rshiftbyte_m128(var, 12)); - while (unlikely(part_3)) { - bitArr[*arrCnt] = (u16) TEDDY_FIND_AND_CLEAR_LSB(&part_3) + - 32 
* (offset * 2 + 3); - *arrCnt += 1; - } -#endif - } -} - static really_inline m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { m256 mask = set32x8(0xf); m256 lo = and256(val, mask); m256 hi = and256(rshift64_m256(val, 4), mask); - return and256(vpshufb(maskBase[0*2], lo), - vpshufb(maskBase[0*2+1], hi)); + return and256(pshufb_m256(maskBase[0*2], lo), + pshufb_m256(maskBase[0*2+1], hi)); } static really_inline @@ -422,8 +158,8 @@ m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m1(maskBase, val); - m256 res_1 = and256(vpshufb(maskBase[1*2], lo), - vpshufb(maskBase[1*2+1], hi)); + m256 res_1 = and256(pshufb_m256(maskBase[1*2], lo), + pshufb_m256(maskBase[1*2+1], hi)); m256 res_shifted_1 = vpalignr(res_1, *old_1, 16-1); *old_1 = res_1; return and256(r, res_shifted_1); @@ -437,8 +173,8 @@ m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); - m256 res_2 = and256(vpshufb(maskBase[2*2], lo), - vpshufb(maskBase[2*2+1], hi)); + m256 res_2 = and256(pshufb_m256(maskBase[2*2], lo), + pshufb_m256(maskBase[2*2+1], hi)); m256 res_shifted_2 = vpalignr(res_2, *old_2, 16-2); *old_2 = res_2; return and256(r, res_shifted_2); @@ -452,20 +188,13 @@ m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, m256 hi = and256(rshift64_m256(val, 4), mask); m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); - m256 res_3 = and256(vpshufb(maskBase[3*2], lo), - vpshufb(maskBase[3*2+1], hi)); + m256 res_3 = and256(pshufb_m256(maskBase[3*2], lo), + pshufb_m256(maskBase[3*2+1], hi)); m256 res_shifted_3 = vpalignr(res_3, *old_3, 16-3); *old_3 = res_3; return and256(r, res_shifted_3); } -static really_inline -m256 prep_conf_fast_teddy_m1(m256 val, m256 mask, m256 maskLo, m256 maskHi) { - m256 lo = and256(val, mask); - m256 hi = and256(rshift64_m256(val, 4), mask); - return and256(vpshufb(maskLo, lo), vpshufb(maskHi, hi)); -} - static really_inline const m256 * getMaskBase_avx2(const struct Teddy *teddy) { return (const m256 *)((const u8 *)teddy + sizeof(struct Teddy)); @@ -959,136 +688,4 @@ hwlm_error_t fdr_exec_teddy_avx2_msks4_pck_fat(const struct FDR *fdr, return HWLM_SUCCESS; } -hwlm_error_t fdr_exec_teddy_avx2_msks1_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 64; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 1); - - const m256 maskLo = set2x128(maskBase[0]); - const m256 maskHi = set2x128(maskBase[1]); - const m256 mask = set32x8(0xf); - u16 bitArr[512]; - - const u8 *mainStart = ROUNDUP_PTR(ptr, 32); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 32; - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, 
do_confWithBit1_fast_teddy); - ptr += 32; - } - - if (ptr + 32 < buf_end) { - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); - ptr += 32; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); - - m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit1_fast_teddy); - } - - for (; ptr < buf_end; ptr += 32) { - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit1_fast_teddy); - } - - return HWLM_SUCCESS; -} - -hwlm_error_t fdr_exec_teddy_avx2_msks1_pck_fast(const struct FDR *fdr, - const struct FDR_Runtime_Args *a, - hwlm_group_t control) { - const u8 *buf_end = a->buf + a->len; - const u8 *ptr = a->buf + a->start_offset; - u32 floodBackoff = FLOOD_BACKOFF_START; - const u8 *tryFloodDetect = a->firstFloodDetect; - u32 last_match = (u32)-1; - const struct Teddy *teddy = (const struct Teddy *)fdr; - const size_t iterBytes = 64; - DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", - a->buf, a->len, a->start_offset); - - const m128 *maskBase = getMaskBase(teddy); - const u32 *confBase = getConfBase(teddy, 1); - - const m256 maskLo = set2x128(maskBase[0]); - const m256 maskHi = set2x128(maskBase[1]); - const m256 mask = set32x8(0xf); - u16 bitArr[512]; - - const u8 *mainStart = ROUNDUP_PTR(ptr, 32); - DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); - if (ptr < mainStart) { - ptr = mainStart - 32; - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); - ptr += 32; - } - - if (ptr + 32 < buf_end) { - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); - ptr += 32; - } - - for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { - __builtin_prefetch(ptr + (iterBytes*4)); - CHECK_FLOOD; - - m256 val_0 = load256(ptr + 0); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_0, 0, NOT_CAUTIOUS, do_confWithBit_fast_teddy); - - m256 val_1 = load256(ptr + 32); - m256 res_1 = prep_conf_fast_teddy_m1(val_1, mask, maskLo, maskHi); - CONFIRM_FAST_TEDDY(res_1, 4, NOT_CAUTIOUS, do_confWithBit_fast_teddy); - } - - for (; ptr < buf_end; ptr += 32) { - m256 p_mask; - m256 val_0 = vectoredLoad256(&p_mask, ptr, a->buf + a->start_offset, - buf_end, a->buf_history, a->len_history); - m256 res_0 = prep_conf_fast_teddy_m1(val_0, mask, maskLo, maskHi); - res_0 = and256(res_0, p_mask); - CONFIRM_FAST_TEDDY(res_0, 0, VECTORING, do_confWithBit_fast_teddy); - } - - return HWLM_SUCCESS; -} - -#endif // __AVX2__ +#endif // HAVE_AVX2 diff --git a/src/fdr/teddy_compile.cpp b/src/fdr/teddy_compile.cpp index 
15b9665bb..6f956e8cb 100644
--- a/src/fdr/teddy_compile.cpp
+++ b/src/fdr/teddy_compile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -26,22 +26,29 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */
 
+/**
+ * \file
+ * \brief FDR literal matcher: Teddy build code.
+ */
+
+#include "teddy_compile.h"
+
 #include "fdr.h"
 #include "fdr_internal.h"
 #include "fdr_compile_internal.h"
 #include "fdr_confirm.h"
 #include "fdr_engine_description.h"
+#include "teddy_internal.h"
+#include "teddy_engine_description.h"
+#include "grey.h"
 #include "ue2common.h"
 #include "util/alloc.h"
 #include "util/compare.h"
+#include "util/noncopyable.h"
 #include "util/popcount.h"
 #include "util/target_info.h"
 #include "util/verify_types.h"
 
-#include "teddy_compile.h"
-#include "teddy_internal.h"
-#include "teddy_engine_description.h"
-
 #include
 #include
 #include
@@ -54,8 +61,6 @@
 #include
 #include
 
-#include <boost/core/noncopyable.hpp>
-
 using namespace std;
 
 namespace ue2 {
@@ -64,17 +69,20 @@ namespace {
 
 //#define TEDDY_DEBUG
 
-class TeddyCompiler : boost::noncopyable {
+class TeddyCompiler : noncopyable {
     const TeddyEngineDescription &eng;
+    const Grey &grey;
     const vector<hwlmLiteral> &lits;
     bool make_small;
 
public:
     TeddyCompiler(const vector<hwlmLiteral> &lits_in,
-                  const TeddyEngineDescription &eng_in, bool make_small_in)
-        : eng(eng_in), lits(lits_in), make_small(make_small_in) {}
+                  const TeddyEngineDescription &eng_in, bool make_small_in,
+                  const Grey &grey_in)
+        : eng(eng_in), grey(grey_in), lits(lits_in), make_small(make_small_in) {
+    }
 
-    aligned_unique_ptr<FDR> build(pair<aligned_unique_ptr<u8>, size_t> &link);
+    bytecode_ptr<FDR> build();
     bool pack(map<BucketIndex, vector<LiteralIndex> > &bucketToLits);
 };
 
@@ -274,8 +282,7 @@ bool TeddyCompiler::pack(map<BucketIndex,
 
-aligned_unique_ptr<FDR>
-TeddyCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
+bytecode_ptr<FDR> TeddyCompiler::build() {
     if (lits.size() > eng.getNumBuckets() * TEDDY_BUCKET_LOAD) {
         DEBUG_PRINTF("too many literals: %zu\n", lits.size());
         return nullptr;
@@ -308,16 +315,16 @@ TeddyCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
 
     size_t maskLen = eng.numMasks * 16 * 2 * maskWidth;
 
-    auto floodControlTmp = setupFDRFloodControl(lits, eng);
-    auto confirmTmp = setupFullMultiConfs(lits, eng, bucketToLits, make_small);
+    auto floodControlTmp = setupFDRFloodControl(lits, eng, grey);
+    auto confirmTmp = setupFullConfs(lits, eng, bucketToLits, make_small);
 
     size_t size = ROUNDUP_N(sizeof(Teddy) +
-                            maskLen +
-                            confirmTmp.second +
-                            floodControlTmp.second +
-                            link.second, 16 * maskWidth);
+                            maskLen +
+                            confirmTmp.size() +
+                            floodControlTmp.size(),
+                            16 * maskWidth);
 
-    aligned_unique_ptr<FDR> fdr = aligned_zmalloc_unique<FDR>(size);
+    auto fdr = make_zeroed_bytecode_ptr<FDR>(size, 64);
     assert(fdr); // otherwise would have thrown std::bad_alloc
     Teddy *teddy = (Teddy *)fdr.get(); // ugly
     u8 *teddy_base = (u8 *)teddy;
@@ -327,19 +334,12 @@ TeddyCompiler::build(pair<aligned_unique_ptr<u8>, size_t> &link) {
     teddy->maxStringLen = verify_u32(maxLen(lits));
 
     u8 *ptr = teddy_base + sizeof(Teddy) + maskLen;
-    memcpy(ptr, confirmTmp.first.get(), confirmTmp.second);
-    ptr += confirmTmp.second;
+    memcpy(ptr, confirmTmp.get(), confirmTmp.size());
+    ptr += confirmTmp.size();
 
     teddy->floodOffset = verify_u32(ptr - teddy_base);
-    memcpy(ptr, floodControlTmp.first.get(), floodControlTmp.second);
-    ptr += floodControlTmp.second;
-
-    if (link.first) {
-        teddy->link = verify_u32(ptr - teddy_base);
-        memcpy(ptr, link.first.get(), link.second);
-    } else {
-        teddy->link = 0;
-    }
+    memcpy(ptr,
floodControlTmp.get(), floodControlTmp.size()); + ptr += floodControlTmp.size(); u8 *baseMsk = teddy_base + sizeof(Teddy); @@ -423,10 +423,10 @@ TeddyCompiler::build(pair, size_t> &link) { } // namespace -aligned_unique_ptr -teddyBuildTableHinted(const vector &lits, bool make_small, - u32 hint, const target_t &target, - pair, size_t> &link) { +bytecode_ptr teddyBuildTableHinted(const vector &lits, + bool make_small, u32 hint, + const target_t &target, + const Grey &grey) { unique_ptr des; if (hint == HINT_INVALID) { des = chooseTeddyEngine(target, lits); @@ -436,8 +436,8 @@ teddyBuildTableHinted(const vector &lits, bool make_small, if (!des) { return nullptr; } - TeddyCompiler tc(lits, *des, make_small); - return tc.build(link); + TeddyCompiler tc(lits, *des, make_small, grey); + return tc.build(); } } // namespace ue2 diff --git a/src/fdr/teddy_compile.h b/src/fdr/teddy_compile.h index 276c1347b..5ff4d8394 100644 --- a/src/fdr/teddy_compile.h +++ b/src/fdr/teddy_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief FDR literal matcher: Teddy build API. */ @@ -34,22 +35,22 @@ #define TEDDY_COMPILE_H #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include -#include // std::pair struct FDR; -struct target_t; namespace ue2 { +struct Grey; struct hwlmLiteral; +struct target_t; -ue2::aligned_unique_ptr -teddyBuildTableHinted(const std::vector &lits, bool make_small, - u32 hint, const target_t &target, - std::pair, size_t> &link); +bytecode_ptr teddyBuildTableHinted(const std::vector &lits, + bool make_small, u32 hint, + const target_t &target, + const Grey &grey); } // namespace ue2 diff --git a/src/fdr/teddy_engine_description.cpp b/src/fdr/teddy_engine_description.cpp index d95f4937a..f7559b13f 100644 --- a/src/fdr/teddy_engine_description.cpp +++ b/src/fdr/teddy_engine_description.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,8 +44,7 @@ namespace ue2 { TeddyEngineDescription::TeddyEngineDescription(const TeddyEngineDef &def) : EngineDescription(def.id, targetByArchFeatures(def.cpu_features), - def.numBuckets, def.confirmPullBackDistance, - def.confirmTopLevelSplit), + def.numBuckets), numMasks(def.numMasks), packed(def.packed) {} u32 TeddyEngineDescription::getDefaultFloodSuffixLength() const { @@ -66,24 +65,22 @@ bool TeddyEngineDescription::needConfirm(const vector &lits) const void getTeddyDescriptions(vector *out) { static const TeddyEngineDef defns[] = { - { 1, 0 | HS_CPU_FEATURES_AVX2, 1, 8, false, 0, 1 }, - { 2, 0 | HS_CPU_FEATURES_AVX2, 1, 8, true, 0, 32 }, - { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false, 0, 1 }, - { 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true, 0, 32 }, - { 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false, 0, 1 }, - { 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true, 0, 32 }, - { 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false, 0, 1 }, - { 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true, 0, 32 }, - { 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false, 0, 1 }, - { 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true, 0, 32 }, - { 11, 0, 1, 8, false, 0, 1 }, - { 12, 
0, 1, 8, true, 0, 32 }, - { 13, 0, 2, 8, false, 0, 1 }, - { 14, 0, 2, 8, true, 0, 32 }, - { 15, 0, 3, 8, false, 0, 1 }, - { 16, 0, 3, 8, true, 0, 32 }, - { 17, 0, 4, 8, false, 0, 1 }, - { 18, 0, 4, 8, true, 0, 32 }, + { 3, 0 | HS_CPU_FEATURES_AVX2, 1, 16, false }, + { 4, 0 | HS_CPU_FEATURES_AVX2, 1, 16, true }, + { 5, 0 | HS_CPU_FEATURES_AVX2, 2, 16, false }, + { 6, 0 | HS_CPU_FEATURES_AVX2, 2, 16, true }, + { 7, 0 | HS_CPU_FEATURES_AVX2, 3, 16, false }, + { 8, 0 | HS_CPU_FEATURES_AVX2, 3, 16, true }, + { 9, 0 | HS_CPU_FEATURES_AVX2, 4, 16, false }, + { 10, 0 | HS_CPU_FEATURES_AVX2, 4, 16, true }, + { 11, 0, 1, 8, false }, + { 12, 0, 1, 8, true }, + { 13, 0, 2, 8, false }, + { 14, 0, 2, 8, true }, + { 15, 0, 3, 8, false }, + { 16, 0, 3, 8, true }, + { 17, 0, 4, 8, false }, + { 18, 0, 4, 8, true }, }; out->clear(); for (const auto &def : defns) { diff --git a/src/fdr/teddy_engine_description.h b/src/fdr/teddy_engine_description.h index 88d201394..3979a5d32 100644 --- a/src/fdr/teddy_engine_description.h +++ b/src/fdr/teddy_engine_description.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -45,8 +45,6 @@ struct TeddyEngineDef { u32 numMasks; u32 numBuckets; bool packed; - u32 confirmPullBackDistance; - u32 confirmTopLevelSplit; }; class TeddyEngineDescription : public EngineDescription { diff --git a/src/fdr/teddy_runtime_common.h b/src/fdr/teddy_runtime_common.h index dc65c70a6..c5f0885f6 100644 --- a/src/fdr/teddy_runtime_common.h +++ b/src/fdr/teddy_runtime_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -180,9 +180,7 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, do { u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); u32 byte = bit / bucket + offset; - u32 bitRem = bit % bucket; - u32 confSplit = *(ptr+byte) & 0x1f; - u32 idx = confSplit * bucket + bitRem; + u32 idx = bit % bucket; u32 cf = confBase[idx]; if (!cf) { continue; @@ -193,7 +191,7 @@ void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, continue; } u64a confVal = getConfVal(a, ptr, byte, reason); - confWithBit(fdrc, a, ptr - a->buf + byte, 0, control, + confWithBit(fdrc, a, ptr - a->buf + byte, control, last_match, confVal); } while (unlikely(*conf)); } diff --git a/src/grey.cpp b/src/grey.cpp index 340a34bf6..24140c05b 100644 --- a/src/grey.cpp +++ b/src/grey.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,7 @@ namespace ue2 { Grey::Grey(void) : optimiseComponentTree(true), + calcComponents(true), performGraphSimplification(true), prefilterReductions(true), removeEdgeRedundancy(true), @@ -54,7 +55,6 @@ Grey::Grey(void) : allowMcSheng(true), allowPuff(true), allowLiteral(true), - allowRose(true), allowViolet(true), allowExtendedNFA(true), /* bounded repeats of course */ allowLimExNFA(true), @@ -62,8 +62,10 @@ Grey::Grey(void) : allowSmallLiteralSet(true), allowCastle(true), allowDecoratedLiteral(true), + allowApproximateMatching(true), 
allowNoodle(true), fdrAllowTeddy(true), + fdrAllowFlood(true), violetAvoidSuffixes(true), violetAvoidWeakInfixes(true), violetDoubleCut(true), @@ -98,6 +100,7 @@ Grey::Grey(void) : minRoseLiteralLength(3), minRoseNetflowLiteralLength(2), maxRoseNetflowEdges(50000), /* otherwise no netflow pass. */ + maxEditDistance(16), minExtBoundedRepeatSize(32), goughCopyPropagate(true), goughRegisterAllocate(true), @@ -105,8 +108,6 @@ Grey::Grey(void) : roseGraphReduction(true), roseRoleAliasing(true), roseMasks(true), - roseMaxBadLeafLength(5), - roseConvertInfBadLeaves(true), roseConvertFloodProneSuffixes(true), roseMergeRosesDuringAliasing(true), roseMultiTopRoses(true), @@ -116,7 +117,6 @@ Grey::Grey(void) : roseMcClellanSuffix(1), roseMcClellanOutfix(2), roseTransformDelay(true), - roseDesiredSplit(4), earlyMcClellanPrefix(true), earlyMcClellanInfix(true), earlyMcClellanSuffix(true), @@ -157,7 +157,8 @@ Grey::Grey(void) : limitEngineSize(1073741824), // 1 GB limitDFASize(1073741824), // 1 GB limitNFASize(1048576), // 1 MB - limitLBRSize(1048576) // 1 MB + limitLBRSize(1048576), // 1 MB + limitApproxMatchingVertices(5000) { assert(maxAnchoredRegion < 64); /* a[lm]_log_sum have limited capacity */ } @@ -209,6 +210,7 @@ void applyGreyOverrides(Grey *g, const string &s) { } while (0) G_UPDATE(optimiseComponentTree); + G_UPDATE(calcComponents); G_UPDATE(performGraphSimplification); G_UPDATE(prefilterReductions); G_UPDATE(removeEdgeRedundancy); @@ -221,7 +223,6 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowMcSheng); G_UPDATE(allowPuff); G_UPDATE(allowLiteral); - G_UPDATE(allowRose); G_UPDATE(allowViolet); G_UPDATE(allowExtendedNFA); G_UPDATE(allowLimExNFA); @@ -230,7 +231,9 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(allowCastle); G_UPDATE(allowDecoratedLiteral); G_UPDATE(allowNoodle); + G_UPDATE(allowApproximateMatching); G_UPDATE(fdrAllowTeddy); + G_UPDATE(fdrAllowFlood); G_UPDATE(violetAvoidSuffixes); G_UPDATE(violetAvoidWeakInfixes); G_UPDATE(violetDoubleCut); @@ -265,6 +268,7 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(minRoseLiteralLength); G_UPDATE(minRoseNetflowLiteralLength); G_UPDATE(maxRoseNetflowEdges); + G_UPDATE(maxEditDistance); G_UPDATE(minExtBoundedRepeatSize); G_UPDATE(goughCopyPropagate); G_UPDATE(goughRegisterAllocate); @@ -272,8 +276,6 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(roseGraphReduction); G_UPDATE(roseRoleAliasing); G_UPDATE(roseMasks); - G_UPDATE(roseMaxBadLeafLength); - G_UPDATE(roseConvertInfBadLeaves); G_UPDATE(roseConvertFloodProneSuffixes); G_UPDATE(roseMergeRosesDuringAliasing); G_UPDATE(roseMultiTopRoses); @@ -283,7 +285,6 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(roseMcClellanSuffix); G_UPDATE(roseMcClellanOutfix); G_UPDATE(roseTransformDelay); - G_UPDATE(roseDesiredSplit); G_UPDATE(earlyMcClellanPrefix); G_UPDATE(earlyMcClellanInfix); G_UPDATE(earlyMcClellanSuffix); @@ -319,6 +320,7 @@ void applyGreyOverrides(Grey *g, const string &s) { G_UPDATE(limitDFASize); G_UPDATE(limitNFASize); G_UPDATE(limitLBRSize); + G_UPDATE(limitApproxMatchingVertices); #undef G_UPDATE if (key == "simple_som") { @@ -340,7 +342,6 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowMcClellan = false; g->allowPuff = false; g->allowLiteral = false; - g->allowRose = false; g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; @@ -358,7 +359,6 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowMcClellan = true; g->allowPuff = false; 
g->allowLiteral = false; - g->allowRose = false; g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; @@ -376,7 +376,6 @@ void applyGreyOverrides(Grey *g, const string &s) { g->allowMcClellan = true; g->allowPuff = false; g->allowLiteral = false; - g->allowRose = false; g->allowViolet = false; g->allowSmallLiteralSet = false; g->roseMasks = false; diff --git a/src/grey.h b/src/grey.h index 4882af7d0..505194181 100644 --- a/src/grey.h +++ b/src/grey.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,7 @@ struct Grey { bool optimiseComponentTree; + bool calcComponents; bool performGraphSimplification; bool prefilterReductions; bool removeEdgeRedundancy; @@ -54,7 +55,6 @@ struct Grey { bool allowMcSheng; bool allowPuff; bool allowLiteral; - bool allowRose; bool allowViolet; bool allowExtendedNFA; bool allowLimExNFA; @@ -62,9 +62,11 @@ struct Grey { bool allowSmallLiteralSet; bool allowCastle; bool allowDecoratedLiteral; + bool allowApproximateMatching; bool allowNoodle; bool fdrAllowTeddy; + bool fdrAllowFlood; u32 violetAvoidSuffixes; /* 0=never, 1=sometimes, 2=always */ bool violetAvoidWeakInfixes; @@ -107,6 +109,7 @@ struct Grey { u32 minRoseLiteralLength; u32 minRoseNetflowLiteralLength; u32 maxRoseNetflowEdges; + u32 maxEditDistance; u32 minExtBoundedRepeatSize; /* to be considered for ng_repeat */ @@ -118,8 +121,6 @@ struct Grey { bool roseGraphReduction; bool roseRoleAliasing; bool roseMasks; - u32 roseMaxBadLeafLength; - bool roseConvertInfBadLeaves; bool roseConvertFloodProneSuffixes; bool roseMergeRosesDuringAliasing; bool roseMultiTopRoses; @@ -130,7 +131,6 @@ struct Grey { * always */ u32 roseMcClellanOutfix; /* 0 = off, 1 = sometimes, 2 = almost always */ bool roseTransformDelay; - u32 roseDesiredSplit; bool earlyMcClellanPrefix; bool earlyMcClellanInfix; @@ -202,6 +202,9 @@ struct Grey { u32 limitDFASize; //!< max size of a DFA (in bytes) u32 limitNFASize; //!< max size of an NFA (in bytes) u32 limitLBRSize; //!< max size of an LBR engine (in bytes) + + // Approximate matching limits. 
+ u32 limitApproxMatchingVertices; //!< max number of vertices per graph }; #ifndef RELEASE_BUILD diff --git a/src/hs.cpp b/src/hs.cpp index f64e867a2..e3c1f811c 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,10 +39,10 @@ #include "compiler/error.h" #include "nfagraph/ng.h" #include "nfagraph/ng_expr_info.h" -#include "nfagraph/ng_extparam.h" -#include "parser/parse_error.h" #include "parser/Parser.h" +#include "parser/parse_error.h" #include "parser/prefilter.h" +#include "parser/unsupported.h" #include "util/compile_error.h" #include "util/cpuid_flags.h" #include "util/depth.h" @@ -119,8 +119,9 @@ bool checkMode(unsigned int mode, hs_compile_error **comp_error) { static bool checkPlatform(const hs_platform_info *p, hs_compile_error **comp_error) { -#define HS_TUNE_LAST HS_TUNE_FAMILY_BDW -#define HS_CPU_FEATURES_ALL (HS_CPU_FEATURES_AVX2) + static constexpr u32 HS_TUNE_LAST = HS_TUNE_FAMILY_GLM; + static constexpr u32 HS_CPU_FEATURES_ALL = + HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512; if (!p) { return true; @@ -277,9 +278,10 @@ hs_compile_multi_int(const char *const *expressions, const unsigned *flags, } // namespace ue2 extern "C" HS_PUBLIC_API -hs_error_t hs_compile(const char *expression, unsigned flags, unsigned mode, - const hs_platform_info_t *platform, hs_database_t **db, - hs_compile_error_t **error) { +hs_error_t HS_CDECL hs_compile(const char *expression, unsigned flags, + unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, hs_compile_error_t **error) { if (expression == nullptr) { *db = nullptr; *error = generateCompileError("Invalid parameter: expression is NULL", @@ -295,24 +297,25 @@ hs_error_t hs_compile(const char *expression, unsigned flags, unsigned mode, } extern "C" HS_PUBLIC_API -hs_error_t hs_compile_multi(const char * const *expressions, - const unsigned *flags, const unsigned *ids, - unsigned elements, unsigned mode, - const hs_platform_info_t *platform, - hs_database_t **db, hs_compile_error_t **error) { +hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error) { const hs_expr_ext * const *ext = nullptr; // unused for this call. return hs_compile_multi_int(expressions, flags, ids, ext, elements, mode, platform, db, error, Grey()); } extern "C" HS_PUBLIC_API -hs_error_t hs_compile_ext_multi(const char * const *expressions, - const unsigned *flags, const unsigned *ids, - const hs_expr_ext * const *ext, - unsigned elements, unsigned mode, - const hs_platform_info_t *platform, - hs_database_t **db, - hs_compile_error_t **error) { +hs_error_t HS_CDECL hs_compile_ext_multi(const char * const *expressions, + const unsigned *flags, const unsigned *ids, + const hs_expr_ext * const *ext, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error) { return hs_compile_multi_int(expressions, flags, ids, ext, elements, mode, platform, db, error, Grey()); } @@ -368,19 +371,28 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, assert(pe.component); // Apply prefiltering transformations if desired. 
- if (pe.prefilter) { + if (pe.expr.prefilter) { prefilterTree(pe.component, ParseMode(flags)); } - unique_ptr g = buildWrapper(rm, cc, pe); + // Expressions containing zero-width assertions and other extended pcre + // types aren't supported yet. This call will throw a ParseError + // exception if the component tree contains such a construct. + checkUnsupported(*pe.component); + + pe.component->checkEmbeddedStartAnchor(true); + pe.component->checkEmbeddedEndAnchor(true); + + auto built_expr = buildGraph(rm, cc, pe); + unique_ptr &g = built_expr.g; + ExpressionInfo &expr = built_expr.expr; if (!g) { DEBUG_PRINTF("NFA build failed, but no exception was thrown.\n"); throw ParseError("Internal error."); } - handleExtendedParams(rm, *g, cc); - fillExpressionInfo(rm, *g, &local_info); + fillExpressionInfo(rm, cc, *g, expr, &local_info); } catch (const CompileError &e) { // Compiler error occurred @@ -409,24 +421,26 @@ hs_error_t hs_expression_info_int(const char *expression, unsigned int flags, } extern "C" HS_PUBLIC_API -hs_error_t hs_expression_info(const char *expression, unsigned int flags, - hs_expr_info_t **info, - hs_compile_error_t **error) { +hs_error_t HS_CDECL hs_expression_info(const char *expression, + unsigned int flags, + hs_expr_info_t **info, + hs_compile_error_t **error) { return hs_expression_info_int(expression, flags, nullptr, HS_MODE_BLOCK, info, error); } extern "C" HS_PUBLIC_API -hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags, - const hs_expr_ext_t *ext, - hs_expr_info_t **info, - hs_compile_error_t **error) { +hs_error_t HS_CDECL hs_expression_ext_info(const char *expression, + unsigned int flags, + const hs_expr_ext_t *ext, + hs_expr_info_t **info, + hs_compile_error_t **error) { return hs_expression_info_int(expression, flags, ext, HS_MODE_BLOCK, info, error); } extern "C" HS_PUBLIC_API -hs_error_t hs_populate_platform(hs_platform_info_t *platform) { +hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform) { if (!platform) { return HS_INVALID; } @@ -440,7 +454,7 @@ hs_error_t hs_populate_platform(hs_platform_info_t *platform) { } extern "C" HS_PUBLIC_API -hs_error_t hs_free_compile_error(hs_compile_error_t *error) { +hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error) { #if defined(FAT_RUNTIME) if (!check_ssse3()) { return HS_ARCH_ERROR; diff --git a/src/hs_common.h b/src/hs_common.h index b25b18423..ffea397e4 100644 --- a/src/hs_common.h +++ b/src/hs_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,6 +29,11 @@ #ifndef HS_COMMON_H_ #define HS_COMMON_H_ +#if defined(_WIN32) +#define HS_CDECL __cdecl +#else +#define HS_CDECL +#endif #include /** @@ -76,7 +81,7 @@ typedef int hs_error_t; * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_free_database(hs_database_t *db); +hs_error_t HS_CDECL hs_free_database(hs_database_t *db); /** * Serialize a pattern database to a stream of bytes. @@ -100,8 +105,8 @@ hs_error_t hs_free_database(hs_database_t *db); * @ref HS_SUCCESS on success, @ref HS_NOMEM if the byte array cannot be * allocated, other values may be returned if errors are detected. 
*/ -hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes, - size_t *length); +hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, + size_t *length); /** * Reconstruct a pattern database from a stream of bytes previously generated @@ -129,8 +134,9 @@ hs_error_t hs_serialize_database(const hs_database_t *db, char **bytes, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_deserialize_database(const char *bytes, const size_t length, - hs_database_t **db); +hs_error_t HS_CDECL hs_deserialize_database(const char *bytes, + const size_t length, + hs_database_t **db); /** * Reconstruct a pattern database from a stream of bytes previously generated @@ -160,8 +166,9 @@ hs_error_t hs_deserialize_database(const char *bytes, const size_t length, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length, - hs_database_t *db); +hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes, + const size_t length, + hs_database_t *db); /** * Provides the size of the stream state allocated by a single stream opened @@ -177,7 +184,8 @@ hs_error_t hs_deserialize_database_at(const char *bytes, const size_t length, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_stream_size(const hs_database_t *database, size_t *stream_size); +hs_error_t HS_CDECL hs_stream_size(const hs_database_t *database, + size_t *stream_size); /** * Provides the size of the given database in bytes. @@ -192,8 +200,8 @@ hs_error_t hs_stream_size(const hs_database_t *database, size_t *stream_size); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_database_size(const hs_database_t *database, - size_t *database_size); +hs_error_t HS_CDECL hs_database_size(const hs_database_t *database, + size_t *database_size); /** * Utility function for reporting the size that would be required by a @@ -219,8 +227,9 @@ hs_error_t hs_database_size(const hs_database_t *database, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_serialized_database_size(const char *bytes, const size_t length, - size_t *deserialized_size); +hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes, + const size_t length, + size_t *deserialized_size); /** * Utility function providing information about a database. @@ -237,7 +246,8 @@ hs_error_t hs_serialized_database_size(const char *bytes, const size_t length, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_database_info(const hs_database_t *database, char **info); +hs_error_t HS_CDECL hs_database_info(const hs_database_t *database, + char **info); /** * Utility function providing information about a serialized database. @@ -258,8 +268,8 @@ hs_error_t hs_database_info(const hs_database_t *database, char **info); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_serialized_database_info(const char *bytes, size_t length, - char **info); +hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes, + size_t length, char **info); /** * The type of the callback function that will be used by Hyperscan to allocate @@ -275,7 +285,7 @@ hs_error_t hs_serialized_database_info(const char *bytes, size_t length, * @return * A pointer to the region of memory allocated, or NULL on error. 
*/ -typedef void *(*hs_alloc_t)(size_t size); +typedef void *(HS_CDECL *hs_alloc_t)(size_t size); /** * The type of the callback function that will be used by Hyperscan to free @@ -284,7 +294,7 @@ typedef void *(*hs_alloc_t)(size_t size); * @param ptr * The region of memory to be freed. */ -typedef void (*hs_free_t)(void *ptr); +typedef void (HS_CDECL *hs_free_t)(void *ptr); /** * Set the allocate and free functions used by Hyperscan for allocating @@ -312,7 +322,8 @@ typedef void (*hs_free_t)(void *ptr); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_set_allocator(hs_alloc_t alloc_func, hs_free_t free_func); +hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); /** * Set the allocate and free functions used by Hyperscan for allocating memory @@ -344,8 +355,8 @@ hs_error_t hs_set_allocator(hs_alloc_t alloc_func, hs_free_t free_func); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_set_database_allocator(hs_alloc_t alloc_func, - hs_free_t free_func); +hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); /** * Set the allocate and free functions used by Hyperscan for allocating memory @@ -371,7 +382,8 @@ hs_error_t hs_set_database_allocator(hs_alloc_t alloc_func, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_set_misc_allocator(hs_alloc_t alloc_func, hs_free_t free_func); +hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); /** * Set the allocate and free functions used by Hyperscan for allocating memory @@ -397,7 +409,8 @@ hs_error_t hs_set_misc_allocator(hs_alloc_t alloc_func, hs_free_t free_func); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_set_scratch_allocator(hs_alloc_t alloc_func, hs_free_t free_func); +hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); /** * Set the allocate and free functions used by Hyperscan for allocating memory @@ -423,7 +436,8 @@ hs_error_t hs_set_scratch_allocator(hs_alloc_t alloc_func, hs_free_t free_func); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_set_stream_allocator(hs_alloc_t alloc_func, hs_free_t free_func); +hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); /** * Utility function for identifying this release version. @@ -433,7 +447,7 @@ hs_error_t hs_set_stream_allocator(hs_alloc_t alloc_func, hs_free_t free_func); * date of the build. It is allocated statically, so it does not need to * be freed by the caller. */ -const char *hs_version(void); +const char * HS_CDECL hs_version(void); /** * Utility function to test the current system architecture. @@ -450,7 +464,7 @@ const char *hs_version(void); * @ref HS_SUCCESS on success, @ref HS_ARCH_ERROR if system does not * support Hyperscan. */ -hs_error_t hs_valid_platform(void); +hs_error_t HS_CDECL hs_valid_platform(void); /** * @defgroup HS_ERROR hs_error_t values @@ -545,7 +559,7 @@ hs_error_t hs_valid_platform(void); * At a minimum, Hyperscan requires Supplemental Streaming SIMD Extensions 3 * (SSSE3). 
*/ -#define HS_ARCH_ERROR (-11) +#define HS_ARCH_ERROR (-11) /** @} */ diff --git a/src/hs_compile.h b/src/hs_compile.h index c5212cbe1..3d5270443 100644 --- a/src/hs_compile.h +++ b/src/hs_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -169,13 +169,23 @@ typedef struct hs_platform_info { typedef struct hs_expr_info { /** * The minimum length in bytes of a match for the pattern. + * + * Note: in some cases when using advanced features to suppress matches + * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this + * may represent a conservative lower bound for the true minimum length of + * a match. */ unsigned int min_width; /** * The maximum length in bytes of a match for the pattern. If the pattern - * has an unbounded maximum width, this will be set to the maximum value of - * an unsigned int (UINT_MAX). + * has an unbounded maximum length, this will be set to the maximum value + * of an unsigned int (UINT_MAX). + * + * Note: in some cases when using advanced features to suppress matches + * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this + * may represent a conservative upper bound for the true maximum length of + * a match. */ unsigned int max_width; @@ -241,6 +251,13 @@ typedef struct hs_expr_ext { * @ref HS_EXT_FLAG_MIN_LENGTH flag in the hs_expr_ext::flags field. */ unsigned long long min_length; + + /** + * Allow patterns to approximately match within this edit distance. To use + * this parameter, set the @ref HS_EXT_FLAG_EDIT_DISTANCE flag in the + * hs_expr_ext::flags field. + */ + unsigned edit_distance; } hs_expr_ext_t; /** @@ -261,6 +278,9 @@ typedef struct hs_expr_ext { /** Flag indicating that the hs_expr_ext::min_length field is used. */ #define HS_EXT_FLAG_MIN_LENGTH 4ULL +/** Flag indicating that the hs_expr_ext::edit_distance field is used. */ +#define HS_EXT_FLAG_EDIT_DISTANCE 8ULL + /** @} */ /** @@ -323,9 +343,10 @@ typedef struct hs_expr_ext { * HS_COMPILER_ERROR on failure, with details provided in the error * parameter. */ -hs_error_t hs_compile(const char *expression, unsigned int flags, - unsigned int mode, const hs_platform_info_t *platform, - hs_database_t **db, hs_compile_error_t **error); +hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags, + unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, hs_compile_error_t **error); /** * The multiple regular expression compiler. @@ -401,11 +422,13 @@ hs_error_t hs_compile(const char *expression, unsigned int flags, * parameter. * */ -hs_error_t hs_compile_multi(const char *const *expressions, - const unsigned int *flags, const unsigned int *ids, - unsigned int elements, unsigned int mode, - const hs_platform_info_t *platform, - hs_database_t **db, hs_compile_error_t **error); +hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + unsigned int elements, unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); /** * The multiple regular expression compiler with extended parameter support. @@ -486,7 +509,7 @@ hs_error_t hs_compile_multi(const char *const *expressions, * parameter. 
* */ -hs_error_t hs_compile_ext_multi(const char *const *expressions, +hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, const unsigned int *flags, const unsigned int *ids, const hs_expr_ext_t *const *ext, @@ -505,13 +528,24 @@ hs_error_t hs_compile_ext_multi(const char *const *expressions, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_free_compile_error(hs_compile_error_t *error); +hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); /** * Utility function providing information about a regular expression. The * information provided in @ref hs_expr_info_t includes the minimum and maximum * width of a pattern match. * + * Note: successful analysis of an expression with this function does not imply + * that compilation of the same expression (via @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This + * function may return @ref HS_SUCCESS for regular expressions that Hyperscan + * cannot compile. + * + * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref + * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect + * the properties returned in the @ref hs_expr_info_t structure, they will not + * affect the outcome of this function. + * * @param expression * The NULL-terminated expression to parse. Note that this string must * represent ONLY the pattern to be matched, with no delimiters or flags; @@ -553,15 +587,27 @@ hs_error_t hs_free_compile_error(hs_compile_error_t *error); * HS_COMPILER_ERROR on failure, with details provided in the error * parameter. */ -hs_error_t hs_expression_info(const char *expression, unsigned int flags, - hs_expr_info_t **info, - hs_compile_error_t **error); +hs_error_t HS_CDECL hs_expression_info(const char *expression, + unsigned int flags, + hs_expr_info_t **info, + hs_compile_error_t **error); /** * Utility function providing information about a regular expression, with * extended parameter support. The information provided in @ref hs_expr_info_t * includes the minimum and maximum width of a pattern match. * + * Note: successful analysis of an expression with this function does not imply + * that compilation of the same expression (via @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This + * function may return @ref HS_SUCCESS for regular expressions that Hyperscan + * cannot compile. + * + * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref + * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect + * the properties returned in the @ref hs_expr_info_t structure, they will not + * affect the outcome of this function. + * * @param expression * The NULL-terminated expression to parse. Note that this string must * represent ONLY the pattern to be matched, with no delimiters or flags; @@ -608,10 +654,11 @@ hs_error_t hs_expression_info(const char *expression, unsigned int flags, * HS_COMPILER_ERROR on failure, with details provided in the error * parameter. */ -hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags, - const hs_expr_ext_t *ext, - hs_expr_info_t **info, - hs_compile_error_t **error); +hs_error_t HS_CDECL hs_expression_ext_info(const char *expression, + unsigned int flags, + const hs_expr_ext_t *ext, + hs_expr_info_t **info, + hs_compile_error_t **error); /** * Populates the platform information based on the current host. 
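For illustration, a minimal sketch of how a caller might exercise the new edit_distance extended parameter declared above, via hs_compile_ext_multi(). The pattern, the distance value of 2, and the abbreviated error handling are example assumptions, not part of this patch:

    // Compile one literal-ish pattern that also matches within edit distance 2.
    #include <hs.h>      // public Hyperscan API header
    #include <cstdio>
    #include <cstring>

    int main() {
        const char *exprs[] = {"hyperscan"};
        unsigned flags[] = {0};
        unsigned ids[] = {1};

        hs_expr_ext_t ext;
        std::memset(&ext, 0, sizeof(ext));
        ext.flags = HS_EXT_FLAG_EDIT_DISTANCE; // mark edit_distance as in use
        ext.edit_distance = 2;                 // allow up to 2 edits

        const hs_expr_ext_t *ext_ptrs[] = {&ext};
        hs_database_t *db = nullptr;
        hs_compile_error_t *err = nullptr;

        if (hs_compile_ext_multi(exprs, flags, ids, ext_ptrs, 1, HS_MODE_BLOCK,
                                 nullptr, &db, &err) != HS_SUCCESS) {
            std::fprintf(stderr, "compile failed: %s\n", err->message);
            hs_free_compile_error(err);
            return 1;
        }
        hs_free_database(db);
        return 0;
    }

As the new hs_expr_info notes state, approximate matching widens what counts as a match, so the reported min/max widths become conservative bounds.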
@@ -623,7 +670,7 @@ hs_error_t hs_expression_ext_info(const char *expression, unsigned int flags, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_populate_platform(hs_platform_info_t *platform); +hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); /** * @defgroup HS_PATTERN_FLAG Pattern flags @@ -770,6 +817,14 @@ hs_error_t hs_populate_platform(hs_platform_info_t *platform); */ #define HS_CPU_FEATURES_AVX2 (1ULL << 2) +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX512) + * + * Setting this flag indicates that the target platform supports AVX512 + * instructions, specifically AVX-512BW. Using AVX512 implies the use of AVX2. + */ +#define HS_CPU_FEATURES_AVX512 (1ULL << 3) + /** @} */ /** @@ -826,6 +881,30 @@ hs_error_t hs_populate_platform(hs_platform_info_t *platform); */ #define HS_TUNE_FAMILY_BDW 5 +/** + * Tuning Parameter - Intel(R) microarchitecture code name Skylake + * + * This indicates that the compiled database should be tuned for the + * Skylake microarchitecture. + */ +#define HS_TUNE_FAMILY_SKL 6 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Skylake Server + * + * This indicates that the compiled database should be tuned for the + * Skylake Server microarchitecture. + */ +#define HS_TUNE_FAMILY_SKX 7 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Goldmont + * + * This indicates that the compiled database should be tuned for the + * Goldmont microarchitecture. + */ +#define HS_TUNE_FAMILY_GLM 8 + /** @} */ /** diff --git a/src/hs_runtime.h b/src/hs_runtime.h index db52f4f50..ecd97ca52 100644 --- a/src/hs_runtime.h +++ b/src/hs_runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -145,8 +145,8 @@ typedef int (*match_event_handler)(unsigned int id, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_open_stream(const hs_database_t *db, unsigned int flags, - hs_stream_t **stream); +hs_error_t HS_CDECL hs_open_stream(const hs_database_t *db, unsigned int flags, + hs_stream_t **stream); /** * Write data to be scanned to the opened stream. @@ -185,10 +185,10 @@ hs_error_t hs_open_stream(const hs_database_t *db, unsigned int flags, * match callback indicated that scanning should stop; other values on * error. */ -hs_error_t hs_scan_stream(hs_stream_t *id, const char *data, - unsigned int length, unsigned int flags, - hs_scratch_t *scratch, match_event_handler onEvent, - void *ctxt); +hs_error_t HS_CDECL hs_scan_stream(hs_stream_t *id, const char *data, + unsigned int length, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *ctxt); /** * Close a stream. @@ -223,8 +223,8 @@ hs_error_t hs_scan_stream(hs_stream_t *id, const char *data, * @return * Returns @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, - match_event_handler onEvent, void *ctxt); +hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, + match_event_handler onEvent, void *ctxt); /** * Reset a stream to an initial state. @@ -264,9 +264,9 @@ hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, * @return * @ref HS_SUCCESS on success, other values on failure. 
*/ -hs_error_t hs_reset_stream(hs_stream_t *id, unsigned int flags, - hs_scratch_t *scratch, match_event_handler onEvent, - void *context); +hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context); /** * Duplicate the given stream. The new stream will have the same state as the @@ -282,7 +282,8 @@ hs_error_t hs_reset_stream(hs_stream_t *id, unsigned int flags, * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id); +hs_error_t HS_CDECL hs_copy_stream(hs_stream_t **to_id, + const hs_stream_t *from_id); /** * Duplicate the given 'from' stream state onto the 'to' stream. The 'to' stream @@ -314,11 +315,11 @@ hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_reset_and_copy_stream(hs_stream_t *to_id, - const hs_stream_t *from_id, - hs_scratch_t *scratch, - match_event_handler onEvent, - void *context); +hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id, + const hs_stream_t *from_id, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context); /** * The block (non-streaming) regular expression scanner. @@ -355,10 +356,10 @@ hs_error_t hs_reset_and_copy_stream(hs_stream_t *to_id, * match callback indicated that scanning should stop; other values on * error. */ -hs_error_t hs_scan(const hs_database_t *db, const char *data, - unsigned int length, unsigned int flags, - hs_scratch_t *scratch, match_event_handler onEvent, - void *context); +hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data, + unsigned int length, unsigned int flags, + hs_scratch_t *scratch, match_event_handler onEvent, + void *context); /** * The vectored regular expression scanner. @@ -398,10 +399,12 @@ hs_error_t hs_scan(const hs_database_t *db, const char *data, * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the match * callback indicated that scanning should stop; other values on error. */ -hs_error_t hs_scan_vector(const hs_database_t *db, const char *const *data, - const unsigned int *length, unsigned int count, - unsigned int flags, hs_scratch_t *scratch, - match_event_handler onEvent, void *context); +hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db, + const char *const *data, + const unsigned int *length, + unsigned int count, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context); /** * Allocate a "scratch" space for use by Hyperscan. @@ -429,7 +432,8 @@ hs_error_t hs_scan_vector(const hs_database_t *db, const char *const *data, * allocation fails. Other errors may be returned if invalid parameters * are specified. */ -hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch); +hs_error_t HS_CDECL hs_alloc_scratch(const hs_database_t *db, + hs_scratch_t **scratch); /** * Allocate a scratch space that is a clone of an existing scratch space. @@ -449,7 +453,8 @@ hs_error_t hs_alloc_scratch(const hs_database_t *db, hs_scratch_t **scratch); * @ref HS_SUCCESS on success; @ref HS_NOMEM if the allocation fails. * Other errors may be returned if invalid parameters are specified. */ -hs_error_t hs_clone_scratch(const hs_scratch_t *src, hs_scratch_t **dest); +hs_error_t HS_CDECL hs_clone_scratch(const hs_scratch_t *src, + hs_scratch_t **dest); /** * Provides the size of the given scratch space. 
@@ -465,7 +470,8 @@ hs_error_t hs_clone_scratch(const hs_scratch_t *src, hs_scratch_t **dest); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_scratch_size(const hs_scratch_t *scratch, size_t *scratch_size); +hs_error_t HS_CDECL hs_scratch_size(const hs_scratch_t *scratch, + size_t *scratch_size); /** * Free a scratch block previously allocated by @ref hs_alloc_scratch() or @ref @@ -480,7 +486,7 @@ hs_error_t hs_scratch_size(const hs_scratch_t *scratch, size_t *scratch_size); * @return * @ref HS_SUCCESS on success, other values on failure. */ -hs_error_t hs_free_scratch(hs_scratch_t *scratch); +hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch); /** * Callback 'from' return value, indicating that the start of this match was diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 939cde1f6..128ac04fd 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,7 +30,7 @@ #include "util/cpuid_flags.h" HS_PUBLIC_API -hs_error_t hs_valid_platform(void) { +hs_error_t HS_CDECL hs_valid_platform(void) { /* Hyperscan requires SSSE3, anything else is a bonus */ if (check_ssse3()) { return HS_SUCCESS; diff --git a/src/hs_version.c b/src/hs_version.c index 45e23c3b5..04cf46f3f 100644 --- a/src/hs_version.c +++ b/src/hs_version.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +31,6 @@ #include "hs_version.h" HS_PUBLIC_API -const char *hs_version(void) { +const char * HS_CDECL hs_version(void) { return HS_VERSION_STRING; } diff --git a/src/hwlm/hwlm.c b/src/hwlm/hwlm.c index 3c7615a7b..6eaa7ed15 100644 --- a/src/hwlm/hwlm.c +++ b/src/hwlm/hwlm.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -172,6 +172,8 @@ void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen, hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len, size_t start, HWLMCallback cb, void *ctxt, hwlm_group_t groups) { + assert(t); + DEBUG_PRINTF("buf len=%zu, start=%zu, groups=%llx\n", len, start, groups); if (!groups) { DEBUG_PRINTF("groups all off\n"); @@ -201,6 +203,9 @@ hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len, hwlm_error_t hwlmExecStreaming(const struct HWLM *t, struct hs_scratch *scratch, size_t len, size_t start, HWLMCallback cb, void *ctxt, hwlm_group_t groups) { + assert(t); + assert(scratch); + const u8 *hbuf = scratch->core_info.hbuf; const size_t hlen = scratch->core_info.hlen; const u8 *buf = scratch->core_info.buf; diff --git a/src/hwlm/hwlm_build.cpp b/src/hwlm/hwlm_build.cpp index fa6335c94..2f61ea6d6 100644 --- a/src/hwlm/hwlm_build.cpp +++ b/src/hwlm/hwlm_build.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following 
conditions are met: @@ -29,31 +29,23 @@ /** \file * \brief Hamster Wheel Literal Matcher: build code. */ + +#include "hwlm_build.h" + #include "grey.h" #include "hwlm.h" -#include "hwlm_build.h" #include "hwlm_internal.h" +#include "hwlm_literal.h" #include "noodle_engine.h" #include "noodle_build.h" #include "scratch.h" #include "ue2common.h" #include "fdr/fdr_compile.h" -#include "nfa/shufticompile.h" -#include "nfa/trufflecompile.h" -#include "util/alloc.h" -#include "util/bitutils.h" -#include "util/charreach.h" -#include "util/compare.h" #include "util/compile_context.h" #include "util/compile_error.h" -#include "util/dump_charclass.h" -#include "util/target_info.h" #include "util/ue2string.h" -#include "util/verify_types.h" #include -#include -#include #include #include @@ -61,431 +53,6 @@ using namespace std; namespace ue2 { -static const unsigned int MAX_ACCEL_OFFSET = 16; -static const unsigned int MAX_SHUFTI_WIDTH = 240; - -static -size_t mask_overhang(const hwlmLiteral &lit) { - size_t msk_true_size = lit.msk.size(); - assert(msk_true_size <= HWLM_MASKLEN); - assert(HWLM_MASKLEN <= MAX_ACCEL_OFFSET); - for (u8 c : lit.msk) { - if (!c) { - msk_true_size--; - } else { - break; - } - } - - if (lit.s.length() >= msk_true_size) { - return 0; - } - - /* only short literals should be able to have a mask which overhangs */ - assert(lit.s.length() < MAX_ACCEL_OFFSET); - return msk_true_size - lit.s.length(); -} - -static -bool findDVerm(const vector &lits, AccelAux *aux) { - const hwlmLiteral &first = *lits.front(); - - struct candidate { - candidate(void) - : c1(0), c2(0), max_offset(0), b5insens(false), valid(false) {} - candidate(const hwlmLiteral &base, u32 offset) - : c1(base.s[offset]), c2(base.s[offset + 1]), max_offset(0), - b5insens(false), valid(true) {} - char c1; - char c2; - u32 max_offset; - bool b5insens; - bool valid; - - bool operator>(const candidate &other) const { - if (!valid) { - return false; - } - - if (!other.valid) { - return true; - } - - if (other.cdiffers() && !cdiffers()) { - return false; - } - - if (!other.cdiffers() && cdiffers()) { - return true; - } - - if (!other.b5insens && b5insens) { - return false; - } - - if (other.b5insens && !b5insens) { - return true; - } - - if (max_offset > other.max_offset) { - return false; - } - - return true; - } - - bool cdiffers(void) const { - if (!b5insens) { - return c1 != c2; - } - return (c1 & CASE_CLEAR) != (c2 & CASE_CLEAR); - } - }; - - candidate best; - - for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()) - 1; i++) { - candidate curr(first, i); - - /* check to see if this pair appears in each string */ - for (const auto &lit_ptr : lits) { - const hwlmLiteral &lit = *lit_ptr; - if (lit.nocase && (ourisalpha(curr.c1) || ourisalpha(curr.c2))) { - curr.b5insens = true; /* no choice but to be case insensitive */ - } - - bool found = false; - bool found_nc = false; - for (u32 j = 0; - !found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 1; j++) { - found |= curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1]; - found_nc |= (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR) - && (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR); - - if (curr.b5insens) { - found = found_nc; - } - } - - if (!curr.b5insens && !found && found_nc) { - curr.b5insens = true; - found = true; - } - - if (!found) { - goto next_candidate; - } - } - - /* check to find the max offset where this appears */ - for (const auto &lit_ptr : lits) { - const hwlmLiteral &lit = *lit_ptr; - for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 
1; - j++) { - bool found = false; - if (curr.b5insens) { - found = (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR) - && (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR); - } else { - found = curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1]; - } - - if (found) { - assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET); - ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit)); - break; - } - } - } - - if (curr > best) { - best = curr; - } - - next_candidate:; - } - - if (!best.valid) { - return false; - } - - aux->dverm.offset = verify_u8(best.max_offset); - - if (!best.b5insens) { - aux->dverm.accel_type = ACCEL_DVERM; - aux->dverm.c1 = best.c1; - aux->dverm.c2 = best.c2; - DEBUG_PRINTF("built dverm for %02hhx%02hhx\n", - aux->dverm.c1, aux->dverm.c2); - } else { - aux->dverm.accel_type = ACCEL_DVERM_NOCASE; - aux->dverm.c1 = best.c1 & CASE_CLEAR; - aux->dverm.c2 = best.c2 & CASE_CLEAR; - DEBUG_PRINTF("built dverm nc for %02hhx%02hhx\n", - aux->dverm.c1, aux->dverm.c2); - } - return true; -} - -static -bool findSVerm(const vector &lits, AccelAux *aux) { - const hwlmLiteral &first = *lits.front(); - - struct candidate { - candidate(void) - : c(0), max_offset(0), b5insens(false), valid(false) {} - candidate(const hwlmLiteral &base, u32 offset) - : c(base.s[offset]), max_offset(0), - b5insens(false), valid(true) {} - char c; - u32 max_offset; - bool b5insens; - bool valid; - - bool operator>(const candidate &other) const { - if (!valid) { - return false; - } - - if (!other.valid) { - return true; - } - - if (!other.b5insens && b5insens) { - return false; - } - - if (other.b5insens && !b5insens) { - return true; - } - - if (max_offset > other.max_offset) { - return false; - } - - return true; - } - }; - - candidate best; - - for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()); i++) { - candidate curr(first, i); - - /* check to see if this pair appears in each string */ - for (const auto &lit_ptr : lits) { - const hwlmLiteral &lit = *lit_ptr; - if (lit.nocase && ourisalpha(curr.c)) { - curr.b5insens = true; /* no choice but to be case insensitive */ - } - - bool found = false; - bool found_nc = false; - for (u32 j = 0; - !found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) { - found |= curr.c == lit.s[j]; - found_nc |= (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR); - - if (curr.b5insens) { - found = found_nc; - } - } - - if (!curr.b5insens && !found && found_nc) { - curr.b5insens = true; - found = true; - } - - if (!found) { - goto next_candidate; - } - } - - /* check to find the max offset where this appears */ - for (const auto &lit_ptr : lits) { - const hwlmLiteral &lit = *lit_ptr; - for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) { - bool found = false; - if (curr.b5insens) { - found = (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR); - } else { - found = curr.c == lit.s[j]; - } - - if (found) { - assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET); - ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit)); - } - } - } - - if (curr > best) { - best = curr; - } - - next_candidate:; - } - - if (!best.valid) { - return false; - } - - if (!best.b5insens) { - aux->verm.accel_type = ACCEL_VERM; - aux->verm.c = best.c; - DEBUG_PRINTF("built verm for %02hhx\n", aux->verm.c); - } else { - aux->verm.accel_type = ACCEL_VERM_NOCASE; - aux->verm.c = best.c & CASE_CLEAR; - DEBUG_PRINTF("built verm nc for %02hhx\n", aux->verm.c); - } - aux->verm.offset = verify_u8(best.max_offset); - - return true; -} - -static -void filterLits(const vector &lits, hwlm_group_t 
expected_groups, - vector *filtered_lits, u32 *min_len) { - *min_len = MAX_ACCEL_OFFSET; - - for (const auto &lit : lits) { - if (!(lit.groups & expected_groups)) { - continue; - } - - const size_t lit_len = lit.s.length(); - if (lit_len < *min_len) { - *min_len = verify_u32(lit_len); - } - - filtered_lits->push_back(&lit); - -#ifdef DEBUG - DEBUG_PRINTF("lit:"); - for (u32 i = 0; i < lit.s.length(); i++) { - printf("%02hhx", lit.s[i]); - } - printf("\n"); -#endif - } -} - -static -bool litGuardedByCharReach(const CharReach &cr, const hwlmLiteral &lit, - u32 max_offset) { - for (u32 i = 0; i <= max_offset && i < lit.s.length(); i++) { - unsigned char c = lit.s[i]; - if (lit.nocase) { - if (cr.test(mytoupper(c)) && cr.test(mytolower(c))) { - return true; - } - } else { - if (cr.test(c)) { - return true; - } - } - } - - return false; -} - -static -void findForwardAccelScheme(const vector &lits, - hwlm_group_t expected_groups, AccelAux *aux) { - DEBUG_PRINTF("building accel expected=%016llx\n", expected_groups); - u32 min_len = MAX_ACCEL_OFFSET; - vector filtered_lits; - - filterLits(lits, expected_groups, &filtered_lits, &min_len); - if (filtered_lits.empty()) { - return; - } - - if (findDVerm(filtered_lits, aux) - || findSVerm(filtered_lits, aux)) { - return; - } - - /* look for shufti/truffle */ - - vector reach(MAX_ACCEL_OFFSET, CharReach()); - for (const auto &lit : lits) { - if (!(lit.groups & expected_groups)) { - continue; - } - - u32 overhang = mask_overhang(lit); - for (u32 i = 0; i < overhang; i++) { - /* this offset overhangs the start of the real literal; look at the - * msk/cmp */ - for (u32 j = 0; j < N_CHARS; j++) { - if ((j & lit.msk[i]) == lit.cmp[i]) { - reach[i].set(j); - } - } - } - for (u32 i = overhang; i < MAX_ACCEL_OFFSET; i++) { - CharReach &reach_i = reach[i]; - u32 i_effective = i - overhang; - - if (litGuardedByCharReach(reach_i, lit, i_effective)) { - continue; - } - unsigned char c = i_effective < lit.s.length() ? 
lit.s[i_effective] - : lit.s.back(); - if (lit.nocase) { - reach_i.set(mytoupper(c)); - reach_i.set(mytolower(c)); - } else { - reach_i.set(c); - } - } - } - - u32 min_count = ~0U; - u32 min_offset = ~0U; - for (u32 i = 0; i < MAX_ACCEL_OFFSET; i++) { - size_t count = reach[i].count(); - DEBUG_PRINTF("offset %u is %s (reach %zu)\n", i, - describeClass(reach[i]).c_str(), count); - if (count < min_count) { - min_count = (u32)count; - min_offset = i; - } - } - - if (min_count > MAX_SHUFTI_WIDTH) { - DEBUG_PRINTF("FAIL: min shufti with %u chars is too wide\n", min_count); - return; - } - - const CharReach &cr = reach[min_offset]; - if (-1 != - shuftiBuildMasks(cr, (u8 *)&aux->shufti.lo, (u8 *)&aux->shufti.hi)) { - DEBUG_PRINTF("built shufti for %s (%zu chars, offset %u)\n", - describeClass(cr).c_str(), cr.count(), min_offset); - aux->shufti.accel_type = ACCEL_SHUFTI; - aux->shufti.offset = verify_u8(min_offset); - return; - } - - truffleBuildMasks(cr, (u8 *)&aux->truffle.mask1, (u8 *)&aux->truffle.mask2); - DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n", - describeClass(cr).c_str(), cr.count(), min_offset); - aux->truffle.accel_type = ACCEL_TRUFFLE; - aux->truffle.offset = verify_u8(min_offset); -} - -static -void buildForwardAccel(HWLM *h, const vector &lits, - hwlm_group_t expected_groups) { - findForwardAccelScheme(lits, expected_groups, &h->accel1); - findForwardAccelScheme(lits, HWLM_ALL_GROUPS, &h->accel0); - - h->accel1_groups = expected_groups; -} - static void dumpLits(UNUSED const vector &lits) { #ifdef DEBUG @@ -512,7 +79,6 @@ bool everyoneHasGroups(const vector &lits) { static bool isNoodleable(const vector &lits, - const hwlmStreamingControl *stream_control, const CompileContext &cc) { if (!cc.grey.allowNoodle) { return false; @@ -523,19 +89,6 @@ bool isNoodleable(const vector &lits, return false; } - if (stream_control) { // nullptr if in block mode - if (lits.front().s.length() > stream_control->history_max + 1) { - DEBUG_PRINTF("length of %zu too long for history max %zu\n", - lits.front().s.length(), - stream_control->history_max); - return false; - } - if (2 * lits.front().s.length() - 2 > FDR_TEMP_BUF_SIZE) { - assert(0); - return false; - } - } - if (!lits.front().msk.empty()) { DEBUG_PRINTF("noodle can't handle supplementary masks\n"); return false; @@ -544,23 +97,12 @@ bool isNoodleable(const vector &lits, return true; } -aligned_unique_ptr hwlmBuild(const vector &lits, - hwlmStreamingControl *stream_control, - bool make_small, const CompileContext &cc, - hwlm_group_t expected_groups) { +bytecode_ptr hwlmBuild(const vector &lits, bool make_small, + const CompileContext &cc, + UNUSED hwlm_group_t expected_groups) { assert(!lits.empty()); dumpLits(lits); - if (stream_control) { - assert(stream_control->history_min <= stream_control->history_max); - - // We should not have been passed any literals that are too long to - // match with a maximally-sized history buffer. - assert(all_of(begin(lits), end(lits), [&](const hwlmLiteral &lit) { - return lit.s.length() <= stream_control->history_max + 1; - })); - } - // Check that we haven't exceeded the maximum number of literals. 
if (lits.size() > cc.grey.limitLiteralCount) { throw ResourceLimitError(); @@ -595,29 +137,21 @@ aligned_unique_ptr hwlmBuild(const vector &lits, assert(everyoneHasGroups(lits)); - if (isNoodleable(lits, stream_control, cc)) { + if (isNoodleable(lits, cc)) { DEBUG_PRINTF("build noodle table\n"); engType = HWLM_ENGINE_NOOD; const hwlmLiteral &lit = lits.front(); auto noodle = noodBuildTable(lit); if (noodle) { - engSize = noodSize(noodle.get()); - } - if (stream_control) { - // For now, a single literal still goes to noodle and asks - // for a great big history - stream_control->literal_history_required = lit.s.length() - 1; - assert(stream_control->literal_history_required - <= stream_control->history_max); + engSize = noodle.size(); } eng = move(noodle); } else { DEBUG_PRINTF("building a new deal\n"); engType = HWLM_ENGINE_FDR; - auto fdr = fdrBuildTable(lits, make_small, cc.target_info, cc.grey, - stream_control); + auto fdr = fdrBuildTable(lits, make_small, cc.target_info, cc.grey); if (fdr) { - engSize = fdrSize(fdr.get()); + engSize = fdr.size(); } eng = move(fdr); } @@ -631,23 +165,12 @@ aligned_unique_ptr hwlmBuild(const vector &lits, throw ResourceLimitError(); } - auto h = aligned_zmalloc_unique(ROUNDUP_CL(sizeof(HWLM)) + engSize); + const size_t hwlm_len = ROUNDUP_CL(sizeof(HWLM)) + engSize; + auto h = make_zeroed_bytecode_ptr(hwlm_len, 64); h->type = engType; memcpy(HWLM_DATA(h.get()), eng.get(), engSize); - if (engType == HWLM_ENGINE_FDR && cc.grey.hamsterAccelForward) { - buildForwardAccel(h.get(), lits, expected_groups); - } - - if (stream_control) { - DEBUG_PRINTF("requires %zu (of max %zu) bytes of history\n", - stream_control->literal_history_required, - stream_control->history_max); - assert(stream_control->literal_history_required - <= stream_control->history_max); - } - return h; } diff --git a/src/hwlm/hwlm_build.h b/src/hwlm/hwlm_build.h index fbf359e60..f2691496e 100644 --- a/src/hwlm/hwlm_build.h +++ b/src/hwlm/hwlm_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,9 +34,8 @@ #define HWLM_BUILD_H #include "hwlm.h" -#include "hwlm_literal.h" #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include #include @@ -47,30 +46,12 @@ namespace ue2 { struct CompileContext; struct Grey; -struct target_t; - -/** \brief Structure gathering together the input/output parameters related to - * streaming mode operation. */ -struct hwlmStreamingControl { - /** \brief IN parameter: Upper limit on the amount of history that can be - * requested. */ - size_t history_max; - - /** \brief IN parameter: History already known to be used before literal - * analysis. */ - size_t history_min; - - /** \brief OUT parameter: History required by the literal matcher to - * correctly match all literals. */ - size_t literal_history_required; -}; +struct hwlmLiteral; /** \brief Build an \ref HWLM literal matcher runtime structure for a group of * literals. * * \param lits The group of literals. - * \param stream_control Streaming control parameters. If the matcher will - * operate in non-streaming (block) mode, this pointer should be NULL. * \param make_small Optimise matcher for small size. * \param cc Compile context. * \param expected_groups FIXME: document me! 
@@ -79,11 +60,9 @@ struct hwlmStreamingControl { * may result in a nullptr return value, or a std::bad_alloc exception being * thrown. */ -aligned_unique_ptr<HWLM> -hwlmBuild(const std::vector<hwlmLiteral> &lits, - hwlmStreamingControl *stream_control, bool make_small, - const CompileContext &cc, - hwlm_group_t expected_groups = HWLM_ALL_GROUPS); +bytecode_ptr<HWLM> hwlmBuild(const std::vector<hwlmLiteral> &lits, + bool make_small, const CompileContext &cc, + hwlm_group_t expected_groups = HWLM_ALL_GROUPS); /** * Returns an estimate of the number of repeated characters on the end of a diff --git a/src/hwlm/hwlm_literal.h b/src/hwlm/hwlm_literal.h index b7af99d32..0e2a1ea5d 100644 --- a/src/hwlm/hwlm_literal.h +++ b/src/hwlm/hwlm_literal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,12 +37,13 @@ #include "ue2common.h" #include <string> +#include <tuple> #include <vector> namespace ue2 { /** \brief Max length of the literal passed to HWLM. */ -#define HWLM_LITERAL_MAX_LEN 255 +#define HWLM_LITERAL_MAX_LEN 8 /** \brief Max length of the hwlmLiteral::msk and hwlmLiteral::cmp vectors. */ #define HWLM_MASKLEN 8 @@ -111,6 +112,19 @@ struct hwlmLiteral { : hwlmLiteral(s_in, nocase_in, false, id_in, HWLM_ALL_GROUPS, {}, {}) {} }; +inline +bool operator<(const hwlmLiteral &a, const hwlmLiteral &b) { + return std::tie(a.id, a.s, a.nocase, a.noruns, a.groups, a.msk, a.cmp) < + std::tie(b.id, b.s, b.nocase, b.noruns, b.groups, b.msk, b.cmp); +} + +inline +bool operator==(const hwlmLiteral &a, const hwlmLiteral &b) { + return a.id == b.id && a.s == b.s && a.nocase == b.nocase && + a.noruns == b.noruns && a.groups == b.groups && a.msk == b.msk && + a.cmp == b.cmp; +} + /** * Consistency test; returns false if the given msk/cmp test can never match * the literal string s.
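
The new `operator<` above is the std::tie idiom: build a tuple of references to the members, and let tuple's lexicographic `operator<` define the ordering, keeping it trivially consistent with `operator==` as long as both list the same members. A minimal standalone sketch of the idiom on a cut-down stand-in struct (`Lit` is illustrative, not the real hwlmLiteral):

```cpp
#include <cassert>
#include <string>
#include <tuple>
#include <vector>

struct Lit {
    unsigned id;
    std::string s;
    bool nocase;
    std::vector<unsigned char> msk;
};

// std::tie builds a tuple of references; tuple's operator< compares
// members lexicographically in the order listed.
inline bool operator<(const Lit &a, const Lit &b) {
    return std::tie(a.id, a.s, a.nocase, a.msk) <
           std::tie(b.id, b.s, b.nocase, b.msk);
}

inline bool operator==(const Lit &a, const Lit &b) {
    return std::tie(a.id, a.s, a.nocase, a.msk) ==
           std::tie(b.id, b.s, b.nocase, b.msk);
}

int main() {
    Lit a{1, "abc", false, {}};
    Lit b{1, "abd", false, {}};
    assert(a < b && !(a == b)); // "abc" sorts before "abd"
    return 0;
}
```
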
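The reworked hwlmBuild() shown earlier in this diff now returns a bytecode_ptr holding one cache-line-aligned, zeroed allocation: the HWLM header, rounded up to a cache line, followed by the engine bytecode copied in with memcpy. A rough standalone sketch of that single-allocation layout idiom, under stated assumptions: `Header`, `Blob`, `buildMatcher`, and `engineData` are hypothetical stand-ins, not Hyperscan API, and `std::aligned_alloc` needs C++17 on a non-Windows toolchain.

```cpp
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <memory>

namespace {

constexpr size_t CACHE_LINE = 64;

constexpr size_t roundUpCL(size_t n) {
    return (n + CACHE_LINE - 1) & ~(CACHE_LINE - 1);
}

// Stand-in for struct HWLM: a small header in front of engine bytecode.
struct Header {
    unsigned char type; // engine type tag, e.g. noodle vs. FDR
};

struct FreeDeleter {
    void operator()(Header *p) const { std::free(p); }
};
using Blob = std::unique_ptr<Header, FreeDeleter>;

// Engine bytecode lives on the first cache line after the header,
// mirroring HWLM_DATA(h): (char *)h + ROUNDUP_CL(sizeof(HWLM)).
unsigned char *engineData(Header *h) {
    return reinterpret_cast<unsigned char *>(h) + roundUpCL(sizeof(Header));
}

// One zeroed, cache-line-aligned allocation holding header + engine,
// analogous to make_zeroed_bytecode_ptr<HWLM>(len, 64) in the diff.
Blob buildMatcher(const void *engine, size_t engSize, unsigned char type) {
    // aligned_alloc requires the size to be a multiple of the alignment.
    const size_t len = roundUpCL(roundUpCL(sizeof(Header)) + engSize);
    void *mem = std::aligned_alloc(CACHE_LINE, len);
    if (!mem) {
        return nullptr;
    }
    std::memset(mem, 0, len);
    Blob h(static_cast<Header *>(mem));
    h->type = type;
    std::memcpy(engineData(h.get()), engine, engSize);
    return h;
}

} // namespace

int main() {
    const unsigned char fakeEngine[16] = {0x42};
    Blob m = buildMatcher(fakeEngine, sizeof(fakeEngine), 1);
    assert(m && m->type == 1 && engineData(m.get())[0] == 0x42);
    return 0;
}
```
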
diff --git a/src/hwlm/noodle_build.cpp b/src/hwlm/noodle_build.cpp index d2b4e3f20..63fdf0728 100644 --- a/src/hwlm/noodle_build.cpp +++ b/src/hwlm/noodle_build.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,6 @@ #include "hwlm_literal.h" #include "noodle_internal.h" -#include "util/alloc.h" #include "util/compare.h" #include "util/verify_types.h" #include "ue2common.h" @@ -67,7 +66,7 @@ size_t findNoodFragOffset(const hwlmLiteral &lit) { return offset; } -aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) { +bytecode_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) { if (!lit.msk.empty()) { DEBUG_PRINTF("noodle can't handle supplementary masks\n"); return nullptr; @@ -75,7 +74,7 @@ aligned_unique_ptr<noodTable> noodBuildTable(const hwlmLiteral &lit) { const auto &s = lit.s; size_t noodle_len = sizeof(noodTable) + s.length(); - auto n = aligned_zmalloc_unique<noodTable>(noodle_len); + auto n = make_zeroed_bytecode_ptr<noodTable>(noodle_len); assert(n); size_t key_offset = findNoodFragOffset(lit); diff --git a/src/hwlm/noodle_build.h b/src/hwlm/noodle_build.h index 1a41695f7..b5725f082 100644 --- a/src/hwlm/noodle_build.h +++ b/src/hwlm/noodle_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,11 +30,11 @@ * \brief Noodle literal matcher: build code. */ -#ifndef NOODLE_BUILD_H_048A1A6D585A9A -#define NOODLE_BUILD_H_048A1A6D585A9A +#ifndef NOODLE_BUILD_H +#define NOODLE_BUILD_H #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" struct noodTable; @@ -43,7 +43,7 @@ namespace ue2 { struct hwlmLiteral; /** \brief Construct a Noodle matcher for the given literal.
*/ -ue2::aligned_unique_ptr noodBuildTable(const hwlmLiteral &lit); +bytecode_ptr noodBuildTable(const hwlmLiteral &lit); size_t noodSize(const noodTable *n); @@ -61,5 +61,5 @@ void noodPrintStats(const noodTable *n, FILE *f); #endif // DUMP_SUPPORT -#endif /* NOODLE_BUILD_H_048A1A6D585A9A */ +#endif /* NOODLE_BUILD_H */ diff --git a/src/hwlm/noodle_engine.c b/src/hwlm/noodle_engine.c index 1d1ab4e68..9758f42b2 100644 --- a/src/hwlm/noodle_engine.c +++ b/src/hwlm/noodle_engine.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,8 +33,11 @@ #include "noodle_engine.h" #include "noodle_internal.h" #include "ue2common.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/compare.h" +#include "util/intrinsics.h" +#include "util/join.h" #include "util/masked_move.h" #include "util/simd_utils.h" @@ -50,6 +53,24 @@ struct cb_info { size_t offsetAdj; //!< used in streaming mode }; +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#define Z_BITS 64 +#define Z_TYPE u64a +#elif defined(HAVE_AVX2) +#define CHUNKSIZE 32 +#define MASK_TYPE m256 +#define Z_BITS 32 +#define Z_TYPE u32 +#else +#define CHUNKSIZE 16 +#define MASK_TYPE m128 +#define Z_BITS 32 +#define Z_TYPE u32 +#endif + + #define RETURN_IF_TERMINATED(x) \ { \ if ((x) == HWLM_TERMINATED) { \ @@ -60,8 +81,9 @@ struct cb_info { #define SINGLE_ZSCAN() \ do { \ while (unlikely(z)) { \ - u32 pos = findAndClearLSB_32(&z); \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ size_t matchPos = d - buf + pos; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ hwlmcb_rv_t rv = final(buf, len, key, 1, 0, 0, noCase, cbi, \ matchPos); \ RETURN_IF_TERMINATED(rv); \ @@ -71,8 +93,9 @@ struct cb_info { #define DOUBLE_ZSCAN() \ do { \ while (unlikely(z)) { \ - u32 pos = findAndClearLSB_32(&z); \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ size_t matchPos = d - buf + pos - 1; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ hwlmcb_rv_t rv = final(buf, len, key, keyLen, keyOffset, 1, \ noCase, cbi, matchPos); \ RETURN_IF_TERMINATED(rv); \ @@ -109,7 +132,11 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen, return HWLM_SUCCESS; } -#if defined(__AVX2__) +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#include "noodle_engine_avx512.c" +#elif defined(HAVE_AVX2) #define CHUNKSIZE 32 #define MASK_TYPE m256 #include "noodle_engine_avx2.c" @@ -122,12 +149,14 @@ hwlm_error_t final(const u8 *buf, size_t len, const u8 *key, size_t keyLen, static really_inline hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key, bool noCase, const struct cb_info *cbi) { - hwlm_error_t rv; - size_t end = len; const MASK_TYPE mask1 = getMask(key[0], noCase); const MASK_TYPE caseMask = getCaseMask(); +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + size_t end = len; + if (len < CHUNKSIZE) { rv = scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, len); return rv; @@ -172,13 +201,15 @@ hwlm_error_t scanSingleMain(const u8 *buf, size_t len, const u8 *key, cbi, s2End, end); return rv; +#else // HAVE_AVX512 + return scanSingle512(buf, len, key, noCase, caseMask, mask1, cbi); +#endif } static really_inline hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, size_t keyLen, size_t keyOffset, bool noCase, const struct cb_info *cbi) { - 
hwlm_error_t rv; // we stop scanning for the key-fragment when the rest of the key can't // possibly fit in the remaining buffer size_t end = len - keyLen + keyOffset + 2; @@ -187,6 +218,9 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, const MASK_TYPE mask1 = getMask(key[keyOffset + 0], noCase); const MASK_TYPE mask2 = getMask(key[keyOffset + 1], noCase); +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + if (end - keyOffset < CHUNKSIZE) { rv = scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, mask1, mask2, cbi, keyOffset, end); @@ -243,6 +277,10 @@ hwlm_error_t scanDoubleMain(const u8 *buf, size_t len, const u8 *key, caseMask, mask1, mask2, cbi, off, end); return rv; +#else // AVX512 + return scanDouble512(buf, len, key, keyLen, keyOffset, noCase, caseMask, + mask1, mask2, cbi, keyOffset, end); +#endif // AVX512 } diff --git a/src/hwlm/noodle_engine_avx2.c b/src/hwlm/noodle_engine_avx2.c index 14d0eab54..a3f46047e 100644 --- a/src/hwlm/noodle_engine_avx2.c +++ b/src/hwlm/noodle_engine_avx2.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -117,9 +117,9 @@ hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, if (l < 4) { u8 *vp = (u8*)&v; switch (l) { - case 3: vp[2] = d[2]; - case 2: vp[1] = d[1]; - case 1: vp[0] = d[0]; + case 3: vp[2] = d[2]; // fallthrough + case 2: vp[1] = d[1]; // fallthrough + case 1: vp[0] = d[0]; // fallthrough } } else { v = masked_move256_len(d, l); @@ -157,9 +157,9 @@ hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, if (l < 4) { u8 *vp = (u8*)&v; switch (l) { - case 3: vp[2] = d[2]; - case 2: vp[1] = d[1]; - case 1: vp[0] = d[0]; + case 3: vp[2] = d[2]; // fallthrough + case 2: vp[1] = d[1]; // fallthrough + case 1: vp[0] = d[0]; // fallthrough } } else { v = masked_move256_len(d, l); diff --git a/src/hwlm/noodle_engine_avx512.c b/src/hwlm/noodle_engine_avx512.c new file mode 100644 index 000000000..d4e6527f8 --- /dev/null +++ b/src/hwlm/noodle_engine_avx512.c @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* noodle scan parts for AVX512 */ + +static really_inline +m512 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set64x8(k); +} + +static really_inline +m512 getCaseMask(void) { + return set64x8(CASE_CLEAR); +} + +// The short scan routine. It is used both to scan data up to an +// alignment boundary if needed and to finish off data that the aligned scan +// function can't handle (due to small/unaligned chunk at end) +static really_inline +hwlm_error_t scanSingleShort(const u8 *buf, size_t len, const u8 *key, + bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + DEBUG_PRINTF("scan_len %zu\n", scan_len); + assert(scan_len <= 64); + if (!scan_len) { + return HWLM_SUCCESS; + } + + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 v = loadu_maskz_m512(k, d); + + if (noCase) { + v = and512(v, caseMask); + } + + // reuse the load mask to indicate valid bytes + u64a z = masked_eq512mask(k, mask1, v); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingle512(const u8 *buf, size_t len, const u8 *key, + bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi) { + const u8 *d = buf; + const u8 *e = buf + len; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, 0, + d1 - d) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p \n", d, e); + m512 v = noCase ? 
and512(load512(d), caseMask) : load512(d); + + u64a z = eq512mask(mask1, v); + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p \n", d, e); + // finish off tail + + return scanSingleShort(buf, len, key, noCase, caseMask, mask1, cbi, d - buf, + e - buf); +} + +static really_inline +hwlm_error_t scanDoubleShort(const u8 *buf, size_t len, const u8 *key, + size_t keyLen, size_t keyOffset, bool noCase, + m512 caseMask, m512 mask1, m512 mask2, + const struct cb_info *cbi, u64a *lastz0, + size_t start, size_t end) { + DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0); + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + if (!scan_len) { + return HWLM_SUCCESS; + } + assert(scan_len <= 64); + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len); + + m512 v = loadu_maskz_m512(k, d); + if (noCase) { + v = and512(v, caseMask); + } + + u64a z0 = masked_eq512mask(k, mask1, v); + u64a z1 = masked_eq512mask(k, mask2, v); + u64a z = (*lastz0 | (z0 << 1)) & z1; + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + *lastz0 = z0 >> (scan_len - 1); + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDouble512(const u8 *buf, size_t len, const u8 *key, + size_t keyLen, size_t keyOffset, bool noCase, + m512 caseMask, m512 mask1, m512 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + const u8 *e = buf + end; + u64a lastz0 = 0; + DEBUG_PRINTF("start %zu end %zu \n", start, end); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, + mask1, mask2, cbi, &lastz0, start, + d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0); + m512 v = noCase ? and512(load512(d), caseMask) : load512(d); + + /* we have to pull the masks out of the AVX registers because we can't + byte shift between the lanes */ + u64a z0 = eq512mask(mask1, v); + u64a z1 = eq512mask(mask2, v); + u64a z = (lastz0 | (z0 << 1)) & z1; + lastz0 = z0 >> 63; + + // On large packet buffers, this prefetch appears to get us about 2%. 
+ __builtin_prefetch(d + 256); + + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf); + // finish off tail + + return scanDoubleShort(buf, len, key, keyLen, keyOffset, noCase, caseMask, + mask1, mask2, cbi, &lastz0, d - buf, end); +} diff --git a/src/nfa/accel.c b/src/nfa/accel.c index 99eab11dc..2bc60945f 100644 --- a/src/nfa/accel.c +++ b/src/nfa/accel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,9 +30,6 @@ #include "shufti.h" #include "truffle.h" #include "vermicelli.h" -#include "multishufti.h" -#include "multitruffle.h" -#include "multivermicelli.h" #include "ue2common.h" const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { @@ -132,220 +129,6 @@ const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { rv = c_end; break; - /* multibyte matchers */ - case ACCEL_MLVERM: - DEBUG_PRINTF("accel mlverm %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = long_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); - break; - case ACCEL_MLVERM_NOCASE: - DEBUG_PRINTF("accel mlverm nc %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = long_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); - break; - case ACCEL_MLGVERM: - DEBUG_PRINTF("accel mlgverm %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = longgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); - break; - case ACCEL_MLGVERM_NOCASE: - DEBUG_PRINTF("accel mlgverm nc %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = longgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); - break; - case ACCEL_MSVERM: - DEBUG_PRINTF("accel msverm %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shift_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); - break; - case ACCEL_MSVERM_NOCASE: - DEBUG_PRINTF("accel msverm nc %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shift_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); - break; - case ACCEL_MSGVERM: - DEBUG_PRINTF("accel msgverm %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shiftgrab_vermicelliExec(accel->mverm.c, 0, c, c_end, accel->mverm.len); - break; - case ACCEL_MSGVERM_NOCASE: - DEBUG_PRINTF("accel msgverm nc %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shiftgrab_vermicelliExec(accel->mverm.c, 1, c, c_end, accel->mverm.len); - break; - case ACCEL_MDSVERM: - DEBUG_PRINTF("accel mdsverm %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshift_vermicelliExec(accel->mdverm.c, 0, c, c_end, - accel->mdverm.len1, accel->mdverm.len2); - break; - case ACCEL_MDSVERM_NOCASE: - DEBUG_PRINTF("accel mdsverm nc %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshift_vermicelliExec(accel->mdverm.c, 1, c, c_end, - accel->mdverm.len1, accel->mdverm.len2); - break; - case ACCEL_MDSGVERM: - DEBUG_PRINTF("accel mdsgverm %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 0, c, c_end, - accel->mdverm.len1, accel->mdverm.len2); - break; - case ACCEL_MDSGVERM_NOCASE: - DEBUG_PRINTF("accel mdsgverm nc %p %p\n", c, 
c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshiftgrab_vermicelliExec(accel->mdverm.c, 1, c, c_end, - accel->mdverm.len1, accel->mdverm.len2); - break; - case ACCEL_MLSHUFTI: - DEBUG_PRINTF("accel mlshufti %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = long_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, - accel->mshufti.len); - break; - case ACCEL_MLGSHUFTI: - DEBUG_PRINTF("accel mlgshufti %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = longgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, - accel->mshufti.len); - break; - case ACCEL_MSSHUFTI: - DEBUG_PRINTF("accel msshufti %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shift_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, - accel->mshufti.len); - break; - case ACCEL_MSGSHUFTI: - DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shiftgrab_shuftiExec(accel->mshufti.lo, accel->mshufti.hi, c, c_end, - accel->mshufti.len); - break; - case ACCEL_MDSSHUFTI: - DEBUG_PRINTF("accel mdsshufti %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshift_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end, - accel->mdshufti.len1, accel->mdshufti.len2); - break; - case ACCEL_MDSGSHUFTI: - DEBUG_PRINTF("accel msgshufti %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshiftgrab_shuftiExec(accel->mdshufti.lo, accel->mdshufti.hi, c, c_end, - accel->mdshufti.len1, accel->mdshufti.len2); - break; - case ACCEL_MLTRUFFLE: - DEBUG_PRINTF("accel mltruffle %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = long_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2, - c, c_end, accel->mtruffle.len); - break; - case ACCEL_MLGTRUFFLE: - DEBUG_PRINTF("accel mlgtruffle %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = longgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2, - c, c_end, accel->mtruffle.len); - break; - case ACCEL_MSTRUFFLE: - DEBUG_PRINTF("accel mstruffle %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shift_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2, - c, c_end, accel->mtruffle.len); - break; - case ACCEL_MSGTRUFFLE: - DEBUG_PRINTF("accel msgtruffle %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = shiftgrab_truffleExec(accel->mtruffle.mask1, accel->mtruffle.mask2, - c, c_end, accel->mtruffle.len); - break; - case ACCEL_MDSTRUFFLE: - DEBUG_PRINTF("accel mdstruffle %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshift_truffleExec(accel->mdtruffle.mask1, - accel->mdtruffle.mask2, c, c_end, - accel->mdtruffle.len1, - accel->mdtruffle.len2); - break; - case ACCEL_MDSGTRUFFLE: - DEBUG_PRINTF("accel mdsgtruffle %p %p\n", c, c_end); - if (c + 15 >= c_end) { - return c; - } - - rv = doubleshiftgrab_truffleExec(accel->mdtruffle.mask1, - accel->mdtruffle.mask2, c, c_end, - accel->mdtruffle.len1, - accel->mdtruffle.len2); - break; - default: assert(!"not here"); diff --git a/src/nfa/accel.h b/src/nfa/accel.h index a13563b68..3a03d0596 100644 --- a/src/nfa/accel.h +++ b/src/nfa/accel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -61,36 +61,7 @@ enum AccelType { ACCEL_DSHUFTI, 
ACCEL_TRUFFLE, ACCEL_RED_TAPE, - /* multibyte vermicellis */ - ACCEL_MLVERM, - ACCEL_MLVERM_NOCASE, - ACCEL_MLGVERM, - ACCEL_MLGVERM_NOCASE, - ACCEL_MSVERM, - ACCEL_MSVERM_NOCASE, - ACCEL_MSGVERM, - ACCEL_MSGVERM_NOCASE, - ACCEL_MDSVERM, - ACCEL_MDSVERM_NOCASE, - ACCEL_MDSGVERM, - ACCEL_MDSGVERM_NOCASE, - /* multibyte shuftis */ - ACCEL_MLSHUFTI, - ACCEL_MLGSHUFTI, - ACCEL_MSSHUFTI, - ACCEL_MSGSHUFTI, - ACCEL_MDSSHUFTI, - ACCEL_MDSGSHUFTI, - /* multibyte truffles */ - ACCEL_MLTRUFFLE, - ACCEL_MLGTRUFFLE, - ACCEL_MSTRUFFLE, - ACCEL_MSGTRUFFLE, - ACCEL_MDSTRUFFLE, - ACCEL_MDSGTRUFFLE, - /* masked dverm */ ACCEL_DVERM_MASKED, - }; /** \brief Structure for accel framework. */ @@ -140,42 +111,12 @@ union AccelAux { m128 lo2; m128 hi2; } dshufti; - struct { - u8 accel_type; - u8 offset; - m128 lo; - m128 hi; - u8 len; - } mshufti; - struct { - u8 accel_type; - u8 offset; - m128 lo; - m128 hi; - u8 len1; - u8 len2; - } mdshufti; struct { u8 accel_type; u8 offset; m128 mask1; m128 mask2; } truffle; - struct { - u8 accel_type; - u8 offset; - m128 mask1; - m128 mask2; - u8 len; - } mtruffle; - struct { - u8 accel_type; - u8 offset; - m128 mask1; - m128 mask2; - u8 len1; - u8 len2; - } mdtruffle; }; /** diff --git a/src/nfa/accel_dfa_build_strat.cpp b/src/nfa/accel_dfa_build_strat.cpp index d257b530b..7c56ba723 100644 --- a/src/nfa/accel_dfa_build_strat.cpp +++ b/src/nfa/accel_dfa_build_strat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,9 +33,11 @@ #include "nfagraph/ng_limex_accel.h" #include "shufticompile.h" #include "trufflecompile.h" +#include "util/accel_scheme.h" #include "util/charreach.h" #include "util/container.h" #include "util/dump_charclass.h" +#include "util/small_vector.h" #include "util/verify_types.h" #include @@ -49,16 +51,15 @@ namespace ue2 { namespace { struct path { - vector reach; + small_vector reach; dstate_id_t dest = DEAD_STATE; - explicit path(dstate_id_t base) : dest(base) { - } + explicit path(dstate_id_t base) : dest(base) {} }; }; -static -void dump_paths(const vector &paths) { - for (UNUSED const auto &p : paths) { +template +void dump_paths(const Container &paths) { + for (UNUSED const path &p : paths) { DEBUG_PRINTF("[%s] -> %u\n", describeClasses(p.reach).c_str(), p.dest); } DEBUG_PRINTF("%zu paths\n", paths.size()); @@ -113,17 +114,17 @@ void extend(const raw_dfa &rdfa, const path &p, } else { path pp = append(p, CharReach(), p.dest); all[p.dest].push_back(pp); - out.push_back(pp); + out.push_back(move(pp)); } } if (!s.reports_eod.empty()) { path pp = append(p, CharReach(), p.dest); all[p.dest].push_back(pp); - out.push_back(pp); + out.push_back(move(pp)); } - map dest; + flat_map dest; for (unsigned i = 0; i < N_CHARS; i++) { u32 succ = s.next[rdfa.alpha_remap[i]]; dest[succ].set(i); @@ -140,7 +141,7 @@ void extend(const raw_dfa &rdfa, const path &p, DEBUG_PRINTF("----good: [%s] -> %u\n", describeClasses(pp.reach).c_str(), pp.dest); all[e.first].push_back(pp); - out.push_back(pp); + out.push_back(move(pp)); } } @@ -162,8 +163,10 @@ vector> generate_paths(const raw_dfa &rdfa, dump_paths(paths); vector> rv; + rv.reserve(paths.size()); for (auto &p : paths) { - rv.push_back(move(p.reach)); + rv.push_back(vector(std::make_move_iterator(p.reach.begin()), + std::make_move_iterator(p.reach.end()))); } return rv; } @@ -327,7 +330,7 @@ 
accel_dfa_build_strat::find_escape_strings(dstate_id_t this_idx) const { const dstate &raw = rdfa.states[this_idx]; const vector rev_map = reverse_alpha_remapping(rdfa); bool outs2_broken = false; - map succs; + flat_map succs; for (u32 i = 0; i < rev_map.size(); i++) { if (raw.next[i] == this_idx) { @@ -379,16 +382,18 @@ accel_dfa_build_strat::find_escape_strings(dstate_id_t this_idx) const { for (auto jj = cr_all_j.find_first(); jj != CharReach::npos; jj = cr_all_j.find_next(jj)) { rv.double_byte.emplace((u8)ii, (u8)jj); + if (rv.double_byte.size() > 8) { + DEBUG_PRINTF("outs2 too big\n"); + outs2_broken = true; + goto done; + } } } } } - if (rv.double_byte.size() > 8) { - DEBUG_PRINTF("outs2 too big\n"); - outs2_broken = true; - } - + done: + assert(outs2_broken || rv.double_byte.size() <= 8); if (outs2_broken) { rv.double_byte.clear(); } @@ -536,17 +541,17 @@ accel_dfa_build_strat::getAccelInfo(const Grey &grey) { dstate_id_t sds_proxy = get_sds_or_proxy(rdfa); DEBUG_PRINTF("sds %hu\n", sds_proxy); - for (size_t i = 0; i < rdfa.states.size(); i++) { + /* Find accel info for a single state. */ + auto do_state = [&](size_t i) { if (i == DEAD_STATE) { - continue; + return; } /* Note on report acceleration states: While we can't accelerate while - * we - * are spamming out callbacks, the QR code paths don't raise reports + * we are spamming out callbacks, the QR code paths don't raise reports * during scanning so they can accelerate report states. */ if (generates_callbacks(rdfa.kind) && !rdfa.states[i].reports.empty()) { - continue; + return; } size_t single_limit = @@ -557,15 +562,28 @@ accel_dfa_build_strat::getAccelInfo(const Grey &grey) { if (ei.cr.count() > single_limit) { DEBUG_PRINTF("state %zu is not accelerable has %zu\n", i, ei.cr.count()); - continue; + return; } DEBUG_PRINTF("state %zu should be accelerable %zu\n", i, ei.cr.count()); rv[i] = ei; + }; + + if (only_accel_init) { + DEBUG_PRINTF("only computing accel for init states\n"); + do_state(rdfa.start_anchored); + if (rdfa.start_floating != rdfa.start_anchored) { + do_state(rdfa.start_floating); + } + } else { + DEBUG_PRINTF("computing accel for all states\n"); + for (size_t i = 0; i < rdfa.states.size(); i++) { + do_state(i); + } } - /* provide accleration states to states in the region of sds */ + /* provide acceleration states to states in the region of sds */ if (contains(rv, sds_proxy)) { AccelScheme sds_ei = rv[sds_proxy]; sds_ei.double_byte.clear(); /* region based on single byte scheme diff --git a/src/nfa/accel_dfa_build_strat.h b/src/nfa/accel_dfa_build_strat.h index 3cfaf2725..881892ed4 100644 --- a/src/nfa/accel_dfa_build_strat.h +++ b/src/nfa/accel_dfa_build_strat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -43,8 +43,8 @@ struct Grey; class accel_dfa_build_strat : public dfa_build_strat { public: - explicit accel_dfa_build_strat(const ReportManager &rm_in) - : dfa_build_strat(rm_in) {} + accel_dfa_build_strat(const ReportManager &rm_in, bool only_accel_init_in) + : dfa_build_strat(rm_in), only_accel_init(only_accel_init_in) {} virtual AccelScheme find_escape_strings(dstate_id_t this_idx) const; virtual size_t accelSize(void) const = 0; virtual u32 max_allowed_offset_accel() const = 0; @@ -53,6 +53,8 @@ class accel_dfa_build_strat : public dfa_build_strat { virtual void buildAccel(dstate_id_t 
this_idx, const AccelScheme &info, void *accel_out); virtual std::map getAccelInfo(const Grey &grey); +private: + bool only_accel_init; }; } // namespace ue2 diff --git a/src/nfa/accel_dump.cpp b/src/nfa/accel_dump.cpp index e99e71a59..0d19fa8c6 100644 --- a/src/nfa/accel_dump.cpp +++ b/src/nfa/accel_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -93,54 +93,6 @@ const char *accelName(u8 accel_type) { return "truffle"; case ACCEL_RED_TAPE: return "red tape"; - case ACCEL_MLVERM: - return "multibyte long vermicelli"; - case ACCEL_MLVERM_NOCASE: - return "multibyte long vermicelli nocase"; - case ACCEL_MLGVERM: - return "multibyte long-grab vermicelli"; - case ACCEL_MLGVERM_NOCASE: - return "multibyte long-grab vermicelli nocase"; - case ACCEL_MSVERM: - return "multibyte shift vermicelli"; - case ACCEL_MSVERM_NOCASE: - return "multibyte shift vermicelli nocase"; - case ACCEL_MSGVERM: - return "multibyte shift-grab vermicelli"; - case ACCEL_MSGVERM_NOCASE: - return "multibyte shift-grab vermicelli nocase"; - case ACCEL_MDSVERM: - return "multibyte doubleshift vermicelli"; - case ACCEL_MDSVERM_NOCASE: - return "multibyte doubleshift vermicelli nocase"; - case ACCEL_MDSGVERM: - return "multibyte doubleshift-grab vermicelli"; - case ACCEL_MDSGVERM_NOCASE: - return "multibyte doubleshift-grab vermicelli nocase"; - case ACCEL_MLSHUFTI: - return "multibyte long shufti"; - case ACCEL_MLGSHUFTI: - return "multibyte long-grab shufti"; - case ACCEL_MSSHUFTI: - return "multibyte shift shufti"; - case ACCEL_MSGSHUFTI: - return "multibyte shift-grab shufti"; - case ACCEL_MDSSHUFTI: - return "multibyte doubleshift shufti"; - case ACCEL_MDSGSHUFTI: - return "multibyte doubleshift-grab shufti"; - case ACCEL_MLTRUFFLE: - return "multibyte long truffle"; - case ACCEL_MLGTRUFFLE: - return "multibyte long-grab truffle"; - case ACCEL_MSTRUFFLE: - return "multibyte shift truffle"; - case ACCEL_MSGTRUFFLE: - return "multibyte shift-grab truffle"; - case ACCEL_MDSTRUFFLE: - return "multibyte doubleshift truffle"; - case ACCEL_MDSGTRUFFLE: - return "multibyte doubleshift-grab truffle"; default: return "unknown!"; } @@ -283,59 +235,6 @@ void dumpAccelInfo(FILE *f, const AccelAux &accel) { (const u8 *)&accel.truffle.mask2); break; } - case ACCEL_MLVERM: - case ACCEL_MLVERM_NOCASE: - case ACCEL_MLGVERM: - case ACCEL_MLGVERM_NOCASE: - case ACCEL_MSVERM: - case ACCEL_MSVERM_NOCASE: - case ACCEL_MSGVERM: - case ACCEL_MSGVERM_NOCASE: - fprintf(f, " [\\x%02hhx] len:%u\n", accel.mverm.c, accel.mverm.len); - break; - case ACCEL_MDSVERM: - case ACCEL_MDSVERM_NOCASE: - case ACCEL_MDSGVERM: - case ACCEL_MDSGVERM_NOCASE: - fprintf(f, " [\\x%02hhx] len1:%u len2:%u\n", accel.mdverm.c, accel.mdverm.len1, - accel.mdverm.len2); - break; - case ACCEL_MLSHUFTI: - case ACCEL_MLGSHUFTI: - case ACCEL_MSSHUFTI: - case ACCEL_MSGSHUFTI: - fprintf(f, " len:%u\n", accel.mshufti.len); - dumpShuftiMasks(f, (const u8 *)&accel.mshufti.lo, - (const u8 *)&accel.mshufti.hi); - dumpShuftiCharReach(f, (const u8 *)&accel.mshufti.lo, - (const u8 *)&accel.mshufti.hi); - break; - case ACCEL_MDSSHUFTI: - case ACCEL_MDSGSHUFTI: - fprintf(f, " len1:%u len2:%u\n", accel.mdshufti.len1, accel.mdshufti.len2); - dumpShuftiMasks(f, (const u8 *)&accel.mdshufti.lo, - (const u8 *)&accel.mdshufti.hi); - dumpShuftiCharReach(f, (const u8 
*)&accel.mdshufti.lo, - (const u8 *)&accel.mdshufti.hi); - break; - case ACCEL_MLTRUFFLE: - case ACCEL_MLGTRUFFLE: - case ACCEL_MSTRUFFLE: - case ACCEL_MSGTRUFFLE: - fprintf(f, " len:%u\n", accel.mtruffle.len); - dumpTruffleMasks(f, (const u8 *)&accel.mtruffle.mask1, - (const u8 *)&accel.mtruffle.mask2); - dumpTruffleCharReach(f, (const u8 *)&accel.mtruffle.mask1, - (const u8 *)&accel.mtruffle.mask2); - break; - case ACCEL_MDSTRUFFLE: - case ACCEL_MDSGTRUFFLE: - fprintf(f, " len1:%u len2:%u\n", accel.mdtruffle.len1, accel.mdtruffle.len2); - dumpTruffleMasks(f, (const u8 *)&accel.mdtruffle.mask1, - (const u8 *)&accel.mdtruffle.mask2); - dumpTruffleCharReach(f, (const u8 *)&accel.mdtruffle.mask1, - (const u8 *)&accel.mdtruffle.mask2); - break; default: fprintf(f, "\n"); break; diff --git a/src/nfa/accelcompile.cpp b/src/nfa/accelcompile.cpp index 32e569ba9..a224410dc 100644 --- a/src/nfa/accelcompile.cpp +++ b/src/nfa/accelcompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -225,274 +225,6 @@ void buildAccelDouble(const AccelInfo &info, AccelAux *aux) { aux->accel_type = ACCEL_NONE; } -static -void buildAccelMulti(const AccelInfo &info, AccelAux *aux) { - if (info.ma_type == MultibyteAccelInfo::MAT_NONE) { - DEBUG_PRINTF("no multimatch for us :("); - return; - } - - u32 offset = info.multiaccel_offset; - const CharReach &stops = info.multiaccel_stops; - - assert(aux->accel_type == ACCEL_NONE); - if (stops.all()) { - return; - } - - size_t outs = stops.count(); - DEBUG_PRINTF("%zu outs\n", outs); - assert(outs && outs < 256); - - switch (info.ma_type) { - case MultibyteAccelInfo::MAT_LONG: - if (outs == 1) { - aux->accel_type = ACCEL_MLVERM; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first(); - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c); - return; - } - if (outs == 2 && stops.isCaselessChar()) { - aux->accel_type = ACCEL_MLVERM_NOCASE; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first() & CASE_CLEAR; - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n", - aux->verm.c); - return; - } - break; - case MultibyteAccelInfo::MAT_LONGGRAB: - if (outs == 1) { - aux->accel_type = ACCEL_MLGVERM; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first(); - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c); - return; - } - if (outs == 2 && stops.isCaselessChar()) { - aux->accel_type = ACCEL_MLGVERM_NOCASE; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first() & CASE_CLEAR; - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n", - aux->verm.c); - return; - } - break; - case MultibyteAccelInfo::MAT_SHIFT: - if (outs == 1) { - aux->accel_type = ACCEL_MSVERM; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first(); - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c); - return; - } - if (outs == 2 && stops.isCaselessChar()) { - aux->accel_type = ACCEL_MSVERM_NOCASE; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first() & CASE_CLEAR; - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n", - aux->verm.c); - return; - } - break; - case 
MultibyteAccelInfo::MAT_SHIFTGRAB: - if (outs == 1) { - aux->accel_type = ACCEL_MSGVERM; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first(); - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c); - return; - } - if (outs == 2 && stops.isCaselessChar()) { - aux->accel_type = ACCEL_MSGVERM_NOCASE; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first() & CASE_CLEAR; - aux->mverm.len = info.ma_len1; - DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n", - aux->verm.c); - return; - } - break; - case MultibyteAccelInfo::MAT_DSHIFT: - if (outs == 1) { - aux->accel_type = ACCEL_MDSVERM; - aux->mdverm.offset = offset; - aux->mdverm.c = stops.find_first(); - aux->mdverm.len1 = info.ma_len1; - aux->mdverm.len2 = info.ma_len2; - DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c); - return; - } - if (outs == 2 && stops.isCaselessChar()) { - aux->accel_type = ACCEL_MDSVERM_NOCASE; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first() & CASE_CLEAR; - aux->mdverm.len1 = info.ma_len1; - aux->mdverm.len2 = info.ma_len2; - DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n", - aux->verm.c); - return; - } - break; - case MultibyteAccelInfo::MAT_DSHIFTGRAB: - if (outs == 1) { - aux->accel_type = ACCEL_MDSGVERM; - aux->mdverm.offset = offset; - aux->mdverm.c = stops.find_first(); - aux->mdverm.len1 = info.ma_len1; - aux->mdverm.len2 = info.ma_len2; - DEBUG_PRINTF("building vermicelli caseful for 0x%02hhx\n", aux->verm.c); - return; - } - if (outs == 2 && stops.isCaselessChar()) { - aux->accel_type = ACCEL_MDSGVERM_NOCASE; - aux->mverm.offset = offset; - aux->mverm.c = stops.find_first() & CASE_CLEAR; - aux->mdverm.len1 = info.ma_len1; - aux->mdverm.len2 = info.ma_len2; - DEBUG_PRINTF("building vermicelli caseless for 0x%02hhx\n", - aux->verm.c); - return; - } - break; - default: - // shouldn't happen - assert(0); - return; - } - - DEBUG_PRINTF("attempting shufti for %zu chars\n", outs); - - switch (info.ma_type) { - case MultibyteAccelInfo::MAT_LONG: - if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, - (u8 *)&aux->mshufti.hi) == -1) { - break; - } - aux->accel_type = ACCEL_MLSHUFTI; - aux->mshufti.offset = offset; - aux->mshufti.len = info.ma_len1; - return; - case MultibyteAccelInfo::MAT_LONGGRAB: - if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, - (u8 *)&aux->mshufti.hi) == -1) { - break; - } - aux->accel_type = ACCEL_MLGSHUFTI; - aux->mshufti.offset = offset; - aux->mshufti.len = info.ma_len1; - return; - case MultibyteAccelInfo::MAT_SHIFT: - if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, - (u8 *)&aux->mshufti.hi) == -1) { - break; - } - aux->accel_type = ACCEL_MSSHUFTI; - aux->mshufti.offset = offset; - aux->mshufti.len = info.ma_len1; - return; - case MultibyteAccelInfo::MAT_SHIFTGRAB: - if (shuftiBuildMasks(stops, (u8 *)&aux->mshufti.lo, - (u8 *)&aux->mshufti.hi) == -1) { - break; - } - aux->accel_type = ACCEL_MSGSHUFTI; - aux->mshufti.offset = offset; - aux->mshufti.len = info.ma_len1; - return; - case MultibyteAccelInfo::MAT_DSHIFT: - if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo, - (u8 *)&aux->mdshufti.hi) == -1) { - break; - } - aux->accel_type = ACCEL_MDSSHUFTI; - aux->mdshufti.offset = offset; - aux->mdshufti.len1 = info.ma_len1; - aux->mdshufti.len2 = info.ma_len2; - return; - case MultibyteAccelInfo::MAT_DSHIFTGRAB: - if (shuftiBuildMasks(stops, (u8 *)&aux->mdshufti.lo, - (u8 *)&aux->mdshufti.hi) == -1) { - break; - } - aux->accel_type = ACCEL_MDSGSHUFTI; 
- aux->mdshufti.offset = offset; - aux->mdshufti.len1 = info.ma_len1; - aux->mdshufti.len2 = info.ma_len2; - return; - default: - // shouldn't happen - assert(0); - return; - } - DEBUG_PRINTF("shufti build failed, falling through\n"); - - if (outs <= ACCEL_MAX_STOP_CHAR) { - DEBUG_PRINTF("building Truffle for %zu chars\n", outs); - switch (info.ma_type) { - case MultibyteAccelInfo::MAT_LONG: - aux->accel_type = ACCEL_MLTRUFFLE; - aux->mtruffle.offset = offset; - aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, - (u8 *)&aux->mtruffle.mask2); - break; - case MultibyteAccelInfo::MAT_LONGGRAB: - aux->accel_type = ACCEL_MLGTRUFFLE; - aux->mtruffle.offset = offset; - aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, - (u8 *)&aux->mtruffle.mask2); - break; - case MultibyteAccelInfo::MAT_SHIFT: - aux->accel_type = ACCEL_MSTRUFFLE; - aux->mtruffle.offset = offset; - aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, - (u8 *)&aux->mtruffle.mask2); - break; - case MultibyteAccelInfo::MAT_SHIFTGRAB: - aux->accel_type = ACCEL_MSGTRUFFLE; - aux->mtruffle.offset = offset; - aux->mtruffle.len = info.ma_len1; - truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, - (u8 *)&aux->mtruffle.mask2); - break; - case MultibyteAccelInfo::MAT_DSHIFT: - aux->accel_type = ACCEL_MDSTRUFFLE; - aux->mdtruffle.offset = offset; - aux->mdtruffle.len1 = info.ma_len1; - aux->mdtruffle.len2 = info.ma_len2; - truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, - (u8 *)&aux->mdtruffle.mask2); - break; - case MultibyteAccelInfo::MAT_DSHIFTGRAB: - aux->accel_type = ACCEL_MDSGTRUFFLE; - aux->mdtruffle.offset = offset; - aux->mdtruffle.len1 = info.ma_len1; - aux->mdtruffle.len2 = info.ma_len2; - truffleBuildMasks(stops, (u8 *)&aux->mtruffle.mask1, - (u8 *)&aux->mdtruffle.mask2); - break; - default: - // shouldn't happen - assert(0); - return; - } - return; - } - - DEBUG_PRINTF("unable to accelerate multibyte case with %zu outs\n", outs); -} - bool buildAccelAux(const AccelInfo &info, AccelAux *aux) { assert(aux->accel_type == ACCEL_NONE); if (info.single_stops.none()) { @@ -500,9 +232,6 @@ bool buildAccelAux(const AccelInfo &info, AccelAux *aux) { aux->accel_type = ACCEL_RED_TAPE; aux->generic.offset = info.single_offset; } - if (aux->accel_type == ACCEL_NONE) { - buildAccelMulti(info, aux); - } if (aux->accel_type == ACCEL_NONE) { buildAccelDouble(info, aux); } diff --git a/src/nfa/accelcompile.h b/src/nfa/accelcompile.h index 9b30146cd..9bd4ff18d 100644 --- a/src/nfa/accelcompile.h +++ b/src/nfa/accelcompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,30 +37,9 @@ union AccelAux; namespace ue2 { -struct MultibyteAccelInfo { - /* multibyte accel schemes, ordered by strength */ - enum multiaccel_type { - MAT_SHIFT, - MAT_SHIFTGRAB, - MAT_DSHIFT, - MAT_DSHIFTGRAB, - MAT_LONG, - MAT_LONGGRAB, - MAT_MAX, - MAT_NONE = MAT_MAX - }; - CharReach cr; - u32 offset = 0; - u32 len1 = 0; - u32 len2 = 0; - multiaccel_type type = MAT_NONE; -}; - struct AccelInfo { AccelInfo() : single_offset(0U), double_offset(0U), - single_stops(CharReach::dot()), - multiaccel_offset(0), ma_len1(0), ma_len2(0), - ma_type(MultibyteAccelInfo::MAT_NONE) {} + single_stops(CharReach::dot()) {} u32 single_offset; /**< offset 
correction to apply to single schemes */ u32 double_offset; /**< offset correction to apply to double schemes */ CharReach double_stop1; /**< single-byte accel stop literals for double @@ -68,11 +47,6 @@ struct AccelInfo { flat_set> double_stop2; /**< double-byte accel stop * literals */ CharReach single_stops; /**< escapes for single byte acceleration */ - u32 multiaccel_offset; /**< offset correction to apply to multibyte schemes */ - CharReach multiaccel_stops; /**< escapes for multibyte acceleration */ - u32 ma_len1; /**< multiaccel len1 */ - u32 ma_len2; /**< multiaccel len2 */ - MultibyteAccelInfo::multiaccel_type ma_type; /**< multiaccel type */ }; bool buildAccelAux(const AccelInfo &info, AccelAux *aux); diff --git a/src/nfa/castlecompile.cpp b/src/nfa/castlecompile.cpp index 3b40ab9a8..40fbc18cb 100644 --- a/src/nfa/castlecompile.cpp +++ b/src/nfa/castlecompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,9 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Castle: multi-tenant repeat engine, compiler code. */ + #include "castlecompile.h" #include "castle_internal.h" @@ -439,7 +441,7 @@ void buildSubcastles(const CastleProto &proto, vector &subs, } } -aligned_unique_ptr +bytecode_ptr buildCastle(const CastleProto &proto, const map>> &triggers, const CompileContext &cc, const ReportManager &rm) { @@ -501,7 +503,7 @@ buildCastle(const CastleProto &proto, // possibly means that we've got a repeat that we can't trigger. We do // need to cope with it though. if (contains(triggers, top)) { - min_period = minPeriod(triggers.at(top), cr, &is_reset); + min_period = depth(minPeriod(triggers.at(top), cr, &is_reset)); } if (min_period > pr.bounds.max) { @@ -560,7 +562,7 @@ buildCastle(const CastleProto &proto, DEBUG_PRINTF("%zu subcastles may go stale\n", may_stale.size()); vector stale_iter; if (!may_stale.empty()) { - mmbBuildSparseIterator(stale_iter, may_stale, numRepeats); + stale_iter = mmbBuildSparseIterator(may_stale, numRepeats); } @@ -577,7 +579,7 @@ buildCastle(const CastleProto &proto, total_size = ROUNDUP_N(total_size, alignof(mmbit_sparse_iter)); total_size += byte_length(stale_iter); // stale sparse iter - aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + auto nfa = make_zeroed_bytecode_ptr(total_size); nfa->type = verify_u8(CASTLE_NFA); nfa->length = verify_u32(total_size); nfa->nPositions = verify_u32(subs.size()); diff --git a/src/nfa/castlecompile.h b/src/nfa/castlecompile.h index 938e57c4d..9f44692d4 100644 --- a/src/nfa/castlecompile.h +++ b/src/nfa/castlecompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Castle: multi-tenant repeat engine, compiler code. */ @@ -36,7 +37,7 @@ #include "nfa_kind.h" #include "ue2common.h" #include "nfagraph/ng_repeat.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/depth.h" #include "util/ue2_containers.h" @@ -120,7 +121,7 @@ void remapCastleTops(CastleProto &proto, std::map &top_map); * NOTE: Tops must be contiguous, i.e. 
\ref remapCastleTops must have been run * first. */ -ue2::aligned_unique_ptr +bytecode_ptr buildCastle(const CastleProto &proto, const std::map>> &triggers, const CompileContext &cc, const ReportManager &rm); diff --git a/src/nfa/dfa_min.cpp b/src/nfa/dfa_min.cpp index 0d3bca114..f309cc535 100644 --- a/src/nfa/dfa_min.cpp +++ b/src/nfa/dfa_min.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,12 +26,14 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file -* \brief Build code for DFA minimization -*/ +/** + * \file + * \brief Build code for DFA minimization. + */ /** - * /Summary of the Hopcrofts algorithm/ + * /Summary of the Hopcroft minimisation algorithm/ + * * partition := {F, Q \ F}; * work_queue := {F}; * while (work_queue is not empty) do @@ -57,22 +59,20 @@ #include "dfa_min.h" #include "grey.h" -#include "nfa/rdfa.h" -#include "nfagraph/ng_mcclellan.h" +#include "rdfa.h" #include "ue2common.h" -#include "util/partitioned_set.h" #include "util/container.h" +#include "util/noncopyable.h" +#include "util/partitioned_set.h" #include "util/ue2_containers.h" #include #include +#include #include +#include #include #include -#include - -#include -#include using namespace std; @@ -81,118 +81,81 @@ namespace ue2 { namespace { struct hopcroft_state_info { - vector > prev; + explicit hopcroft_state_info(size_t alpha_size) : prev(alpha_size) {} + + /** \brief Mapping from symbol to a list of predecessors that transition to + * this state on that symbol. */ + vector> prev; }; -struct DFA_components : boost::noncopyable { - dstate_id_t nstates; - size_t inp_size; - set work_queue; - /*Partition contains reduced states*/ - partitioned_set partition; - vector states; +struct HopcroftInfo : noncopyable { + size_t alpha_size; //!< Size of DFA alphabet. + queue work_queue; //!< Hopcroft work queue of partition indices. + partitioned_set partition; //!< Partition set of DFA states. + vector states; //!< Pre-calculated state info (preds) - explicit DFA_components(const raw_dfa &rdfa); + explicit HopcroftInfo(const raw_dfa &rdfa); }; -} //namespace +} // namespace /** - * create_map: - * Creates an initial partitioning and work_queue. - * Initial partition contains {accepting states..., Non-accepting states} - * Initial work_queue contains accepting state subsets + * \brief Create an initial partitioning and work_queue. * - * The initial partitioning needs to distinguish between the different - * reporting behaviours (unlike standard hopcroft) --> more than one subset - * possible for the accepting states. + * Initial partition contains {accepting states..., Non-accepting states} + * Initial work_queue contains accepting state subsets * - * Look for accepting states in both reports and reports_eod. - * Creates a map with a key(reports, reports_eod) and an id. - * Reports of each state are searched against the map and - * added to the corresponding id -> partition[id] and work_queue[id]. - * Non Accept states are added to partition[id+1]. + * The initial partitioning needs to distinguish between the different + * reporting behaviours (unlike standard Hopcroft) --> more than one subset + * possible for the accepting states. + * + * Look for accepting states in both reports and reports_eod. + * Creates a map with a key(reports, reports_eod) and an id. 
+ * Reports of each state are searched against the map and + * added to the corresponding id -> partition[id] and work_queue[id]. + * Non Accept states are added to partition[id+1]. */ static -vector create_map(const raw_dfa &rdfa, set &work_queue) { +vector create_map(const raw_dfa &rdfa, queue &work_queue) { using ReportKey = pair, flat_set>; map subset_map; vector state_to_subset(rdfa.states.size(), INVALID_SUBSET); for (size_t i = 0; i < rdfa.states.size(); i++) { - if (!rdfa.states[i].reports.empty() || - !rdfa.states[i].reports_eod.empty()) { - ReportKey key(rdfa.states[i].reports, rdfa.states[i].reports_eod); + const auto &ds = rdfa.states[i]; + if (!ds.reports.empty() || !ds.reports_eod.empty()) { + ReportKey key(ds.reports, ds.reports_eod); if (contains(subset_map, key)) { state_to_subset[i] = subset_map[key]; } else { size_t sub = subset_map.size(); - subset_map[key] = sub; + subset_map.emplace(std::move(key), sub); state_to_subset[i] = sub; - work_queue.insert(sub); + work_queue.push(sub); } } } - /* handle non accepts */ + /* Give non-accept states their own subset. */ size_t non_accept_sub = subset_map.size(); - for (size_t i = 0; i < state_to_subset.size(); i++) { - if (state_to_subset[i] == INVALID_SUBSET) { - state_to_subset[i] = non_accept_sub; - } - } + replace(state_to_subset.begin(), state_to_subset.end(), INVALID_SUBSET, + non_accept_sub); return state_to_subset; } -DFA_components::DFA_components(const raw_dfa &rdfa) - : nstates(rdfa.states.size()), - inp_size(rdfa.states[nstates - 1].next.size()), - partition(create_map(rdfa, work_queue)) { - /* initializing states */ - for (size_t i = 0; i < nstates; i++) { - states.push_back(hopcroft_state_info()); - states.back().prev.resize(inp_size); - } - - for (size_t i = 0; i < nstates; i++) { // i is the previous state - for (size_t j = 0; j < inp_size; j++) { - /* Creating X_table */ - dstate_id_t present_state = rdfa.states[i].next[j]; - states[present_state].prev[j].push_back(i); - - DEBUG_PRINTF("rdfa.states[%zu].next[%zu] %hu \n", i, j, - rdfa.states[i].next[j]); +HopcroftInfo::HopcroftInfo(const raw_dfa &rdfa) + : alpha_size(rdfa.alpha_size), partition(create_map(rdfa, work_queue)), + states(rdfa.states.size(), hopcroft_state_info(alpha_size)) { + /* Construct predecessor lists for each state, indexed by symbol. */ + for (size_t i = 0; i < states.size(); i++) { // i is the previous state + for (size_t sym = 0; sym < alpha_size; sym++) { + dstate_id_t present_state = rdfa.states[i].next[sym]; + states[present_state].prev[sym].push_back(i); } } } -/** - * choose and remove a set A from work_queue. - */ -static -void get_work_item(DFA_components &mdfa, ue2::flat_set &A) { - A.clear(); - assert(!mdfa.work_queue.empty()); - set::iterator pt = mdfa.work_queue.begin(); - insert(&A, mdfa.partition[*pt]); - mdfa.work_queue.erase(pt); -} - -/** - * X is the set of states for which a transition on the input leads to a state - * in A. - */ -static -void create_X(const DFA_components &mdfa, const ue2::flat_set &A, - size_t inp, ue2::flat_set &X) { - X.clear(); - - for (dstate_id_t id : A) { - insert(&X, mdfa.states[id].prev[inp]); - } -} - /** * For a split set X, each subset S (given by part_index) in the partition, two * sets are created: v_inter (X intersection S) and v_sub (S - X). @@ -206,14 +169,14 @@ void create_X(const DFA_components &mdfa, const ue2::flat_set &A, * - replace S in work_queue by the smaller of the two sets. 
*/ static -void split_and_replace_set(const size_t part_index, DFA_components &mdfa, - const ue2::flat_set &splitter) { +void split_and_replace_set(const size_t part_index, HopcroftInfo &info, + const flat_set &splitter) { /* singleton sets cannot be split */ - if (mdfa.partition[part_index].size() == 1) { + if (info.partition[part_index].size() == 1) { return; } - size_t small_index = mdfa.partition.split(part_index, splitter); + size_t small_index = info.partition.split(part_index, splitter); if (small_index == INVALID_SUBSET) { /* the set could not be split */ @@ -223,54 +186,56 @@ void split_and_replace_set(const size_t part_index, DFA_components &mdfa, /* larger subset remains at the input subset index, if the input subset was * already in the work queue then the larger subset will remain there. */ - mdfa.work_queue.insert(small_index); + info.work_queue.push(small_index); } /** - * The complete Hopcrofts algorithm is implemented in this function. - * Choose and remove a set tray from work_queue - * For each input- X is created. - * For each subset in the partition, split_and_replace_sets are called with the - * split set. + * \brief Core of the Hopcroft minimisation algorithm. */ static -void dfa_min(DFA_components &mdfa) { - ue2::flat_set A, X; +void dfa_min(HopcroftInfo &info) { + flat_set curr, sym_preds; vector cand_subsets; - while (!mdfa.work_queue.empty()) { - get_work_item(mdfa, A); + while (!info.work_queue.empty()) { + /* Choose and remove a set of states (curr, or A in the description + * above) from the work queue. Note that we copy the set because the + * partition may be split by the loop below. */ + curr.clear(); + insert(&curr, info.partition[info.work_queue.front()]); + info.work_queue.pop(); + + for (size_t sym = 0; sym < info.alpha_size; sym++) { + /* Find the set of states sym_preds for which a transition on the + * given symbol leads to a state in curr. */ + sym_preds.clear(); + for (dstate_id_t s : curr) { + insert(&sym_preds, info.states[s].prev[sym]); + } - for (size_t inp = 0; inp < mdfa.inp_size; inp++) { - create_X(mdfa, A, inp, X); - if (X.empty()) { + if (sym_preds.empty()) { continue; } - /* we only need to consider subsets with at least one member in X for - * splitting */ + /* we only need to consider subsets with at least one member in + * sym_preds for splitting */ cand_subsets.clear(); - mdfa.partition.find_overlapping(X, &cand_subsets); + info.partition.find_overlapping(sym_preds, &cand_subsets); for (size_t sub : cand_subsets) { - split_and_replace_set(sub, mdfa, X); + split_and_replace_set(sub, info, sym_preds); } } } } /** - * Creating new dfa table - * Map ordering contains key being an equivalence classes first state - * and the value being the equivalence class index. - * Eq_state[i] tells us new state id the equivalence class located at - * partition[i]. + * \brief Build the new DFA state table. */ static -void mapping_new_states(const DFA_components &mdfa, - vector &old_to_new, - raw_dfa &rdfa) { - const size_t num_partitions = mdfa.partition.size(); +void mapping_new_states(const HopcroftInfo &info, + vector &old_to_new, raw_dfa &rdfa) { + const size_t num_partitions = info.partition.size(); // Mapping from equiv class's first state to equiv class index. 
    map<dstate_id_t, size_t> ordering;
@@ -279,7 +244,7 @@ void mapping_new_states(const DFA_components &mdfa,
     vector<dstate_id_t> eq_state(num_partitions);
 
     for (size_t i = 0; i < num_partitions; i++) {
-        ordering[*mdfa.partition[i].begin()] = i;
+        ordering[*info.partition[i].begin()] = i;
     }
 
     dstate_id_t new_id = 0;
@@ -287,30 +252,28 @@ void mapping_new_states(const DFA_components &mdfa,
         eq_state[m.second] = new_id++;
     }
 
-    for (size_t t = 0; t < mdfa.partition.size(); t++) {
-        for (dstate_id_t id : mdfa.partition[t]) {
+    for (size_t t = 0; t < info.partition.size(); t++) {
+        for (dstate_id_t id : info.partition[t]) {
             old_to_new[id] = eq_state[t];
         }
     }
 
     vector<dstate> new_states;
     new_states.reserve(num_partitions);
-    for (size_t i = 0; i < mdfa.nstates; i++) {
-        if (contains(ordering, i)) {
-            new_states.push_back(rdfa.states[i]);
-        }
+
+    for (const auto &m : ordering) {
+        new_states.push_back(rdfa.states[m.first]);
     }
-    rdfa.states.swap(new_states);
+    rdfa.states = std::move(new_states);
 }
 
 static
-void renumber_new_states(const DFA_components &mdfa,
-                         const vector<dstate_id_t> &old_to_new,
-                         raw_dfa &rdfa) {
-    for (size_t i = 0; i < mdfa.partition.size(); i++) {
-        for (size_t j = 0; j < mdfa.inp_size; j++) {
-            dstate_id_t output = rdfa.states[i].next[j];
-            rdfa.states[i].next[j] = old_to_new[output];
+void renumber_new_states(const HopcroftInfo &info,
+                         const vector<dstate_id_t> &old_to_new, raw_dfa &rdfa) {
+    for (size_t i = 0; i < info.partition.size(); i++) {
+        for (size_t sym = 0; sym < info.alpha_size; sym++) {
+            dstate_id_t output = rdfa.states[i].next[sym];
+            rdfa.states[i].next[sym] = old_to_new[output];
         }
         dstate_id_t dad = rdfa.states[i].daddy;
         rdfa.states[i].daddy = old_to_new[dad];
@@ -321,17 +284,16 @@ void renumber_new_states(const DFA_components &mdfa,
 }
 
 static
-void new_dfa(raw_dfa &rdfa, const DFA_components &mdfa) {
-    if (mdfa.partition.size() != mdfa.nstates) {
-        vector<dstate_id_t> old_to_new(mdfa.nstates);
-        mapping_new_states(mdfa, old_to_new, rdfa);
-        renumber_new_states(mdfa, old_to_new, rdfa);
+void new_dfa(raw_dfa &rdfa, const HopcroftInfo &info) {
+    if (info.partition.size() == info.states.size()) {
+        return;
     }
+
+    vector<dstate_id_t> old_to_new(info.states.size());
+    mapping_new_states(info, old_to_new, rdfa);
+    renumber_new_states(info, old_to_new, rdfa);
 }
 
-/**
- * MAIN FUNCTION
- */
 void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
     if (!grey.minimizeDFA) {
         return;
@@ -339,10 +301,10 @@ void minimize_hopcroft(raw_dfa &rdfa, const Grey &grey) {
 
     UNUSED const size_t states_before = rdfa.states.size();
 
-    DFA_components mdfa(rdfa);
+    HopcroftInfo info(rdfa);
 
-    dfa_min(mdfa);
-    new_dfa(rdfa, mdfa);
+    dfa_min(info);
+    new_dfa(rdfa, info);
 
    DEBUG_PRINTF("reduced from %zu to %zu states\n", states_before,
                 rdfa.states.size());
diff --git a/src/nfa/dfa_min.h b/src/nfa/dfa_min.h
index 8277a4ba0..61ca6c21a 100644
--- a/src/nfa/dfa_min.h
+++ b/src/nfa/dfa_min.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -26,8 +26,9 @@
 * POSSIBILITY OF SUCH DAMAGE.
 */
 
-/** \file
- * \brief Build code for McClellan DFA.
+/**
+ * \file
+ * \brief Build code for DFA minimization.
*/ #ifndef DFA_MIN_H diff --git a/src/nfa/goughcompile.cpp b/src/nfa/goughcompile.cpp index 314b6fd02..58b05d3d1 100644 --- a/src/nfa/goughcompile.cpp +++ b/src/nfa/goughcompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,6 @@ #include "grey.h" #include "mcclellancompile.h" #include "nfa_internal.h" -#include "util/alloc.h" #include "util/compile_context.h" #include "util/container.h" #include "util/graph_range.h" @@ -81,7 +80,7 @@ class gough_build_strat : public mcclellan_build_strat { gough_build_strat( raw_som_dfa &r, const GoughGraph &g, const ReportManager &rm_in, const map &accel_info) - : mcclellan_build_strat(r, rm_in), rdfa(r), gg(g), + : mcclellan_build_strat(r, rm_in, false), rdfa(r), gg(g), accel_gough_info(accel_info) {} unique_ptr gatherReports(vector &reports /* out */, vector &reports_eod /* out */, @@ -1036,9 +1035,9 @@ void update_accel_prog_offset(const gough_build_strat &gbs, } } -aligned_unique_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, - const CompileContext &cc, - const ReportManager &rm) { +bytecode_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, + const CompileContext &cc, + const ReportManager &rm) { assert(somPrecision == 2 || somPrecision == 4 || somPrecision == 8 || !cc.streaming); @@ -1071,7 +1070,7 @@ aligned_unique_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, map accel_allowed; find_allowed_accel_states(*cfg, blocks, &accel_allowed); gough_build_strat gbs(raw, *cfg, rm, accel_allowed); - aligned_unique_ptr basic_dfa = mcclellanCompile_i(raw, gbs, cc); + auto basic_dfa = mcclellanCompile_i(raw, gbs, cc); assert(basic_dfa); if (!basic_dfa) { return nullptr; @@ -1117,7 +1116,7 @@ aligned_unique_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, gi.stream_som_loc_width = somPrecision; u32 gough_size = ROUNDUP_N(curr_offset, 16); - aligned_unique_ptr gough_dfa = aligned_zmalloc_unique(gough_size); + auto gough_dfa = make_zeroed_bytecode_ptr(gough_size); memcpy(gough_dfa.get(), basic_dfa.get(), basic_dfa->length); memcpy((char *)gough_dfa.get() + haig_offset, &gi, sizeof(gi)); diff --git a/src/nfa/goughcompile.h b/src/nfa/goughcompile.h index 54f98cef2..72469f3ca 100644 --- a/src/nfa/goughcompile.h +++ b/src/nfa/goughcompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,7 +32,7 @@ #include "mcclellancompile.h" #include "nfa_kind.h" #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/ue2_containers.h" #include "util/order_check.h" @@ -88,10 +88,10 @@ struct raw_som_dfa : public raw_dfa { * som */ }; -aligned_unique_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, - const CompileContext &cc, - const ReportManager &rm); +bytecode_ptr goughCompile(raw_som_dfa &raw, u8 somPrecision, + const CompileContext &cc, + const ReportManager &rm); } // namespace ue2 -#endif +#endif // GOUGHCOMPILE_H diff --git a/src/nfa/goughcompile_internal.h b/src/nfa/goughcompile_internal.h index 52e65f15f..a6ba0d1b8 100644 --- a/src/nfa/goughcompile_internal.h +++ b/src/nfa/goughcompile_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 
2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "mcclellancompile.h" #include "ue2common.h" #include "util/charreach.h" +#include "util/noncopyable.h" #include "util/order_check.h" #include "util/ue2_containers.h" @@ -41,7 +42,6 @@ #include #include -#include #include namespace ue2 { @@ -103,7 +103,7 @@ struct GoughSSAVarWithInputs; struct GoughSSAVarMin; struct GoughSSAVarJoin; -struct GoughSSAVar : boost::noncopyable { +struct GoughSSAVar : noncopyable { GoughSSAVar(void) : seen(false), slot(INVALID_SLOT) {} virtual ~GoughSSAVar(); const ue2::flat_set &get_inputs() const { diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c index c74c7079d..4834b6a54 100644 --- a/src/nfa/limex_accel.c +++ b/src/nfa/limex_accel.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,11 +39,9 @@ #include "nfa_internal.h" #include "shufti.h" #include "truffle.h" -#include "multishufti.h" -#include "multitruffle.h" -#include "multivermicelli.h" #include "ue2common.h" #include "vermicelli.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/simd_utils.h" @@ -118,7 +116,7 @@ size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex, DEBUG_PRINTF("using PSHUFB for 256-bit shuffle\n"); m256 accelPerm = limex->accelPermute; m256 accelComp = limex->accelCompare; -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); assert((idx1 & idx2) == 0); // should be no shared bits @@ -153,18 +151,20 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex, DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n"); m512 accelPerm = limex->accelPermute; m512 accelComp = limex->accelCompare; -#if !defined(__AVX2__) +#if defined(HAVE_AVX512) + idx = packedExtract512(s, accelPerm, accelComp); +#elif defined(HAVE_AVX2) + u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 & idx2) == 0); // should be no shared bits + idx = idx1 | idx2; +#else u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi); assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits idx = idx1 | idx2 | idx3 | idx4; -#else - u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo); - u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi); - assert((idx1 & idx2) == 0); // should be no shared bits - idx = idx1 | idx2; #endif return accelScanWrapper(accelTable, aux, input, idx, i, end); } diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp index c75eae597..7183d4b79 100644 --- a/src/nfa/limex_compile.cpp +++ b/src/nfa/limex_compile.cpp @@ -26,9 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Main NFA build code. 
*/ + #include "limex_compile.h" #include "accel.h" @@ -47,6 +49,7 @@ #include "repeatcompile.h" #include "util/alloc.h" #include "util/bitutils.h" +#include "util/bytecode_ptr.h" #include "util/charreach.h" #include "util/compile_context.h" #include "util/container.h" @@ -66,6 +69,7 @@ #include #include +#include #include using namespace std; @@ -89,8 +93,6 @@ struct precalcAccel { CharReach double_cr; flat_set> double_lits; /* double-byte accel stop literals */ u32 double_offset; - - MultibyteAccelInfo ma_info; }; struct limex_accel_info { @@ -354,16 +356,12 @@ void buildReachMapping(const build_info &args, vector &reach, } struct AccelBuild { - AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0), ma_len1(0), - ma_len2(0), ma_type(MultibyteAccelInfo::MAT_NONE) {} + AccelBuild() : v(NGHolder::null_vertex()), state(0), offset(0) {} NFAVertex v; u32 state; u32 offset; // offset correction to apply CharReach stop1; // single-byte accel stop literals flat_set> stop2; // double-byte accel stop literals - u32 ma_len1; // multiaccel len1 - u32 ma_len2; // multiaccel len2 - MultibyteAccelInfo::multiaccel_type ma_type; // multiaccel type }; static @@ -378,12 +376,7 @@ void findStopLiterals(const build_info &bi, NFAVertex v, AccelBuild &build) { build.stop1 = CharReach::dot(); } else { const precalcAccel &precalc = bi.accel.precalc.at(ss); - unsigned ma_len = precalc.ma_info.len1 + precalc.ma_info.len2; - if (ma_len >= MULTIACCEL_MIN_LEN) { - build.ma_len1 = precalc.ma_info.len1; - build.stop1 = precalc.ma_info.cr; - build.offset = precalc.ma_info.offset; - } else if (precalc.double_lits.empty()) { + if (precalc.double_lits.empty()) { build.stop1 = precalc.single_cr; build.offset = precalc.single_offset; } else { @@ -602,7 +595,6 @@ void fillAccelInfo(build_info &bi) { limex_accel_info &accel = bi.accel; unordered_map &accel_map = accel.accel_map; const map &br_cyclic = bi.br_cyclic; - const CompileContext &cc = bi.cc; const unordered_map &state_ids = bi.state_ids; const u32 num_states = bi.num_states; @@ -659,27 +651,17 @@ void fillAccelInfo(build_info &bi) { DEBUG_PRINTF("accel %u ok with offset s%u, d%u\n", i, as.offset, as.double_offset); - // try multibyte acceleration first - MultibyteAccelInfo mai = nfaCheckMultiAccel(g, states, cc); - precalcAccel &pa = accel.precalc[state_set]; - useful |= state_set; - - // if we successfully built a multibyte accel scheme, use that - if (mai.type != MultibyteAccelInfo::MAT_NONE) { - pa.ma_info = mai; - - DEBUG_PRINTF("multibyte acceleration!\n"); - continue; - } - pa.single_offset = as.offset; pa.single_cr = as.cr; + if (as.double_byte.size() != 0) { pa.double_offset = as.double_offset; pa.double_lits = as.double_byte; pa.double_cr = as.double_cr; - }; + } + + useful |= state_set; } for (const auto &m : accel_map) { @@ -696,19 +678,8 @@ void fillAccelInfo(build_info &bi) { state_set.reset(); state_set.set(state_id); - bool is_multi = false; - auto p_it = accel.precalc.find(state_set); - if (p_it != accel.precalc.end()) { - const precalcAccel &pa = p_it->second; - offset = max(pa.double_offset, pa.single_offset); - is_multi = pa.ma_info.type != MultibyteAccelInfo::MAT_NONE; - assert(offset <= MAX_ACCEL_DEPTH); - } - accel.accelerable.insert(v); - if (!is_multi) { - findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]); - } + findAccelFriends(g, v, br_cyclic, offset, &accel.friends[v]); } } @@ -721,6 +692,7 @@ typedef vector> static u32 getEffectiveAccelStates(const build_info &args, + const unordered_map &dom_map, u32 active_accel_mask, 
const vector &accelStates) { /* accelStates is indexed by the acceleration bit index and contains a @@ -756,7 +728,6 @@ u32 getEffectiveAccelStates(const build_info &args, * so we may still require on earlier states to be accurately modelled. */ const NGHolder &h = args.h; - auto dom_map = findDominators(h); /* map from accel_id to mask of accel_ids that it is dominated by */ vector dominated_by(accelStates.size()); @@ -773,8 +744,8 @@ u32 getEffectiveAccelStates(const build_info &args, u32 accel_id = findAndClearLSB_32(&local_accel_mask); assert(accel_id < accelStates.size()); NFAVertex v = accelStates[accel_id].v; - while (dom_map[v]) { - v = dom_map[v]; + while (contains(dom_map, v) && dom_map.at(v)) { + v = dom_map.at(v); if (contains(accel_id_map, v)) { dominated_by[accel_id] |= 1U << accel_id_map[v]; } @@ -887,6 +858,8 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, return; } + const auto dom_map = findDominators(args.h); + // We have 2^n different accel entries, one for each possible // combination of accelerable states. assert(accelStates.size() < 32); @@ -900,7 +873,8 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, effective_accel_set.push_back(0); /* empty is effectively empty */ for (u32 i = 1; i < accelCount; i++) { - u32 effective_i = getEffectiveAccelStates(args, i, accelStates); + u32 effective_i = getEffectiveAccelStates(args, dom_map, i, + accelStates); effective_accel_set.push_back(effective_i); if (effective_i == IMPOSSIBLE_ACCEL_MASK) { @@ -947,16 +921,8 @@ void buildAccel(const build_info &args, NFAStateSet &accelMask, if (contains(accel.precalc, effective_states)) { const auto &precalc = accel.precalc.at(effective_states); - if (precalc.ma_info.type != MultibyteAccelInfo::MAT_NONE) { - ainfo.ma_len1 = precalc.ma_info.len1; - ainfo.ma_len2 = precalc.ma_info.len2; - ainfo.multiaccel_offset = precalc.ma_info.offset; - ainfo.multiaccel_stops = precalc.ma_info.cr; - ainfo.ma_type = precalc.ma_info.type; - } else { - ainfo.single_offset = precalc.single_offset; - ainfo.single_stops = precalc.single_cr; - } + ainfo.single_offset = precalc.single_offset; + ainfo.single_stops = precalc.single_cr; } } @@ -1637,6 +1603,84 @@ u32 findBestNumOfVarShifts(const build_info &args, return bestNumOfVarShifts; } +static +bool cannotDie(const build_info &args, const set &tops) { + const auto &h = args.h; + + // When this top is activated, all of the vertices in 'tops' are switched + // on. If any of those lead to a graph that cannot die, then this top + // cannot die. + + // For each top, we use a depth-first search to traverse the graph from the + // top, looking for a cyclic path consisting of vertices of dot reach. If + // one exists, than the NFA cannot die after this top is triggered. + + vector colours(num_vertices(h)); + auto colour_map = boost::make_iterator_property_map(colours.begin(), + get(vertex_index, h)); + + struct CycleFound {}; + struct CannotDieVisitor : public boost::default_dfs_visitor { + void back_edge(const NFAEdge &e, const NGHolder &g) const { + DEBUG_PRINTF("back-edge %zu,%zu\n", g[source(e, g)].index, + g[target(e, g)].index); + if (g[target(e, g)].char_reach.all()) { + assert(g[source(e, g)].char_reach.all()); + throw CycleFound(); + } + } + }; + + try { + for (const auto &top : tops) { + DEBUG_PRINTF("checking top vertex %zu\n", h[top].index); + + // Constrain the search to the top vertices and any dot vertices it + // can reach. 
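+            // (Sketch of the mechanism: boost::depth_first_visit stops
+            // expanding any vertex for which term_func returns true, so the
+            // traversal below is confined to the top vertex plus dot-reach
+            // vertices that can never be switched off; any back edge found
+            // inside that subgraph is an undying cycle.)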
+ auto term_func = [&](NFAVertex v, const NGHolder &g) { + if (v == top) { + return false; + } + if (!g[v].char_reach.all()) { + return true; + } + if (contains(args.br_cyclic, v) && + args.br_cyclic.at(v).repeatMax != depth::infinity()) { + // Bounded repeat vertices without inf max can be turned + // off. + return true; + } + return false; + }; + + boost::depth_first_visit(h, top, CannotDieVisitor(), colour_map, + term_func); + } + } catch (const CycleFound &) { + DEBUG_PRINTF("cycle found\n"); + return true; + } + + return false; +} + +/** \brief True if this NFA cannot ever be in no states at all. */ +static +bool cannotDie(const build_info &args) { + const auto &h = args.h; + const auto &state_ids = args.state_ids; + + // If we have a startDs we're actually using, we can't die. + if (state_ids.at(h.startDs) != NO_STATE) { + DEBUG_PRINTF("is using startDs\n"); + return true; + } + + return all_of_in(args.tops | map_values, [&](const set &verts) { + return cannotDie(args, verts); + }); +} + template struct Factory { // typedefs for readability, for types derived from traits @@ -1700,8 +1744,8 @@ struct Factory { static void buildRepeats(const build_info &args, - vector, size_t>> &out, - u32 *scratchStateSize, u32 *streamState) { + vector> &out, + u32 *scratchStateSize, u32 *streamState) { out.reserve(args.repeats.size()); u32 repeat_idx = 0; @@ -1712,7 +1756,7 @@ struct Factory { u32 tableOffset, tugMaskOffset; size_t len = repeatAllocSize(br, &tableOffset, &tugMaskOffset); - auto info = aligned_zmalloc_unique(len); + auto info = make_zeroed_bytecode_ptr(len); char *info_ptr = (char *)info.get(); // Collect state space info. @@ -1766,7 +1810,7 @@ struct Factory { *streamState += streamStateLen; *scratchStateSize += sizeof(RepeatControl); - out.emplace_back(move(info), len); + out.emplace_back(move(info)); } } @@ -2074,8 +2118,7 @@ struct Factory { } static - void writeRepeats(const vector, - size_t>> &repeats, + void writeRepeats(const vector> &repeats, vector &repeatOffsets, implNFA_t *limex, const u32 repeatOffsetsOffset, const u32 repeatOffset) { const u32 num_repeats = verify_u32(repeats.size()); @@ -2088,10 +2131,9 @@ struct Factory { for (u32 i = 0; i < num_repeats; i++) { repeatOffsets[i] = offset; - assert(repeats[i].first); - memcpy((char *)limex + offset, repeats[i].first.get(), - repeats[i].second); - offset += repeats[i].second; + assert(repeats[i]); + memcpy((char *)limex + offset, repeats[i].get(), repeats[i].size()); + offset += repeats[i].size(); } // Write repeat offset lookup table. @@ -2112,19 +2154,19 @@ struct Factory { } static - aligned_unique_ptr generateNfa(const build_info &args) { + bytecode_ptr generateNfa(const build_info &args) { if (args.num_states > NFATraits::maxStates) { return nullptr; } // Build bounded repeat structures. 
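        // (Note: a bytecode_ptr carries the size of its own allocation, so
        // each repeat structure's length is recovered below via
        // repeats[i].size() instead of being tracked in a separate pair
        // element.)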
- vector, size_t>> repeats; + vector> repeats; u32 repeats_full_state = 0; u32 repeats_stream_state = 0; buildRepeats(args, repeats, &repeats_full_state, &repeats_stream_state); size_t repeatSize = 0; for (size_t i = 0; i < repeats.size(); i++) { - repeatSize += repeats[i].second; + repeatSize += repeats[i].size(); } // We track report lists that have already been written into the global @@ -2214,7 +2256,7 @@ struct Factory { size_t nfaSize = sizeof(NFA) + offset; DEBUG_PRINTF("nfa size %zu\n", nfaSize); - auto nfa = aligned_zmalloc_unique(nfaSize); + auto nfa = make_zeroed_bytecode_ptr(nfaSize); assert(nfa); // otherwise we would have thrown std::bad_alloc implNFA_t *limex = (implNFA_t *)getMutableImplNfa(nfa.get()); @@ -2234,6 +2276,11 @@ struct Factory { limex->shiftCount = shiftCount; writeShiftMasks(args, limex); + if (cannotDie(args)) { + DEBUG_PRINTF("nfa cannot die\n"); + setLimexFlag(limex, LIMEX_FLAG_CANNOT_DIE); + } + // Determine the state required for our state vector. findStateSize(args, limex); @@ -2295,7 +2342,7 @@ struct Factory { template struct generateNfa { - static aligned_unique_ptr call(const build_info &args) { + static bytecode_ptr call(const build_info &args) { return Factory::generateNfa(args); } }; @@ -2392,17 +2439,15 @@ u32 max_state(const ue2::unordered_map &state_ids) { return rv; } -aligned_unique_ptr generate(NGHolder &h, - const ue2::unordered_map &states, - const vector &repeats, - const map &reportSquashMap, - const map &squashMap, - const map> &tops, - const set &zombies, - bool do_accel, - bool stateCompression, - u32 hint, - const CompileContext &cc) { +bytecode_ptr generate(NGHolder &h, + const ue2::unordered_map &states, + const vector &repeats, + const map &reportSquashMap, + const map &squashMap, + const map> &tops, + const set &zombies, bool do_accel, + bool stateCompression, u32 hint, + const CompileContext &cc) { const u32 num_states = max_state(states) + 1; DEBUG_PRINTF("total states: %u\n", num_states); diff --git a/src/nfa/limex_compile.h b/src/nfa/limex_compile.h index 21cb76087..a12ae9f6e 100644 --- a/src/nfa/limex_compile.h +++ b/src/nfa/limex_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Main NFA build code. */ @@ -37,10 +38,10 @@ #include #include -#include "ue2common.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_squash.h" // for NFAStateSet -#include "util/alloc.h" +#include "ue2common.h" +#include "util/bytecode_ptr.h" #include "util/ue2_containers.h" struct NFA; @@ -50,7 +51,8 @@ namespace ue2 { struct BoundedRepeatData; struct CompileContext; -/** \brief Construct a LimEx NFA from an NGHolder. +/** + * \brief Construct a LimEx NFA from an NGHolder. * * \param g Input NFA graph. Must have state IDs assigned. * \param repeats Bounded repeat information, if any. @@ -66,7 +68,7 @@ struct CompileContext; * \return a built NFA, or nullptr if no NFA could be constructed for this * graph. 
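 *
 * (Usage note, an observation rather than a guarantee of this interface:
 * callers generally probe candidate LimEx model sizes in increasing order
 * and treat a nullptr return as "try the next size" rather than as a hard
 * error.)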
 */
-aligned_unique_ptr<NFA> generate(NGHolder &g,
+bytecode_ptr<NFA> generate(NGHolder &g,
                 const ue2::unordered_map<NFAVertex, u32> &states,
                 const std::vector<BoundedRepeatData> &repeats,
                 const std::map<NFAVertex, NFAStateSet> &reportSquashMap,
diff --git a/src/nfa/limex_dump.cpp b/src/nfa/limex_dump.cpp
index 852639ea3..797e87ba2 100644
--- a/src/nfa/limex_dump.cpp
+++ b/src/nfa/limex_dump.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -290,6 +290,20 @@ static
 void dumpLimexText(const limex_type *limex, FILE *f) {
     u32 size = limex_traits<limex_type>::size;
 
+    fprintf(f, "%u-bit LimEx NFA (%u shifts, %u exceptions)\n", size,
+            limex->shiftCount, limex->exceptionCount);
+    fprintf(f, "flags: ");
+    if (limex->flags & LIMEX_FLAG_COMPRESS_STATE) {
+        fprintf(f, "COMPRESS_STATE ");
+    }
+    if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) {
+        fprintf(f, "COMPRESS_MASKED ");
+    }
+    if (limex->flags & LIMEX_FLAG_CANNOT_DIE) {
+        fprintf(f, "CANNOT_DIE ");
+    }
+    fprintf(f, "\n\n");
+
     dumpMask(f, "init", (const u8 *)&limex->init, size);
     dumpMask(f, "init_dot_star", (const u8 *)&limex->initDS, size);
     dumpMask(f, "accept", (const u8 *)&limex->accept, size);
diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h
index ccbf34223..db703f039 100644
--- a/src/nfa/limex_internal.h
+++ b/src/nfa/limex_internal.h
@@ -85,6 +85,7 @@
 #define LIMEX_FLAG_COMPRESS_STATE   1 /**< pack state into stream state */
 #define LIMEX_FLAG_COMPRESS_MASKED  2 /**< use reach mask-based compression */
+#define LIMEX_FLAG_CANNOT_DIE       4 /**< limex cannot have no states on */
 
 enum LimExTrigger {
     LIMEX_TRIGGER_NONE = 0,
diff --git a/src/nfa/limex_runtime_impl.h b/src/nfa/limex_runtime_impl.h
index 016d1f924..7b89182be 100644
--- a/src/nfa/limex_runtime_impl.h
+++ b/src/nfa/limex_runtime_impl.h
@@ -60,6 +60,7 @@
 #define RUN_ACCEL_FN JOIN(LIMEX_API_ROOT, _Run_Accel)
 #define RUN_EXCEPTIONS_FN JOIN(LIMEX_API_ROOT, _Run_Exceptions)
 #define REV_STREAM_FN JOIN(LIMEX_API_ROOT, _Rev_Stream)
+#define LOOP_NOACCEL_FN JOIN(LIMEX_API_ROOT, _Loop_No_Accel)
 #define STREAM_FN JOIN(LIMEX_API_ROOT, _Stream)
 #define STREAMCB_FN JOIN(LIMEX_API_ROOT, _Stream_CB)
 #define STREAMFIRST_FN JOIN(LIMEX_API_ROOT, _Stream_First)
@@ -172,24 +173,75 @@ size_t RUN_ACCEL_FN(const STATE_T s, UNUSED const STATE_T accelMask,
     switch (limex_m->shiftCount) {                                          \
     case 8:                                                                 \
         succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 7));  \
+        /* fallthrough */                                                   \
     case 7:                                                                 \
         succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 6));  \
+        /* fallthrough */                                                   \
     case 6:                                                                 \
         succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 5));  \
+        /* fallthrough */                                                   \
     case 5:                                                                 \
         succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 4));  \
+        /* fallthrough */                                                   \
     case 4:                                                                 \
         succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 3));  \
+        /* fallthrough */                                                   \
     case 3:                                                                 \
         succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 2));  \
+        /* fallthrough */                                                   \
     case 2:                                                                 \
         succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 1));  \
+        /* fallthrough */                                                   \
     case 1:                                                                 \
+        /* fallthrough */                                                   \
     case 0:                                                                 \
         ;                                                                   \
     }                                                                       \
 } while (0)
 
+/**
+ * \brief LimEx NFA inner loop without acceleration.
+ *
+ * Note that the "all zeroes" early death check is only performed if can_die is
+ * true.
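+ *
+ * can_die is expected to be a compile-time constant at each call site (see
+ * STREAM_FN below, which calls this once with 0 and once with 1), so with
+ * the function forced inline the compiler can specialise the loop and fold
+ * the ISZERO_STATE check away for NFAs flagged LIMEX_FLAG_CANNOT_DIE.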
+ * + */ +static really_inline +char LOOP_NOACCEL_FN(const IMPL_NFA_T *limex, const u8 *input, size_t *loc, + size_t length, STATE_T *s_ptr, struct CONTEXT_T *ctx, + u64a offset, const char flags, u64a *final_loc, + const char first_match, const char can_die) { + const ENG_STATE_T *reach = get_reach_table(limex); +#if SIZE < 256 + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); +#endif + const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); + STATE_T s = *s_ptr; + + size_t i = *loc; + for (; i != length; i++) { + DUMP_INPUT(i); + if (can_die && ISZERO_STATE(s)) { + DEBUG_PRINTF("no states are switched on, early exit\n"); + break; + } + + STATE_T succ; + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); + + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 0, first_match)) { + return MO_HALT_MATCHING; + } + + u8 c = input[i]; + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); + } + + *loc = i; + *s_ptr = s; + return MO_CONTINUE_MATCHING; +} static really_inline char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, @@ -202,7 +254,8 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, = LOAD_FROM_ENG(&limex->accel_and_friends); const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); #endif - const u8 *accelTable = (const u8 *)((const char *)limex + limex->accelTableOffset); + const u8 *accelTable = + (const u8 *)((const char *)limex + limex->accelTableOffset); const union AccelAux *accelAux = (const union AccelAux *)((const char *)limex + limex->accelAuxOffset); const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); @@ -221,24 +274,20 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, } without_accel: - for (; i != min_accel_offset; i++) { - DUMP_INPUT(i); - if (ISZERO_STATE(s)) { - DEBUG_PRINTF("no states are switched on, early exit\n"); - ctx->s = s; - return MO_CONTINUE_MATCHING; + if (limex->flags & LIMEX_FLAG_CANNOT_DIE) { + const char can_die = 0; + if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset, + flags, final_loc, first_match, + can_die) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; } - - u8 c = input[i]; - STATE_T succ; - NFA_EXEC_GET_LIM_SUCC(limex, s, succ); - - if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, - &succ, final_loc, ctx, flags, 0, first_match)) { + } else { + const char can_die = 1; + if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset, + flags, final_loc, first_match, + can_die) == MO_HALT_MATCHING) { return MO_HALT_MATCHING; } - - s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); } with_accel: @@ -279,7 +328,6 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, goto without_accel; } - u8 c = input[i]; STATE_T succ; NFA_EXEC_GET_LIM_SUCC(limex, s, succ); @@ -288,6 +336,7 @@ char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, return MO_HALT_MATCHING; } + u8 c = input[i]; s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); } @@ -333,14 +382,13 @@ char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, u64a *final_loc = NULL; for (size_t i = length; i != 0; i--) { - DUMP_INPUT(i-1); + DUMP_INPUT(i - 1); if (ISZERO_STATE(s)) { DEBUG_PRINTF("no states are switched on, early exit\n"); ctx->s = s; return MO_CONTINUE_MATCHING; } - u8 c = input[i-1]; STATE_T succ; NFA_EXEC_GET_LIM_SUCC(limex, s, succ); @@ -349,6 +397,7 @@ char 
REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, return MO_HALT_MATCHING; } + u8 c = input[i - 1]; s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); } @@ -999,6 +1048,7 @@ enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( #undef RUN_ACCEL_FN #undef RUN_EXCEPTIONS_FN #undef REV_STREAM_FN +#undef LOOP_NOACCEL_FN #undef STREAM_FN #undef STREAMCB_FN #undef STREAMFIRST_FN diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h index 5ca8fce09..365d47296 100644 --- a/src/nfa/limex_shuffle.h +++ b/src/nfa/limex_shuffle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,22 +38,23 @@ #define LIMEX_SHUFFLE_H #include "ue2common.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/simd_utils.h" static really_inline u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { - m128 shuffled = pshufb(s, permute); + m128 shuffled = pshufb_m128(s, permute); m128 compared = and128(shuffled, compare); u16 rv = ~movemask128(eq128(compared, shuffled)); return (u32)rv; } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) static really_inline u32 packedExtract256(m256 s, const m256 permute, const m256 compare) { // vpshufb doesn't cross lanes, so this is a bit of a cheat - m256 shuffled = vpshufb(s, permute); + m256 shuffled = pshufb_m256(s, permute); m256 compared = and256(shuffled, compare); u32 rv = ~movemask256(eq256(compared, shuffled)); // stitch the lane-wise results back together @@ -61,4 +62,17 @@ u32 packedExtract256(m256 s, const m256 permute, const m256 compare) { } #endif // AVX2 +#if defined(HAVE_AVX512) +static really_inline +u32 packedExtract512(m512 s, const m512 permute, const m512 compare) { + // vpshufb doesn't cross lanes, so this is a bit of a cheat + m512 shuffled = pshufb_m512(s, permute); + m512 compared = and512(shuffled, compare); + u64a rv = ~eq512mask(compared, shuffled); + // stitch the lane-wise results back together + rv = rv >> 32 | rv; + return (u32)(((rv >> 16) | rv) & 0xffffU); +} +#endif // AVX512 + #endif // LIMEX_SHUFFLE_H diff --git a/src/nfa/mcclellancompile.cpp b/src/nfa/mcclellancompile.cpp index 7a73c9d42..e875477b1 100644 --- a/src/nfa/mcclellancompile.cpp +++ b/src/nfa/mcclellancompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -456,9 +456,8 @@ bool allocateFSN16(dfa_info &info, dstate_id_t *sherman_base) { } static -aligned_unique_ptr mcclellanCompile16(dfa_info &info, - const CompileContext &cc, - set *accel_states) { +bytecode_ptr mcclellanCompile16(dfa_info &info, const CompileContext &cc, + set *accel_states) { DEBUG_PRINTF("building mcclellan 16\n"); vector reports; /* index in ri for the appropriate report list */ @@ -497,7 +496,7 @@ aligned_unique_ptr mcclellanCompile16(dfa_info &info, accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); - aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + auto nfa = make_zeroed_bytecode_ptr(total_size); char *nfa_base = (char *)nfa.get(); populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset, @@ 
-685,9 +684,8 @@ void allocateFSN8(dfa_info &info, } static -aligned_unique_ptr mcclellanCompile8(dfa_info &info, - const CompileContext &cc, - set *accel_states) { +bytecode_ptr mcclellanCompile8(dfa_info &info, const CompileContext &cc, + set *accel_states) { DEBUG_PRINTF("building mcclellan 8\n"); vector reports; @@ -717,12 +715,13 @@ aligned_unique_ptr mcclellanCompile8(dfa_info &info, accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); - aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + auto nfa = make_zeroed_bytecode_ptr(total_size); char *nfa_base = (char *)nfa.get(); mcclellan *m = (mcclellan *)getMutableImplNfa(nfa.get()); - allocateFSN8(info, accel_escape_info, &m->accel_limit_8, &m->accept_limit_8); + allocateFSN8(info, accel_escape_info, &m->accel_limit_8, + &m->accept_limit_8); populateBasicInfo(sizeof(u8), info, total_size, aux_offset, accel_offset, accel_escape_info.size(), arb, single, nfa.get()); @@ -763,7 +762,7 @@ aligned_unique_ptr mcclellanCompile8(dfa_info &info, #define MAX_SHERMAN_LIST_LEN 8 static -void addIfEarlier(set &dest, dstate_id_t candidate, +void addIfEarlier(flat_set &dest, dstate_id_t candidate, dstate_id_t max) { if (candidate < max) { dest.insert(candidate); @@ -771,19 +770,41 @@ void addIfEarlier(set &dest, dstate_id_t candidate, } static -void addSuccessors(set &dest, const dstate &source, +void addSuccessors(flat_set &dest, const dstate &source, u16 alphasize, dstate_id_t curr_id) { for (symbol_t s = 0; s < alphasize; s++) { addIfEarlier(dest, source.next[s], curr_id); } } +/* \brief Returns a set of states to search for a better daddy. */ +static +flat_set find_daddy_candidates(const dfa_info &info, + dstate_id_t curr_id) { + flat_set hinted; + + addIfEarlier(hinted, 0, curr_id); + addIfEarlier(hinted, info.raw.start_anchored, curr_id); + addIfEarlier(hinted, info.raw.start_floating, curr_id); + + // Add existing daddy and his successors, then search back one generation. 
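+    // (Illustrative: for a state with daddy d and granddaddy g, the loop
+    // below collects {d, successors of d, g, successors of g}, always
+    // filtered to ids lower than curr_id; "successors" is shorthand in this
+    // sketch, not a helper in the code.)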
+ const u16 alphasize = info.impl_alpha_size; + dstate_id_t daddy = info.states[curr_id].daddy; + for (u32 level = 0; daddy && level < 2; level++) { + addIfEarlier(hinted, daddy, curr_id); + addSuccessors(hinted, info.states[daddy], alphasize, curr_id); + daddy = info.states[daddy].daddy; + } + + return hinted; +} + #define MAX_SHERMAN_SELF_LOOP 20 static -void find_better_daddy(dfa_info &info, dstate_id_t curr_id, - bool using8bit, bool any_cyclic_near_anchored_state, - const Grey &grey) { +void find_better_daddy(dfa_info &info, dstate_id_t curr_id, bool using8bit, + bool any_cyclic_near_anchored_state, + bool trust_daddy_states, const Grey &grey) { if (!grey.allowShermanStates) { return; } @@ -818,21 +839,21 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, dstate_id_t best_daddy = 0; dstate &currState = info.states[curr_id]; - set hinted; /* set of states to search for a better daddy */ - addIfEarlier(hinted, 0, curr_id); - addIfEarlier(hinted, info.raw.start_anchored, curr_id); - addIfEarlier(hinted, info.raw.start_floating, curr_id); - - dstate_id_t mydaddy = currState.daddy; - if (mydaddy) { - addIfEarlier(hinted, mydaddy, curr_id); - addSuccessors(hinted, info.states[mydaddy], alphasize, curr_id); - dstate_id_t mygranddaddy = info.states[mydaddy].daddy; - if (mygranddaddy) { - addIfEarlier(hinted, mygranddaddy, curr_id); - addSuccessors(hinted, info.states[mygranddaddy], alphasize, - curr_id); + flat_set hinted; + if (trust_daddy_states) { + // Use the daddy already set for this state so long as it isn't already + // a Sherman state. + if (!info.is_sherman(currState.daddy)) { + hinted.insert(currState.daddy); + } else { + // Fall back to granddaddy, which has already been processed (due + // to BFS ordering) and cannot be a Sherman state. 
+ dstate_id_t granddaddy = info.states[currState.daddy].daddy; + assert(!info.is_sherman(granddaddy)); + hinted.insert(granddaddy); } + } else { + hinted = find_daddy_candidates(info, curr_id); } for (const dstate_id_t &donor : hinted) { @@ -885,7 +906,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, if (self_loop_width > MAX_SHERMAN_SELF_LOOP) { DEBUG_PRINTF("%hu is banned wide self loop (%u)\n", curr_id, - self_loop_width); + self_loop_width); return; } @@ -939,9 +960,10 @@ bool is_cyclic_near(const raw_dfa &raw, dstate_id_t root) { return false; } -aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, - const CompileContext &cc, - set *accel_states) { +bytecode_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat, + const CompileContext &cc, + bool trust_daddy_states, + set *accel_states) { u16 total_daddy = 0; dfa_info info(strat); bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256; @@ -957,7 +979,7 @@ aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat & for (u32 i = 0; i < info.size(); i++) { find_better_daddy(info, i, using8bit, any_cyclic_near_anchored_state, - cc.grey); + trust_daddy_states, cc.grey); total_daddy += info.extra[i].daddytaken; } @@ -965,7 +987,7 @@ aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat & info.size() * info.impl_alpha_size, info.size(), info.impl_alpha_size); - aligned_unique_ptr nfa; + bytecode_ptr nfa; if (!using8bit) { nfa = mcclellanCompile16(info, cc, accel_states); } else { @@ -980,11 +1002,13 @@ aligned_unique_ptr mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat & return nfa; } -aligned_unique_ptr mcclellanCompile(raw_dfa &raw, const CompileContext &cc, - const ReportManager &rm, - set *accel_states) { - mcclellan_build_strat mbs(raw, rm); - return mcclellanCompile_i(raw, mbs, cc, accel_states); +bytecode_ptr mcclellanCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, + bool only_accel_init, + bool trust_daddy_states, + set *accel_states) { + mcclellan_build_strat mbs(raw, rm, only_accel_init); + return mcclellanCompile_i(raw, mbs, cc, trust_daddy_states, accel_states); } size_t mcclellan_build_strat::accelSize(void) const { diff --git a/src/nfa/mcclellancompile.h b/src/nfa/mcclellancompile.h index 8d8dfb196..baf72d9ce 100644 --- a/src/nfa/mcclellancompile.h +++ b/src/nfa/mcclellancompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -32,7 +32,7 @@ #include "accel_dfa_build_strat.h" #include "rdfa.h" #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/ue2_containers.h" #include @@ -48,14 +48,15 @@ struct CompileContext; class mcclellan_build_strat : public accel_dfa_build_strat { public: - mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in) - : accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {} + mcclellan_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in, + bool only_accel_init_in) + : accel_dfa_build_strat(rm_in, only_accel_init_in), rdfa(rdfa_in) {} raw_dfa &get_raw() const override { return rdfa; } std::unique_ptr gatherReports( std::vector &reports /* out */, std::vector &reports_eod /* out */, u8 *isSingleReport /* out */, - ReportID *arbReport /* out */) const override; + ReportID *arbReport /* out */) const override; 
     size_t accelSize(void) const override;
     u32 max_allowed_offset_accel() const override;
     u32 max_stop_char() const override;
@@ -65,17 +66,30 @@ class mcclellan_build_strat : public accel_dfa_build_strat {
     raw_dfa &rdfa;
 };
 
-/* accel_states: (optional) on success, is filled with the set of accelerable
- * states */
-ue2::aligned_unique_ptr<NFA>
+/**
+ * \brief Construct an implementation DFA.
+ *
+ * \param raw the raw dfa to construct from
+ * \param cc compile context
+ * \param rm report manager
+ * \param only_accel_init if true, only the init states will be examined for
+ *        acceleration opportunities
+ * \param trust_daddy_states if true, trust the daddy state set in the raw dfa
+ *        rather than conducting a search for a better daddy (for Sherman
+ *        states)
+ * \param accel_states (optional) on success, filled with the set of
+ *        accelerable states
+ */
+bytecode_ptr<NFA>
 mcclellanCompile(raw_dfa &raw, const CompileContext &cc,
-                 const ReportManager &rm,
+                 const ReportManager &rm, bool only_accel_init,
+                 bool trust_daddy_states = false,
                  std::set<dstate_id_t> *accel_states = nullptr);
 
 /* used internally by mcclellan/haig/gough compile process */
-ue2::aligned_unique_ptr<NFA>
+bytecode_ptr<NFA>
 mcclellanCompile_i(raw_dfa &raw, accel_dfa_build_strat &strat,
-                   const CompileContext &cc,
+                   const CompileContext &cc, bool trust_daddy_states = false,
                    std::set<dstate_id_t> *accel_states = nullptr);
 
 /**
@@ -89,4 +103,4 @@ bool has_accel_mcclellan(const NFA *nfa);
 
 } // namespace ue2
 
-#endif
+#endif // MCCLELLANCOMPILE_H
diff --git a/src/nfa/mcclellancompile_util.cpp b/src/nfa/mcclellancompile_util.cpp
index a61a19ab7..17e022fe6 100644
--- a/src/nfa/mcclellancompile_util.cpp
+++ b/src/nfa/mcclellancompile_util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -43,6 +43,12 @@ namespace ue2 {
 
 #define INIT_STATE 1
 
+static
+bool state_has_reports(const raw_dfa &raw, dstate_id_t s) {
+    const auto &ds = raw.states[s];
+    return !ds.reports.empty() || !ds.reports_eod.empty();
+}
+
 static
 u32 count_dots(const raw_dfa &raw) {
     assert(raw.start_anchored == INIT_STATE);
@@ -60,8 +66,7 @@ u32 count_dots(const raw_dfa &raw) {
             }
         }
 
-        if (!raw.states[raw.states[i].next[0]].reports.empty()
-            || !raw.states[raw.states[i].next[0]].reports_eod.empty()) {
+        if (state_has_reports(raw, raw.states[i].next[0])) {
             goto validate;
         }
 
@@ -162,74 +167,8 @@ u32 calc_min_dist_from_bob(raw_dfa &raw, vector<u32> *dist_in) {
     return last_d;
 }
 
-static
-void find_in_edges(const raw_dfa &raw, vector<vector<dstate_id_t>> *in_edges) {
-    in_edges->clear();
-    in_edges->resize(raw.states.size());
-    ue2::unordered_set<dstate_id_t> seen;
-
-    for (u32 s = 1; s < raw.states.size(); s++) {
-        seen.clear();
-        for (u32 j = 0; j < raw.alpha_size; j++) {
-            dstate_id_t t = raw.states[s].next[j];
-            if (contains(seen, t)) {
-                continue;
-            }
-            seen.insert(t);
-            (*in_edges)[t].push_back(s);
-        }
-    }
-}
-
-static
-void calc_min_dist_to_accept(const raw_dfa &raw,
-                             const vector<vector<dstate_id_t>> &in_edges,
-                             vector<u32> *accept_dist) {
-    vector<u32> &dist = *accept_dist;
-    dist.clear();
-    dist.resize(raw.states.size(), ~0U);
-
-    /* for reporting states to start from */
-    deque<dstate_id_t> to_visit;
-    for (u32 s = 0; s < raw.states.size(); s++) {
-        if (!raw.states[s].reports.empty()
-            || !raw.states[s].reports_eod.empty()) {
-            to_visit.push_back(s);
-            dist[s] = 0;
-        }
-    }
-
-    /* bfs */
-    UNUSED u32 last_d = 0;
-    while (!to_visit.empty()) {
-        dstate_id_t s =
to_visit.front(); - to_visit.pop_front(); - assert(s != DEAD_STATE); - - u32 d = dist[s]; - assert(d >= last_d); - assert(d != ~0U); - - for (vector::const_iterator it = in_edges[s].begin(); - it != in_edges[s].end(); ++it) { - dstate_id_t t = *it; - if (t == DEAD_STATE) { - continue; - } - if (dist[t] == ~0U) { - to_visit.push_back(t); - dist[t] = d + 1; - } else { - assert(dist[t] <= d + 1); - } - } - - last_d = d; - } -} - -bool prune_overlong(raw_dfa &raw, u32 max_offset) { - DEBUG_PRINTF("pruning to at most %u\n", max_offset); +bool clear_deeper_reports(raw_dfa &raw, u32 max_offset) { + DEBUG_PRINTF("clearing reports on states deeper than %u\n", max_offset); vector bob_dist; u32 max_min_dist_bob = calc_min_dist_from_bob(raw, &bob_dist); @@ -237,53 +176,18 @@ bool prune_overlong(raw_dfa &raw, u32 max_offset) { return false; } - vector > in_edges; - find_in_edges(raw, &in_edges); - - vector accept_dist; - calc_min_dist_to_accept(raw, in_edges, &accept_dist); - - in_edges.clear(); - - /* look over the states and filter out any which cannot reach a report - * states before max_offset */ - vector new_ids(raw.states.size()); - vector new_states; - u32 count = 1; - new_states.push_back(raw.states[DEAD_STATE]); - - for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) { - if (bob_dist[s] + accept_dist[s] > max_offset) { - DEBUG_PRINTF("pruned %u: bob %u, report %u\n", s, bob_dist[s], - accept_dist[s]); - new_ids[s] = DEAD_STATE; - } else { - new_ids[s] = count++; - new_states.push_back(raw.states[s]); - assert(new_states.size() == count); - assert(new_ids[s] <= s); - } - } - - /* swap states */ - DEBUG_PRINTF("pruned %zu -> %u\n", raw.states.size(), count); - raw.states.swap(new_states); - new_states.clear(); - - /* update edges and daddys to refer to the new ids */ + bool changed = false; for (u32 s = DEAD_STATE + 1; s < raw.states.size(); s++) { - for (u32 j = 0; j < raw.alpha_size; j++) { - dstate_id_t old_t = raw.states[s].next[j]; - raw.states[s].next[j] = new_ids[old_t]; + if (bob_dist[s] > max_offset && state_has_reports(raw, s)) { + DEBUG_PRINTF("clearing reports on %u (depth %u)\n", s, bob_dist[s]); + auto &ds = raw.states[s]; + ds.reports.clear(); + ds.reports_eod.clear(); + changed = true; } - raw.states[s].daddy = new_ids[raw.states[s].daddy]; } - /* update specials */ - raw.start_floating = new_ids[raw.start_floating]; - raw.start_anchored = new_ids[raw.start_anchored]; - - return true; + return changed; } set all_reports(const raw_dfa &rdfa) { diff --git a/src/nfa/mcclellancompile_util.h b/src/nfa/mcclellancompile_util.h index 554c1efdd..d681e06b1 100644 --- a/src/nfa/mcclellancompile_util.h +++ b/src/nfa/mcclellancompile_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,10 +39,12 @@ namespace ue2 { u32 remove_leading_dots(raw_dfa &raw); /** - * Prunes any states which cannot be reached within max_offset from start of - * stream. Returns false if no changes are made to the rdfa + * \brief Clear reports on any states that are deeper than \a max_offset from + * start of stream. + * + * Returns false if no changes are made to the DFA. 
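+ *
+ * (Worked sketch: with max_offset = 4, a state whose minimum distance from
+ * start of stream is 6 keeps all of its transitions, but its reports and
+ * reports_eod sets are emptied; clearing reports rather than pruning whole
+ * states leaves the state numbering intact and lets later passes discard
+ * any states that become useless.)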
*/ -bool prune_overlong(raw_dfa &raw, u32 max_offset); +bool clear_deeper_reports(raw_dfa &raw, u32 max_offset); std::set all_reports(const raw_dfa &rdfa); bool has_eod_accepts(const raw_dfa &rdfa); diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 98db3f0a1..9722fd676 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "nfa_api.h" #include "nfa_api_queue.h" #include "nfa_internal.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/compare.h" #include "util/simd_utils.h" @@ -168,7 +169,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, * extract a single copy of the state from the u32 for checking. */ u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; -#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) u32 sheng_limit_x4 = sheng_limit * 0x01010101; m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); @@ -176,20 +177,20 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, m->sheng_accel_limit, sheng_stop_limit); #endif -#define SHENG_SINGLE_ITER do { \ - m128 shuffle_mask = masks[*(c++)]; \ - s = pshufb(shuffle_mask, s); \ - u32 s_gpr_x4 = movd(s); /* convert to u8 */ \ - DEBUG_PRINTF("c %hhu (%c) --> s %hhu\n", c[-1], c[-1], s_gpr); \ - if (s_gpr_x4 >= sheng_stop_limit_x4) { \ - s_gpr = s_gpr_x4; \ - goto exit; \ - } \ +#define SHENG_SINGLE_ITER do { \ + m128 shuffle_mask = masks[*(c++)]; \ + s = pshufb_m128(shuffle_mask, s); \ + u32 s_gpr_x4 = movd(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %hhu\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ } while (0) u8 s_gpr; while (c < c_end) { -#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) /* This version uses pext for efficently bitbashing out scaled * versions of the bytes to process from a u64a */ @@ -197,7 +198,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, u64a cc0 = pdep64(data_bytes, 0xff0); /* extract scaled low byte */ data_bytes &= ~0xffULL; /* clear low bits for scale space */ m128 shuffle_mask0 = load128((const char *)masks + cc0); - s = pshufb(shuffle_mask0, s); + s = pshufb_m128(shuffle_mask0, s); m128 s_max = s; m128 s_max0 = s_max; DEBUG_PRINTF("c %02llx --> s %hhu\n", cc0 >> 4, movd(s)); @@ -207,7 +208,7 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, u64a cc##iter = pext64(data_bytes, mcsheng_pext_mask[iter]); \ assert(cc##iter == (u64a)c[iter] << 4); \ m128 shuffle_mask##iter = load128((const char *)masks + cc##iter); \ - s = pshufb(shuffle_mask##iter, s); \ + s = pshufb_m128(shuffle_mask##iter, s); \ if (do_accel && iter == 7) { \ /* in the final iteration we also have to check against accel */ \ m128 s_temp = sadd_u8_m128(s, accel_delta); \ @@ -287,19 +288,19 @@ u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, assert(soft_c_end - c < SHENG_CHUNK); switch (soft_c_end - c) { case 7: - SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; // fallthrough case 6: - SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; // fallthrough case 5: - 
SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; // fallthrough case 4: - SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; // fallthrough case 3: - SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; // fallthrough case 2: - SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; // fallthrough case 1: - SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; // fallthrough } } diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp index 7b4e58ab1..2049fee03 100644 --- a/src/nfa/mcsheng_compile.cpp +++ b/src/nfa/mcsheng_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -617,7 +617,7 @@ void fill_in_succ_table_16(NFA *nfa, const dfa_info &info, #define MAX_SHERMAN_LIST_LEN 8 static -void addIfEarlier(set &dest, dstate_id_t candidate, +void addIfEarlier(flat_set &dest, dstate_id_t candidate, dstate_id_t max) { if (candidate < max) { dest.insert(candidate); @@ -625,13 +625,35 @@ void addIfEarlier(set &dest, dstate_id_t candidate, } static -void addSuccessors(set &dest, const dstate &source, +void addSuccessors(flat_set &dest, const dstate &source, u16 alphasize, dstate_id_t curr_id) { for (symbol_t s = 0; s < alphasize; s++) { addIfEarlier(dest, source.next[s], curr_id); } } +/* \brief Returns a set of states to search for a better daddy. */ +static +flat_set find_daddy_candidates(const dfa_info &info, + dstate_id_t curr_id) { + flat_set hinted; + + addIfEarlier(hinted, 0, curr_id); + addIfEarlier(hinted, info.raw.start_anchored, curr_id); + addIfEarlier(hinted, info.raw.start_floating, curr_id); + + // Add existing daddy and his successors, then search back one generation. + const u16 alphasize = info.impl_alpha_size; + dstate_id_t daddy = info.states[curr_id].daddy; + for (u32 level = 0; daddy && level < 2; level++) { + addIfEarlier(hinted, daddy, curr_id); + addSuccessors(hinted, info.states[daddy], alphasize, curr_id); + daddy = info.states[daddy].daddy; + } + + return hinted; +} + #define MAX_SHERMAN_SELF_LOOP 20 static @@ -671,22 +693,7 @@ void find_better_daddy(dfa_info &info, dstate_id_t curr_id, dstate_id_t best_daddy = 0; dstate &currState = info.states[curr_id]; - set hinted; /* set of states to search for a better daddy */ - addIfEarlier(hinted, 0, curr_id); - addIfEarlier(hinted, info.raw.start_anchored, curr_id); - addIfEarlier(hinted, info.raw.start_floating, curr_id); - - dstate_id_t mydaddy = currState.daddy; - if (mydaddy) { - addIfEarlier(hinted, mydaddy, curr_id); - addSuccessors(hinted, info.states[mydaddy], alphasize, curr_id); - dstate_id_t mygranddaddy = info.states[mydaddy].daddy; - if (mygranddaddy) { - addIfEarlier(hinted, mygranddaddy, curr_id); - addSuccessors(hinted, info.states[mygranddaddy], alphasize, - curr_id); - } - } + flat_set hinted = find_daddy_candidates(info, curr_id); for (const dstate_id_t &donor : hinted) { assert(donor < curr_id); @@ -821,7 +828,7 @@ void fill_in_sherman(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) { } static -aligned_unique_ptr mcshengCompile16(dfa_info &info, dstate_id_t sheng_end, +bytecode_ptr mcshengCompile16(dfa_info &info, dstate_id_t sheng_end, const map &accel_escape_info, const Grey &grey) { DEBUG_PRINTF("building mcsheng 16\n"); @@ -872,7 +879,7 @@ aligned_unique_ptr mcshengCompile16(dfa_info &info, dstate_id_t sheng_end, accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ assert(ISALIGNED_N(accel_offset, alignof(union 
AccelAux))); - aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + auto nfa = make_zeroed_bytecode_ptr(total_size); mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get()); populateBasicInfo(sizeof(u16), info, total_size, aux_offset, accel_offset, @@ -967,7 +974,7 @@ void allocateImplId8(dfa_info &info, dstate_id_t sheng_end, } static -aligned_unique_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, +bytecode_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, const map &accel_escape_info) { DEBUG_PRINTF("building mcsheng 8\n"); @@ -998,7 +1005,7 @@ aligned_unique_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); - aligned_unique_ptr nfa = aligned_zmalloc_unique(total_size); + auto nfa = make_zeroed_bytecode_ptr(total_size); mcsheng *m = (mcsheng *)getMutableImplNfa(nfa.get()); allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8, @@ -1019,13 +1026,13 @@ aligned_unique_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, return nfa; } -aligned_unique_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, - const ReportManager &rm) { +bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm) { if (!cc.grey.allowMcSheng) { return nullptr; } - mcclellan_build_strat mbs(raw, rm); + mcclellan_build_strat mbs(raw, rm, false); dfa_info info(mbs); bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256; @@ -1044,7 +1051,7 @@ aligned_unique_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, return nullptr; } - aligned_unique_ptr nfa; + bytecode_ptr nfa; if (!using8bit) { nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey); } else { diff --git a/src/nfa/mcsheng_compile.h b/src/nfa/mcsheng_compile.h index d1ae1e323..487ab45f4 100644 --- a/src/nfa/mcsheng_compile.h +++ b/src/nfa/mcsheng_compile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,13 +29,8 @@ #ifndef MCSHENGCOMPILE_H #define MCSHENGCOMPILE_H -#include "accel_dfa_build_strat.h" -#include "rdfa.h" #include "ue2common.h" -#include "util/alloc.h" -#include "util/ue2_containers.h" - -#include +#include "util/bytecode_ptr.h" struct NFA; @@ -43,10 +38,10 @@ namespace ue2 { class ReportManager; struct CompileContext; +struct raw_dfa; -ue2::aligned_unique_ptr -mcshengCompile(raw_dfa &raw, const CompileContext &cc, - const ReportManager &rm); +bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm); bool has_accel_mcsheng(const NFA *nfa); diff --git a/src/nfa/mpvcompile.cpp b/src/nfa/mpvcompile.cpp index 87fb462e5..8497c6487 100644 --- a/src/nfa/mpvcompile.cpp +++ b/src/nfa/mpvcompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -309,9 +309,9 @@ const mpv_counter_info &findCounter(const vector &counters, return counters.front(); } -aligned_unique_ptr mpvCompile(const vector &puffs_in, - const vector &triggered_puffs, - const ReportManager &rm) { +bytecode_ptr mpvCompile(const vector &puffs_in, + const vector 
&triggered_puffs, + const ReportManager &rm) { assert(!puffs_in.empty() || !triggered_puffs.empty()); u32 puffette_count = puffs_in.size() + triggered_puffs.size(); @@ -343,7 +343,7 @@ aligned_unique_ptr<NFA> mpvCompile(const vector<raw_puff> &puffs_in, DEBUG_PRINTF("%u puffs, len = %u\n", puffette_count, len); - aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(len); + auto nfa = make_zeroed_bytecode_ptr<NFA>(len); mpv_puffette *pa_base = (mpv_puffette *) ((char *)nfa.get() + sizeof(NFA) + sizeof(mpv) diff --git a/src/nfa/mpvcompile.h b/src/nfa/mpvcompile.h index fb91ac64e..4f820e436 100644 --- a/src/nfa/mpvcompile.h +++ b/src/nfa/mpvcompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,7 +30,7 @@ #define MPV_COMPILE_H #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/charreach.h" #include @@ -61,9 +61,9 @@ struct raw_puff { * puffs in the triggered_puffs vector are enabled when a TOP_N event is * delivered corresponding to their index in the vector */ -aligned_unique_ptr<NFA> mpvCompile(const std::vector<raw_puff> &puffs, - const std::vector<raw_puff> &triggered_puffs, - const ReportManager &rm); +bytecode_ptr<NFA> mpvCompile(const std::vector<raw_puff> &puffs, + const std::vector<raw_puff> &triggered_puffs, + const ReportManager &rm); } // namespace ue2 diff --git a/src/nfa/multiaccel_common.h b/src/nfa/multiaccel_common.h deleted file mode 100644 index 1a13c3b6d..000000000 --- a/src/nfa/multiaccel_common.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef MULTIACCEL_COMMON_H_ -#define MULTIACCEL_COMMON_H_ - -#include "config.h" -#include "ue2common.h" -#include "util/join.h" -#include "util/bitutils.h" - -/* - * When doing shifting, remember that the total number of shifts should be n-1 - */ -#define VARISHIFT(src, dst, len) \ - do { \ - (dst) &= (src) >> (len); \ - } while (0) -#define STATIC_SHIFT1(x) \ - do { \ - (x) &= (x) >> 1; \ - } while (0) -#define STATIC_SHIFT2(x) \ - do { \ - (x) &= (x) >> 2;\ - } while (0) -#define STATIC_SHIFT4(x) \ - do { \ - (x) &= (x) >> 4; \ - } while (0) -#define STATIC_SHIFT8(x) \ - do { \ - (x) &= (x) >> 8; \ - } while (0) -#define SHIFT1(x) \ - do {} while (0) -#define SHIFT2(x) \ - do { \ - STATIC_SHIFT1(x); \ - } while (0) -#define SHIFT3(x) \ - do { \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT1(x); \ - } while (0) -#define SHIFT4(x) \ - do { \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT5(x) \ - do { \ - SHIFT4(x); \ - STATIC_SHIFT1(x); \ - } while (0) -#define SHIFT6(x) \ - do { \ - SHIFT4(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT7(x) \ - do { \ - SHIFT4(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT8(x) \ - do { \ - SHIFT4(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT9(x) \ - do { \ - SHIFT8(x); \ - STATIC_SHIFT1(x); \ - } while (0) -#define SHIFT10(x) \ - do { \ - SHIFT8(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT11(x) \ - do { \ - SHIFT8(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT12(x); \ - do { \ - SHIFT8(x);\ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT13(x); \ - do { \ - SHIFT8(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT14(x) \ - do { \ - SHIFT8(x); \ - STATIC_SHIFT2(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT15(x) \ - do { \ - SHIFT8(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT16(x) \ - do { \ - SHIFT8(x); \ - STATIC_SHIFT8(x); \ - } while (0) -#define SHIFT17(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT1(x); \ - } while (0) -#define SHIFT18(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT19(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT20(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT21(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT22(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT2(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT23(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT24(x) \ - do { \ - SHIFT16(x); \ - STATIC_SHIFT8(x); \ - } while (0) -#define SHIFT25(x) \ - do { \ - SHIFT24(x); \ - STATIC_SHIFT1(x); \ - } while (0) -#define SHIFT26(x) \ - do { \ - SHIFT24(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT27(x) \ - do { \ - SHIFT24(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - } while (0) -#define SHIFT28(x) \ - do { \ - SHIFT24(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT29(x) \ - do { \ - SHIFT24(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT30(x) \ - do { \ - SHIFT24(x); \ - STATIC_SHIFT2(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT31(x) \ - do { \ - SHIFT24(x); \ - STATIC_SHIFT1(x); \ - STATIC_SHIFT2(x); \ - STATIC_SHIFT4(x); \ - } while (0) -#define SHIFT32(x) \ - do { \ - SHIFT24(x); \ - 
STATIC_SHIFT8(x); \ - } while (0) - -/* - * this function is used by 32-bit multiaccel matchers. 32-bit matchers accept - * a 32-bit integer as a buffer, where low 16 bits is movemask result and - * high 16 bits are "don't care" values. this function is not expected to return - * a result higher than 16. - */ -static really_inline -const u8 *match32(const u8 *buf, const u32 z) { - if (unlikely(z != 0)) { - u32 pos = ctz32(z); - assert(pos < 16); - return buf + pos; - } - return NULL; -} - -/* - * this function is used by 64-bit multiaccel matchers. 64-bit matchers accept - * a 64-bit integer as a buffer, where low 32 bits is movemask result and - * high 32 bits are "don't care" values. this function is not expected to return - * a result higher than 32. - */ -static really_inline -const u8 *match64(const u8 *buf, const u64a z) { - if (unlikely(z != 0)) { - u32 pos = ctz64(z); - assert(pos < 32); - return buf + pos; - } - return NULL; -} - -#endif /* MULTIACCEL_COMMON_H_ */ diff --git a/src/nfa/multiaccel_compilehelper.cpp b/src/nfa/multiaccel_compilehelper.cpp deleted file mode 100644 index 4c1f81018..000000000 --- a/src/nfa/multiaccel_compilehelper.cpp +++ /dev/null @@ -1,439 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "multiaccel_compilehelper.h" - -using namespace std; -using namespace ue2; - -#ifdef DEBUG -static const char* state_to_str[] = { - "FIRST_RUN", - "SECOND_RUN", - "WAITING_FOR_GRAB", - "FIRST_TAIL", - "SECOND_TAIL", - "STOPPED", - "INVALID" -}; -static const char* type_to_str[] = { - "SHIFT", - "SHIFTGRAB", - "DOUBLESHIFT", - "DOUBLESHIFTGRAB", - "LONG", - "LONGGRAB", - "NONE" -}; - -static -void dumpMultiaccelState(const accel_data &d) { - DEBUG_PRINTF("type: %s state: %s len1: %u tlen1: %u len2: %u tlen2: %u\n", - type_to_str[(unsigned) d.type], - state_to_str[(unsigned) d.state], - d.len1, d.tlen1, d.len2, d.tlen2); -} -#endif - -/* stop all the matching. this may render most schemes invalid. 
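 * called from getBestScheme() on each candidate scheme once scanning of the pattern's vertices ends, just before validate() trims the result.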
*/ -static -void stop(accel_data &d) { - switch (d.state) { - case STATE_STOPPED: - case STATE_INVALID: - break; - case STATE_FIRST_TAIL: - case STATE_SECOND_RUN: - /* - * Shift matchers are special case, because they have "tails". - * When shift matcher reaches a mid/endpoint, tail mode is - * activated, which looks for more matches to extend the match. - * - * For example, consider pattern /a{5}ba{3}/. Under normal circumstances, - * long-grab matcher will be picked for this pattern (matching a run of a's, - * followed by a not-a), because doubleshift matcher would be confused by - * consecutive a's and would parse the pattern as a.{0}a.{0}a (two shifts - * by 1) and throw out the rest of the pattern. - * - * With tails, we defer ending the run until we actually run out of - * matching characters, so the above pattern will now be parsed by - * doubleshift matcher as /a.{3}a.{3}a/ (two shifts by 4). - * - * So if we are stopping shift matchers, we should check if we aren't in - * the process of matching first tail or second run. If we are, we can't - * finish the second run as we are stopping, but we can try and split - * the first tail instead to obtain a valid second run. - */ - if ((d.type == MultibyteAccelInfo::MAT_DSHIFT || - d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.tlen1 == 0) { - // can't split an empty void... - d.state = STATE_INVALID; - break; - } - d.len2 = 0; - d.state = STATE_STOPPED; - break; - case STATE_SECOND_TAIL: - d.state = STATE_STOPPED; - break; - case STATE_WAITING_FOR_GRAB: - case STATE_FIRST_RUN: - if (d.type == MultibyteAccelInfo::MAT_LONG) { - d.state = STATE_STOPPED; - } else { - d.state = STATE_INVALID; - } - break; - } -} - -static -void validate(accel_data &d, unsigned max_len) { - // try and fit in all our tails - if (d.len1 + d.tlen1 + d.len2 + d.tlen2 < max_len && d.len2 > 0) { - // case 1: everything fits in - d.len1 += d.tlen1; - d.len2 += d.tlen2; - d.tlen1 = 0; - d.tlen2 = 0; - } else if (d.len1 + d.tlen1 + d.len2 < max_len && d.len2 > 0) { - // case 2: everything but the second tail fits in - d.len1 += d.tlen1; - d.tlen1 = 0; - // try going for a partial tail - if (d.tlen2 != 0) { - int new_tlen2 = max_len - 1 - d.len1 - d.len2; - if (new_tlen2 > 0) { - d.len2 += new_tlen2; - } - d.tlen2 = 0; - } - } else if (d.len1 + d.tlen1 < max_len) { - // case 3: first run and its tail fits in - if (d.type == MultibyteAccelInfo::MAT_DSHIFT || - d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) { - // split the tail into a second run - d.len2 = d.tlen1; - } else { - d.len1 += d.tlen1; - d.len2 = 0; - } - d.tlen1 = 0; - d.tlen2 = 0; - } else if (d.len1 < max_len) { - // case 4: nothing but the first run fits in - // try going for a partial tail - if (d.tlen1 != 0) { - int new_tlen1 = max_len - 1 - d.len1; - if (new_tlen1 > 0) { - d.len1 += new_tlen1; - } - d.tlen1 = 0; - } - d.len2 = 0; - d.tlen2 = 0; - } - // if we removed our second run, doubleshift matchers are no longer valid - if ((d.type == MultibyteAccelInfo::MAT_DSHIFT || - d.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) && d.len2 == 0) { - d.state = STATE_INVALID; - } else if ((d.type == MultibyteAccelInfo::MAT_LONG) && d.len1 >= max_len) { - // long matchers can just stop whenever they want to - d.len1 = max_len - 1; - } - - // now, general sanity checks - if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) >= max_len) { - d.state = STATE_INVALID; - } - if ((d.len1 + d.tlen1 + d.len2 + d.tlen2) < MULTIACCEL_MIN_LEN) { - d.state = STATE_INVALID; - } -} - -static -void match(accel_data &d, const CharReach 
&ref_cr, const CharReach &cur_cr) { - switch (d.type) { - case MultibyteAccelInfo::MAT_LONG: - { - /* - * For long matcher, we want lots of consecutive same-or-subset - * char-reaches - */ - if ((ref_cr & cur_cr) == cur_cr) { - d.len1++; - } else { - d.state = STATE_STOPPED; - } - } - break; - - case MultibyteAccelInfo::MAT_LONGGRAB: - { - /* - * For long-grab matcher, we want lots of consecutive same-or-subset - * char-reaches with a negative match in the end. - */ - if ((ref_cr & cur_cr) == cur_cr) { - d.len1++; - } else if (!(ref_cr & cur_cr).any()) { - /* we grabbed, stop immediately */ - d.state = STATE_STOPPED; - } else { - /* our run-n-grab was interrupted; mark as invalid */ - d.state = STATE_INVALID; - } - } - break; - - case MultibyteAccelInfo::MAT_SHIFTGRAB: - { - /* - * For shift-grab matcher, we want two matches separated by anything; - * however the second vertex *must* be a negative (non-overlapping) match. - * - * Shiftgrab matcher is identical to shift except for presence of grab. - */ - if (d.state == STATE_WAITING_FOR_GRAB) { - if ((ref_cr & cur_cr).any()) { - d.state = STATE_INVALID; - } else { - d.state = STATE_FIRST_RUN; - d.len1++; - } - return; - } - } - /* no break, falling through */ - case MultibyteAccelInfo::MAT_SHIFT: - { - /* - * For shift-matcher, we want two matches separated by anything. - */ - if (ref_cr == cur_cr) { - // keep matching tail - switch (d.state) { - case STATE_FIRST_RUN: - d.state = STATE_FIRST_TAIL; - break; - case STATE_FIRST_TAIL: - d.tlen1++; - break; - default: - // shouldn't happen - assert(0); - } - } else { - switch (d.state) { - case STATE_FIRST_RUN: - // simply advance - d.len1++; - break; - case STATE_FIRST_TAIL: - // we found a non-matching char after tail, so stop - d.state = STATE_STOPPED; - break; - default: - // shouldn't happen - assert(0); - } - } - } - break; - - case MultibyteAccelInfo::MAT_DSHIFTGRAB: - { - /* - * For double shift-grab matcher, we want two matches separated by - * either negative matches or dots; however the second vertex *must* - * be a negative match. - * - * Doubleshiftgrab matcher is identical to doubleshift except for - * presence of grab. - */ - if (d.state == STATE_WAITING_FOR_GRAB) { - if ((ref_cr & cur_cr).any()) { - d.state = STATE_INVALID; - } else { - d.state = STATE_FIRST_RUN; - d.len1++; - } - return; - } - } - /* no break, falling through */ - case MultibyteAccelInfo::MAT_DSHIFT: - { - /* - * For double shift matcher, we want three matches, each separated - * by a lot of anything. - * - * Doubleshift matcher is complicated by presence of tails. 
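 * In the transitions below, a character matching the reference reach extends a tail (FIRST_RUN becomes FIRST_TAIL and opens the second run; SECOND_RUN becomes SECOND_TAIL), while a non-matching character lengthens the current shift (len1/len2), promotes FIRST_TAIL to SECOND_RUN, or stops the scheme from SECOND_TAIL.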
- */ - if (ref_cr == cur_cr) { - // decide if we are activating second shift or matching tails - switch (d.state) { - case STATE_FIRST_RUN: - d.state = STATE_FIRST_TAIL; - d.len2 = 1; // we're now ready for our second run - break; - case STATE_FIRST_TAIL: - d.tlen1++; - break; - case STATE_SECOND_RUN: - d.state = STATE_SECOND_TAIL; - break; - case STATE_SECOND_TAIL: - d.tlen2++; - break; - default: - // shouldn't happen - assert(0); - } - } else { - switch (d.state) { - case STATE_FIRST_RUN: - d.len1++; - break; - case STATE_FIRST_TAIL: - // start second run - d.state = STATE_SECOND_RUN; - d.len2++; - break; - case STATE_SECOND_RUN: - d.len2++; - break; - case STATE_SECOND_TAIL: - // stop - d.state = STATE_STOPPED; - break; - default: - // shouldn't happen - assert(0); - } - } - } - break; - - default: - // shouldn't happen - assert(0); - break; - } -} - -MultiaccelCompileHelper::MultiaccelCompileHelper(const CharReach &ref_cr, - u32 off, unsigned max_length) - : cr(ref_cr), offset(off), max_len(max_length) { - int accel_num = (int) MultibyteAccelInfo::MAT_MAX; - accels.resize(accel_num); - - // mark everything as valid - for (int i = 0; i < accel_num; i++) { - accel_data &ad = accels[i]; - ad.len1 = 1; - ad.type = (MultibyteAccelInfo::multiaccel_type) i; - - /* for shift-grab matchers, we are waiting for the grab right at the start */ - if (ad.type == MultibyteAccelInfo::MAT_SHIFTGRAB - || ad.type == MultibyteAccelInfo::MAT_DSHIFTGRAB) { - ad.state = STATE_WAITING_FOR_GRAB; - } else { - ad.state = STATE_FIRST_RUN; - } - } -} - -bool MultiaccelCompileHelper::canAdvance() { - for (const accel_data &ad : accels) { - if (ad.state != STATE_STOPPED && ad.state != STATE_INVALID) { - return true; - } - } - return false; -} - -void MultiaccelCompileHelper::advance(const CharReach &cur_cr) { - for (accel_data &ad : accels) { - if (ad.state == STATE_STOPPED || ad.state == STATE_INVALID) { - continue; - } - match(ad, cr, cur_cr); -#ifdef DEBUG - dumpMultiaccelState(ad); -#endif - } -} - -MultibyteAccelInfo MultiaccelCompileHelper::getBestScheme() { - int best_len = 0; - accel_data best; - - DEBUG_PRINTF("Stopping multiaccel compile\n"); - - for (accel_data &ad : accels) { - // stop our matching - stop(ad); - validate(ad, max_len); - -#ifdef DEBUG - dumpMultiaccelState(ad); -#endif - - // skip invalid schemes - if (ad.state == STATE_INVALID) { - continue; - } - DEBUG_PRINTF("Marking as viable\n"); - - // TODO: relative strengths of accel schemes? maybe e.g. a shorter - // long match would in some cases be preferable to a longer - // double shift match (for example, depending on length)? 
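 // Note: as written, the selection below is purely greedy: the scheme with the largest combined run length (len1 + len2) wins, and schemes later in the accels array win ties via the >= comparison.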
- int as_len = ad.len1 + ad.len2; - if (as_len >= best_len) { - DEBUG_PRINTF("Marking as best\n"); - best_len = as_len; - best = ad; - } - } - // if we found at least one accel scheme, return it - if (best.state != STATE_INVALID) { -#ifdef DEBUG - DEBUG_PRINTF("Picked best multiaccel state:\n"); - dumpMultiaccelState(best); -#endif - MultibyteAccelInfo info; - info.cr = cr; - info.offset = offset; - info.len1 = best.len1; - info.len2 = best.len2; - info.type = best.type; - return info; - } - return MultibyteAccelInfo(); -} diff --git a/src/nfa/multiaccel_doubleshift.h b/src/nfa/multiaccel_doubleshift.h deleted file mode 100644 index 7ed7534cf..000000000 --- a/src/nfa/multiaccel_doubleshift.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef MULTIACCEL_DOUBLESHIFT_H_ -#define MULTIACCEL_DOUBLESHIFT_H_ - -#include "multiaccel_common.h" - -#define DOUBLESHIFT_MATCH(len, match_t, match_sz) \ - static really_inline \ - const u8 * JOIN4(doubleshiftMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\ - if (unlikely(z)) { \ - match_t tmp = z; \ - z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \ - tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \ - VARISHIFT(z, z, len); \ - VARISHIFT(tmp, tmp, len2); \ - VARISHIFT(tmp, z, len); \ - return JOIN(match, match_sz)(buf, z); \ - } \ - return NULL; \ - } - -#define DOUBLESHIFT_MATCH_32_DEF(n) \ - DOUBLESHIFT_MATCH(n, u32, 32) -#define DOUBLESHIFT_MATCH_64_DEF(n) \ - DOUBLESHIFT_MATCH(n, u64a, 64) -#define DOUBLESHIFT_MATCH_DEF(n) \ - DOUBLESHIFT_MATCH_32_DEF(n) \ - DOUBLESHIFT_MATCH_64_DEF(n) - -DOUBLESHIFT_MATCH_DEF(1) -DOUBLESHIFT_MATCH_DEF(2) -DOUBLESHIFT_MATCH_DEF(3) -DOUBLESHIFT_MATCH_DEF(4) -DOUBLESHIFT_MATCH_DEF(5) -DOUBLESHIFT_MATCH_DEF(6) -DOUBLESHIFT_MATCH_DEF(7) -DOUBLESHIFT_MATCH_DEF(8) -DOUBLESHIFT_MATCH_DEF(9) -DOUBLESHIFT_MATCH_DEF(10) -DOUBLESHIFT_MATCH_DEF(11) -DOUBLESHIFT_MATCH_DEF(12) -DOUBLESHIFT_MATCH_DEF(13) -DOUBLESHIFT_MATCH_DEF(14) -DOUBLESHIFT_MATCH_DEF(15) -DOUBLESHIFT_MATCH_64_DEF(16) -DOUBLESHIFT_MATCH_64_DEF(17) -DOUBLESHIFT_MATCH_64_DEF(18) -DOUBLESHIFT_MATCH_64_DEF(19) -DOUBLESHIFT_MATCH_64_DEF(20) -DOUBLESHIFT_MATCH_64_DEF(21) -DOUBLESHIFT_MATCH_64_DEF(22) -DOUBLESHIFT_MATCH_64_DEF(23) -DOUBLESHIFT_MATCH_64_DEF(24) -DOUBLESHIFT_MATCH_64_DEF(25) -DOUBLESHIFT_MATCH_64_DEF(26) -DOUBLESHIFT_MATCH_64_DEF(27) -DOUBLESHIFT_MATCH_64_DEF(28) -DOUBLESHIFT_MATCH_64_DEF(29) -DOUBLESHIFT_MATCH_64_DEF(30) -DOUBLESHIFT_MATCH_64_DEF(31) - -static -const UNUSED u8 * (*doubleshift_match_funcs_32[])(const u8 *buf, u32 z, u32 len2) = -{ -// skip the first - 0, - &doubleshiftMatch_32_1, - &doubleshiftMatch_32_2, - &doubleshiftMatch_32_3, - &doubleshiftMatch_32_4, - &doubleshiftMatch_32_5, - &doubleshiftMatch_32_6, - &doubleshiftMatch_32_7, - &doubleshiftMatch_32_8, - &doubleshiftMatch_32_9, - &doubleshiftMatch_32_10, - &doubleshiftMatch_32_11, - &doubleshiftMatch_32_12, - &doubleshiftMatch_32_13, - &doubleshiftMatch_32_14, - &doubleshiftMatch_32_15, -}; - -static -const UNUSED u8 * (*doubleshift_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) = -{ -// skip the first - 0, - &doubleshiftMatch_64_1, - &doubleshiftMatch_64_2, - &doubleshiftMatch_64_3, - &doubleshiftMatch_64_4, - &doubleshiftMatch_64_5, - &doubleshiftMatch_64_6, - &doubleshiftMatch_64_7, - &doubleshiftMatch_64_8, - &doubleshiftMatch_64_9, - &doubleshiftMatch_64_10, - &doubleshiftMatch_64_11, - &doubleshiftMatch_64_12, - &doubleshiftMatch_64_13, - &doubleshiftMatch_64_14, - &doubleshiftMatch_64_15, - &doubleshiftMatch_64_16, - &doubleshiftMatch_64_17, - &doubleshiftMatch_64_18, - &doubleshiftMatch_64_19, - &doubleshiftMatch_64_20, - &doubleshiftMatch_64_21, - &doubleshiftMatch_64_22, - &doubleshiftMatch_64_23, - &doubleshiftMatch_64_24, - &doubleshiftMatch_64_25, - &doubleshiftMatch_64_26, - &doubleshiftMatch_64_27, - &doubleshiftMatch_64_28, - &doubleshiftMatch_64_29, - &doubleshiftMatch_64_30, - &doubleshiftMatch_64_31, -}; - -#endif /* MULTIACCEL_DOUBLESHIFT_H_ */ diff --git a/src/nfa/multiaccel_doubleshiftgrab.h b/src/nfa/multiaccel_doubleshiftgrab.h deleted file mode 100644 index 51955b4a6..000000000 --- a/src/nfa/multiaccel_doubleshiftgrab.h +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and 
binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef MULTIACCEL_DOUBLESHIFTGRAB_H_ -#define MULTIACCEL_DOUBLESHIFTGRAB_H_ - -#include "multiaccel_common.h" - -#define DOUBLESHIFTGRAB_MATCH(len, match_t, match_sz) \ - static really_inline \ - const u8 * JOIN4(doubleshiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z, u32 len2) {\ - if (unlikely(z)) { \ - match_t neg = ~z; \ - match_t tmp = z; \ - z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \ - tmp |= ((match_t) (1 << (len + len2)) - 1) << (match_sz / 2); \ - neg |= ((match_t) (1 << len) - 1) << (match_sz / 2); \ - VARISHIFT(z, z, len); \ - VARISHIFT(tmp, tmp, len2); \ - VARISHIFT(neg, z, 1); \ - VARISHIFT(tmp, z, len); \ - return JOIN(match, match_sz)(buf, z); \ - } \ - return NULL; \ - } - -#define DOUBLESHIFTGRAB_MATCH_32_DEF(n) \ - DOUBLESHIFTGRAB_MATCH(n, u32, 32) -#define DOUBLESHIFTGRAB_MATCH_64_DEF(n) \ - DOUBLESHIFTGRAB_MATCH(n, u64a, 64) -#define DOUBLESHIFTGRAB_MATCH_DEF(n) \ - DOUBLESHIFTGRAB_MATCH_32_DEF(n) \ - DOUBLESHIFTGRAB_MATCH_64_DEF(n) - -DOUBLESHIFTGRAB_MATCH_DEF(1) -DOUBLESHIFTGRAB_MATCH_DEF(2) -DOUBLESHIFTGRAB_MATCH_DEF(3) -DOUBLESHIFTGRAB_MATCH_DEF(4) -DOUBLESHIFTGRAB_MATCH_DEF(5) -DOUBLESHIFTGRAB_MATCH_DEF(6) -DOUBLESHIFTGRAB_MATCH_DEF(7) -DOUBLESHIFTGRAB_MATCH_DEF(8) -DOUBLESHIFTGRAB_MATCH_DEF(9) -DOUBLESHIFTGRAB_MATCH_DEF(10) -DOUBLESHIFTGRAB_MATCH_DEF(11) -DOUBLESHIFTGRAB_MATCH_DEF(12) -DOUBLESHIFTGRAB_MATCH_DEF(13) -DOUBLESHIFTGRAB_MATCH_DEF(14) -DOUBLESHIFTGRAB_MATCH_DEF(15) -DOUBLESHIFTGRAB_MATCH_64_DEF(16) -DOUBLESHIFTGRAB_MATCH_64_DEF(17) -DOUBLESHIFTGRAB_MATCH_64_DEF(18) -DOUBLESHIFTGRAB_MATCH_64_DEF(19) -DOUBLESHIFTGRAB_MATCH_64_DEF(20) -DOUBLESHIFTGRAB_MATCH_64_DEF(21) -DOUBLESHIFTGRAB_MATCH_64_DEF(22) -DOUBLESHIFTGRAB_MATCH_64_DEF(23) -DOUBLESHIFTGRAB_MATCH_64_DEF(24) -DOUBLESHIFTGRAB_MATCH_64_DEF(25) -DOUBLESHIFTGRAB_MATCH_64_DEF(26) -DOUBLESHIFTGRAB_MATCH_64_DEF(27) -DOUBLESHIFTGRAB_MATCH_64_DEF(28) -DOUBLESHIFTGRAB_MATCH_64_DEF(29) -DOUBLESHIFTGRAB_MATCH_64_DEF(30) -DOUBLESHIFTGRAB_MATCH_64_DEF(31) - -static -const UNUSED u8 * (*doubleshiftgrab_match_funcs_32[])(const u8 *buf, u32 
z, u32 len2) = -{ -// skip the first - 0, - &doubleshiftgrabMatch_32_1, - &doubleshiftgrabMatch_32_2, - &doubleshiftgrabMatch_32_3, - &doubleshiftgrabMatch_32_4, - &doubleshiftgrabMatch_32_5, - &doubleshiftgrabMatch_32_6, - &doubleshiftgrabMatch_32_7, - &doubleshiftgrabMatch_32_8, - &doubleshiftgrabMatch_32_9, - &doubleshiftgrabMatch_32_10, - &doubleshiftgrabMatch_32_11, - &doubleshiftgrabMatch_32_12, - &doubleshiftgrabMatch_32_13, - &doubleshiftgrabMatch_32_14, - &doubleshiftgrabMatch_32_15, -}; - -static -const UNUSED u8 * (*doubleshiftgrab_match_funcs_64[])(const u8 *buf, u64a z, u32 len2) = -{ -// skip the first - 0, - &doubleshiftgrabMatch_64_1, - &doubleshiftgrabMatch_64_2, - &doubleshiftgrabMatch_64_3, - &doubleshiftgrabMatch_64_4, - &doubleshiftgrabMatch_64_5, - &doubleshiftgrabMatch_64_6, - &doubleshiftgrabMatch_64_7, - &doubleshiftgrabMatch_64_8, - &doubleshiftgrabMatch_64_9, - &doubleshiftgrabMatch_64_10, - &doubleshiftgrabMatch_64_11, - &doubleshiftgrabMatch_64_12, - &doubleshiftgrabMatch_64_13, - &doubleshiftgrabMatch_64_14, - &doubleshiftgrabMatch_64_15, - &doubleshiftgrabMatch_64_16, - &doubleshiftgrabMatch_64_17, - &doubleshiftgrabMatch_64_18, - &doubleshiftgrabMatch_64_19, - &doubleshiftgrabMatch_64_20, - &doubleshiftgrabMatch_64_21, - &doubleshiftgrabMatch_64_22, - &doubleshiftgrabMatch_64_23, - &doubleshiftgrabMatch_64_24, - &doubleshiftgrabMatch_64_25, - &doubleshiftgrabMatch_64_26, - &doubleshiftgrabMatch_64_27, - &doubleshiftgrabMatch_64_28, - &doubleshiftgrabMatch_64_29, - &doubleshiftgrabMatch_64_30, - &doubleshiftgrabMatch_64_31, -}; - -#endif /* MULTIACCEL_DOUBLESHIFTGRAB_H_ */ diff --git a/src/nfa/multiaccel_long.h b/src/nfa/multiaccel_long.h deleted file mode 100644 index 515f0bc22..000000000 --- a/src/nfa/multiaccel_long.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef MULTIACCEL_LONG_H_ -#define MULTIACCEL_LONG_H_ - -#include "multiaccel_common.h" - -#define LONG_MATCH(len, match_t, match_sz) \ - static really_inline \ - const u8 * JOIN4(longMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \ - if (unlikely(z)) { \ - z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \ - JOIN(SHIFT, len)(z); \ - return JOIN(match, match_sz)(buf, z); \ - } \ - return NULL; \ - } - -#define LONG_MATCH_32_DEF(n) \ - LONG_MATCH(n, u32, 32) -#define LONG_MATCH_64_DEF(n) \ - LONG_MATCH(n, u64a, 64) -#define LONG_MATCH_DEF(n) \ - LONG_MATCH_32_DEF(n) \ - LONG_MATCH_64_DEF(n) - -LONG_MATCH_DEF(1) -LONG_MATCH_DEF(2) -LONG_MATCH_DEF(3) -LONG_MATCH_DEF(4) -LONG_MATCH_DEF(5) -LONG_MATCH_DEF(6) -LONG_MATCH_DEF(7) -LONG_MATCH_DEF(8) -LONG_MATCH_DEF(9) -LONG_MATCH_DEF(10) -LONG_MATCH_DEF(11) -LONG_MATCH_DEF(12) -LONG_MATCH_DEF(13) -LONG_MATCH_DEF(14) -LONG_MATCH_DEF(15) -LONG_MATCH_64_DEF(16) -LONG_MATCH_64_DEF(17) -LONG_MATCH_64_DEF(18) -LONG_MATCH_64_DEF(19) -LONG_MATCH_64_DEF(20) -LONG_MATCH_64_DEF(21) -LONG_MATCH_64_DEF(22) -LONG_MATCH_64_DEF(23) -LONG_MATCH_64_DEF(24) -LONG_MATCH_64_DEF(25) -LONG_MATCH_64_DEF(26) -LONG_MATCH_64_DEF(27) -LONG_MATCH_64_DEF(28) -LONG_MATCH_64_DEF(29) -LONG_MATCH_64_DEF(30) -LONG_MATCH_64_DEF(31) - -static -const UNUSED u8 *(*long_match_funcs_32[])(const u8 *buf, u32 z) = -{ - // skip the first three - 0, - &longMatch_32_1, - &longMatch_32_2, - &longMatch_32_3, - &longMatch_32_4, - &longMatch_32_5, - &longMatch_32_6, - &longMatch_32_7, - &longMatch_32_8, - &longMatch_32_9, - &longMatch_32_10, - &longMatch_32_11, - &longMatch_32_12, - &longMatch_32_13, - &longMatch_32_14, - &longMatch_32_15, - }; - -static -const UNUSED u8 *(*long_match_funcs_64[])(const u8 *buf, u64a z) = -{ -// skip the first three - 0, - &longMatch_64_1, - &longMatch_64_2, - &longMatch_64_3, - &longMatch_64_4, - &longMatch_64_5, - &longMatch_64_6, - &longMatch_64_7, - &longMatch_64_8, - &longMatch_64_9, - &longMatch_64_10, - &longMatch_64_11, - &longMatch_64_12, - &longMatch_64_13, - &longMatch_64_14, - &longMatch_64_15, - &longMatch_64_16, - &longMatch_64_17, - &longMatch_64_18, - &longMatch_64_19, - &longMatch_64_20, - &longMatch_64_21, - &longMatch_64_22, - &longMatch_64_23, - &longMatch_64_24, - &longMatch_64_25, - &longMatch_64_26, - &longMatch_64_27, - &longMatch_64_28, - &longMatch_64_29, - &longMatch_64_30, - &longMatch_64_31, -}; - -#endif /* MULTIACCEL_LONG_H_ */ diff --git a/src/nfa/multiaccel_longgrab.h b/src/nfa/multiaccel_longgrab.h deleted file mode 100644 index 09daaf82a..000000000 --- a/src/nfa/multiaccel_longgrab.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef MULTIACCEL_LONGGRAB_H_ -#define MULTIACCEL_LONGGRAB_H_ - -#include "multiaccel_common.h" - -#define LONGGRAB_MATCH(len, match_t, match_sz) \ - static really_inline \ - const u8 * JOIN4(longgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) { \ - if (unlikely(z)) { \ - match_t tmp = ~z; \ - tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \ - z |= ((match_t) (1 << (len - 1)) - 1) << (match_sz / 2); \ - JOIN(SHIFT, len)(z); \ - VARISHIFT(tmp, z, len); \ - return JOIN(match, match_sz)(buf, z); \ - } \ - return NULL; \ - } - -#define LONGGRAB_MATCH_32_DEF(n) \ - LONGGRAB_MATCH(n, u32, 32) -#define LONGGRAB_MATCH_64_DEF(n) \ - LONGGRAB_MATCH(n, u64a, 64) -#define LONGGRAB_MATCH_DEF(n) \ - LONGGRAB_MATCH_32_DEF(n) \ - LONGGRAB_MATCH_64_DEF(n) - -LONGGRAB_MATCH_DEF(1) -LONGGRAB_MATCH_DEF(2) -LONGGRAB_MATCH_DEF(3) -LONGGRAB_MATCH_DEF(4) -LONGGRAB_MATCH_DEF(5) -LONGGRAB_MATCH_DEF(6) -LONGGRAB_MATCH_DEF(7) -LONGGRAB_MATCH_DEF(8) -LONGGRAB_MATCH_DEF(9) -LONGGRAB_MATCH_DEF(10) -LONGGRAB_MATCH_DEF(11) -LONGGRAB_MATCH_DEF(12) -LONGGRAB_MATCH_DEF(13) -LONGGRAB_MATCH_DEF(14) -LONGGRAB_MATCH_DEF(15) -LONGGRAB_MATCH_64_DEF(16) -LONGGRAB_MATCH_64_DEF(17) -LONGGRAB_MATCH_64_DEF(18) -LONGGRAB_MATCH_64_DEF(19) -LONGGRAB_MATCH_64_DEF(20) -LONGGRAB_MATCH_64_DEF(21) -LONGGRAB_MATCH_64_DEF(22) -LONGGRAB_MATCH_64_DEF(23) -LONGGRAB_MATCH_64_DEF(24) -LONGGRAB_MATCH_64_DEF(25) -LONGGRAB_MATCH_64_DEF(26) -LONGGRAB_MATCH_64_DEF(27) -LONGGRAB_MATCH_64_DEF(28) -LONGGRAB_MATCH_64_DEF(29) -LONGGRAB_MATCH_64_DEF(30) -LONGGRAB_MATCH_64_DEF(31) - -static -const UNUSED u8 *(*longgrab_match_funcs_32[])(const u8 *buf, u32 z) = -{ -// skip the first three - 0, - &longgrabMatch_32_1, - &longgrabMatch_32_2, - &longgrabMatch_32_3, - &longgrabMatch_32_4, - &longgrabMatch_32_5, - &longgrabMatch_32_6, - &longgrabMatch_32_7, - &longgrabMatch_32_8, - &longgrabMatch_32_9, - &longgrabMatch_32_10, - &longgrabMatch_32_11, - &longgrabMatch_32_12, - &longgrabMatch_32_13, - &longgrabMatch_32_14, - &longgrabMatch_32_15, - }; - -static -const UNUSED u8 *(*longgrab_match_funcs_64[])(const u8 *buf, u64a z) = -{ -// skip the first three - 0, - &longgrabMatch_64_1, - &longgrabMatch_64_2, - &longgrabMatch_64_3, - &longgrabMatch_64_4, - &longgrabMatch_64_5, - &longgrabMatch_64_6, - &longgrabMatch_64_7, - &longgrabMatch_64_8, - &longgrabMatch_64_9, - &longgrabMatch_64_10, - &longgrabMatch_64_11, - &longgrabMatch_64_12, - &longgrabMatch_64_13, - &longgrabMatch_64_14, - &longgrabMatch_64_15, - &longgrabMatch_64_16, - &longgrabMatch_64_17, - &longgrabMatch_64_18, - &longgrabMatch_64_19, - &longgrabMatch_64_20, - &longgrabMatch_64_21, - &longgrabMatch_64_22, - &longgrabMatch_64_23, - &longgrabMatch_64_24, - &longgrabMatch_64_25, - 
&longgrabMatch_64_26, - &longgrabMatch_64_27, - &longgrabMatch_64_28, - &longgrabMatch_64_29, - &longgrabMatch_64_30, - &longgrabMatch_64_31, -}; - -#endif /* MULTIACCEL_LONGGRAB_H_ */ diff --git a/src/nfa/multiaccel_shift.h b/src/nfa/multiaccel_shift.h deleted file mode 100644 index fd362a8b6..000000000 --- a/src/nfa/multiaccel_shift.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef MULTIACCEL_SHIFT_H_ -#define MULTIACCEL_SHIFT_H_ - -#include "multiaccel_common.h" - -#define SHIFT_MATCH(len, match_t, match_sz) \ - static really_inline \ - const u8 * JOIN4(shiftMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\ - if (unlikely(z)) { \ - z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \ - VARISHIFT(z, z, len); \ - return JOIN(match, match_sz)(buf, z); \ - } \ - return NULL; \ - } - -#define SHIFT_MATCH_32_DEF(n) \ - SHIFT_MATCH(n, u32, 32) -#define SHIFT_MATCH_64_DEF(n) \ - SHIFT_MATCH(n, u64a, 64) -#define SHIFT_MATCH_DEF(n) \ - SHIFT_MATCH_32_DEF(n) \ - SHIFT_MATCH_64_DEF(n) - -SHIFT_MATCH_DEF(1) -SHIFT_MATCH_DEF(2) -SHIFT_MATCH_DEF(3) -SHIFT_MATCH_DEF(4) -SHIFT_MATCH_DEF(5) -SHIFT_MATCH_DEF(6) -SHIFT_MATCH_DEF(7) -SHIFT_MATCH_DEF(8) -SHIFT_MATCH_DEF(9) -SHIFT_MATCH_DEF(10) -SHIFT_MATCH_DEF(11) -SHIFT_MATCH_DEF(12) -SHIFT_MATCH_DEF(13) -SHIFT_MATCH_DEF(14) -SHIFT_MATCH_DEF(15) -SHIFT_MATCH_64_DEF(16) -SHIFT_MATCH_64_DEF(17) -SHIFT_MATCH_64_DEF(18) -SHIFT_MATCH_64_DEF(19) -SHIFT_MATCH_64_DEF(20) -SHIFT_MATCH_64_DEF(21) -SHIFT_MATCH_64_DEF(22) -SHIFT_MATCH_64_DEF(23) -SHIFT_MATCH_64_DEF(24) -SHIFT_MATCH_64_DEF(25) -SHIFT_MATCH_64_DEF(26) -SHIFT_MATCH_64_DEF(27) -SHIFT_MATCH_64_DEF(28) -SHIFT_MATCH_64_DEF(29) -SHIFT_MATCH_64_DEF(30) -SHIFT_MATCH_64_DEF(31) - -static -const UNUSED u8 * (*shift_match_funcs_32[])(const u8 *buf, u32 z) = -{ -// skip the first - 0, - &shiftMatch_32_1, - &shiftMatch_32_2, - &shiftMatch_32_3, - &shiftMatch_32_4, - &shiftMatch_32_5, - &shiftMatch_32_6, - &shiftMatch_32_7, - &shiftMatch_32_8, - &shiftMatch_32_9, - &shiftMatch_32_10, - &shiftMatch_32_11, - &shiftMatch_32_12, - &shiftMatch_32_13, - &shiftMatch_32_14, - &shiftMatch_32_15, -}; - -static -const UNUSED u8 * (*shift_match_funcs_64[])(const u8 *buf, u64a z) = -{ -// skip the first - 0, - &shiftMatch_64_1, - &shiftMatch_64_2, - &shiftMatch_64_3, - &shiftMatch_64_4, - &shiftMatch_64_5, - &shiftMatch_64_6, - &shiftMatch_64_7, - &shiftMatch_64_8, - &shiftMatch_64_9, - &shiftMatch_64_10, - &shiftMatch_64_11, - &shiftMatch_64_12, - &shiftMatch_64_13, - &shiftMatch_64_14, - &shiftMatch_64_15, - &shiftMatch_64_16, - &shiftMatch_64_17, - &shiftMatch_64_18, - &shiftMatch_64_19, - &shiftMatch_64_20, - &shiftMatch_64_21, - &shiftMatch_64_22, - &shiftMatch_64_23, - &shiftMatch_64_24, - &shiftMatch_64_25, - &shiftMatch_64_26, - &shiftMatch_64_27, - &shiftMatch_64_28, - &shiftMatch_64_29, - &shiftMatch_64_30, - &shiftMatch_64_31, -}; - -#endif /* MULTIACCEL_SHIFT_H_ */ diff --git a/src/nfa/multiaccel_shiftgrab.h b/src/nfa/multiaccel_shiftgrab.h deleted file mode 100644 index 032ed0865..000000000 --- a/src/nfa/multiaccel_shiftgrab.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef MULTIACCEL_SHIFTGRAB_H_ -#define MULTIACCEL_SHIFTGRAB_H_ - -#include "multiaccel_common.h" - -#define SHIFTGRAB_MATCH(len, match_t, match_sz) \ - static really_inline \ - const u8 * JOIN4(shiftgrabMatch_, match_sz, _, len)(const u8 *buf, match_t z) {\ - if (unlikely(z)) { \ - match_t tmp = ~z; \ - z |= ((match_t) (1 << (len)) - 1) << (match_sz / 2); \ - tmp |= ((match_t) (1 << len) - 1) << (match_sz / 2); \ - VARISHIFT(z, z, len); \ - VARISHIFT(tmp, z, 1); \ - return JOIN(match, match_sz)(buf, z); \ - } \ - return NULL; \ - } - -#define SHIFTGRAB_MATCH_32_DEF(n) \ - SHIFTGRAB_MATCH(n, u32, 32) -#define SHIFTGRAB_MATCH_64_DEF(n) \ - SHIFTGRAB_MATCH(n, u64a, 64) -#define SHIFTGRAB_MATCH_DEF(n) \ - SHIFTGRAB_MATCH_32_DEF(n) \ - SHIFTGRAB_MATCH_64_DEF(n) - -SHIFTGRAB_MATCH_DEF(1) -SHIFTGRAB_MATCH_DEF(2) -SHIFTGRAB_MATCH_DEF(3) -SHIFTGRAB_MATCH_DEF(4) -SHIFTGRAB_MATCH_DEF(5) -SHIFTGRAB_MATCH_DEF(6) -SHIFTGRAB_MATCH_DEF(7) -SHIFTGRAB_MATCH_DEF(8) -SHIFTGRAB_MATCH_DEF(9) -SHIFTGRAB_MATCH_DEF(10) -SHIFTGRAB_MATCH_DEF(11) -SHIFTGRAB_MATCH_DEF(12) -SHIFTGRAB_MATCH_DEF(13) -SHIFTGRAB_MATCH_DEF(14) -SHIFTGRAB_MATCH_DEF(15) -SHIFTGRAB_MATCH_64_DEF(16) -SHIFTGRAB_MATCH_64_DEF(17) -SHIFTGRAB_MATCH_64_DEF(18) -SHIFTGRAB_MATCH_64_DEF(19) -SHIFTGRAB_MATCH_64_DEF(20) -SHIFTGRAB_MATCH_64_DEF(21) -SHIFTGRAB_MATCH_64_DEF(22) -SHIFTGRAB_MATCH_64_DEF(23) -SHIFTGRAB_MATCH_64_DEF(24) -SHIFTGRAB_MATCH_64_DEF(25) -SHIFTGRAB_MATCH_64_DEF(26) -SHIFTGRAB_MATCH_64_DEF(27) -SHIFTGRAB_MATCH_64_DEF(28) -SHIFTGRAB_MATCH_64_DEF(29) -SHIFTGRAB_MATCH_64_DEF(30) -SHIFTGRAB_MATCH_64_DEF(31) - -static -const UNUSED u8 * (*shiftgrab_match_funcs_32[])(const u8 *buf, u32 z) = -{ -// skip the first - 0, - &shiftgrabMatch_32_1, - &shiftgrabMatch_32_2, - &shiftgrabMatch_32_3, - &shiftgrabMatch_32_4, - &shiftgrabMatch_32_5, - &shiftgrabMatch_32_6, - &shiftgrabMatch_32_7, - &shiftgrabMatch_32_8, - &shiftgrabMatch_32_9, - &shiftgrabMatch_32_10, - &shiftgrabMatch_32_11, - &shiftgrabMatch_32_12, - &shiftgrabMatch_32_13, - &shiftgrabMatch_32_14, - &shiftgrabMatch_32_15, -}; - -static -const UNUSED u8 * (*shiftgrab_match_funcs_64[])(const u8 *buf, u64a z) = - { -// skip the first - 0, - &shiftgrabMatch_64_1, - &shiftgrabMatch_64_2, - &shiftgrabMatch_64_3, - &shiftgrabMatch_64_4, - &shiftgrabMatch_64_5, - &shiftgrabMatch_64_6, - &shiftgrabMatch_64_7, - &shiftgrabMatch_64_8, - &shiftgrabMatch_64_9, - &shiftgrabMatch_64_10, - &shiftgrabMatch_64_11, - &shiftgrabMatch_64_12, - &shiftgrabMatch_64_13, - &shiftgrabMatch_64_14, - &shiftgrabMatch_64_15, - &shiftgrabMatch_64_16, - &shiftgrabMatch_64_17, - &shiftgrabMatch_64_18, - &shiftgrabMatch_64_19, - &shiftgrabMatch_64_20, - &shiftgrabMatch_64_21, - &shiftgrabMatch_64_22, - &shiftgrabMatch_64_23, 
- &shiftgrabMatch_64_24, - &shiftgrabMatch_64_25, - &shiftgrabMatch_64_26, - &shiftgrabMatch_64_27, - &shiftgrabMatch_64_28, - &shiftgrabMatch_64_29, - &shiftgrabMatch_64_30, - &shiftgrabMatch_64_31, -}; - -#endif /* MULTIACCEL_SHIFTGRAB_H_ */ diff --git a/src/nfa/multishufti.c b/src/nfa/multishufti.c deleted file mode 100644 index cb85b7186..000000000 --- a/src/nfa/multishufti.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Shufti: character class acceleration. 
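 * Two 16-byte shuffle masks are indexed by the low and high nibbles of each input byte; a byte belongs to the class when the AND of the two table lookups is non-zero, so one pair of pshufb lookups classifies 16 bytes at once.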
- * - * Utilises the SSSE3 pshufb shuffle instruction - */ - -#include "config.h" -#include "ue2common.h" - -#include "multishufti.h" - -#include "multiaccel_common.h" - -#if !defined(__AVX2__) - -#define MATCH_ALGO long_ -#include "multiaccel_long.h" -#include "multishufti_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO longgrab_ -#include "multiaccel_longgrab.h" -#include "multishufti_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shift_ -#include "multiaccel_shift.h" -#include "multishufti_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shiftgrab_ -#include "multiaccel_shiftgrab.h" -#include "multishufti_sse.h" -#undef MATCH_ALGO - -#define MULTIACCEL_DOUBLE - -#define MATCH_ALGO doubleshift_ -#include "multiaccel_doubleshift.h" -#include "multishufti_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO doubleshiftgrab_ -#include "multiaccel_doubleshiftgrab.h" -#include "multishufti_sse.h" -#undef MATCH_ALGO - -#undef MULTIACCEL_DOUBLE - -#else - -#define MATCH_ALGO long_ -#include "multiaccel_long.h" -#include "multishufti_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO longgrab_ -#include "multiaccel_longgrab.h" -#include "multishufti_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shift_ -#include "multiaccel_shift.h" -#include "multishufti_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shiftgrab_ -#include "multiaccel_shiftgrab.h" -#include "multishufti_avx2.h" -#undef MATCH_ALGO - -#define MULTIACCEL_DOUBLE - -#define MATCH_ALGO doubleshift_ -#include "multiaccel_doubleshift.h" -#include "multishufti_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO doubleshiftgrab_ -#include "multiaccel_doubleshiftgrab.h" -#include "multishufti_avx2.h" -#undef MATCH_ALGO - -#undef MULTIACCEL_DOUBLE - -#endif diff --git a/src/nfa/multishufti_avx2.h b/src/nfa/multishufti_avx2.h deleted file mode 100644 index 042f55707..000000000 --- a/src/nfa/multishufti_avx2.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "shufti_common.h" - -#include "ue2common.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" - -static really_inline -const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 mask_lo, m256 mask_hi, m256 chars, - const u8 *buf, const m256 low4bits, - const m256 zeroes, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); - return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, ~z -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); -} - -const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi, - const u8 *buf, - const u8 *buf_end, u8 run_len -#ifdef MULTIACCEL_DOUBLE - , u8 run_len2 -#endif - ) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 32) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m256 zeroes = zeroes256(); - const m256 low4bits = set32x8(0xf); - const m256 wide_mask_lo = set2x128(mask_lo); - const m256 wide_mask_hi = set2x128(mask_hi); - const u8 *rv; - - size_t min = (size_t)buf % 32; - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. - m256 chars = loadu256(buf); - rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf, - low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - buf += (32 - min); - - // Unrolling was here, but it wasn't doing anything but taking up space. - // Reroll FTW. - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); - rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, lchars, buf, - low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - buf += 32; - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. - assert(buf <= buf_end && buf >= buf_end - 32); - chars = loadu256(buf_end - 32); - rv = JOIN(MATCH_ALGO, fwdBlock)(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, - low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - - return buf_end; -} diff --git a/src/nfa/multishufti_sse.h b/src/nfa/multishufti_sse.h deleted file mode 100644 index 0a9b543ee..000000000 --- a/src/nfa/multishufti_sse.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "shufti_common.h" - -#include "ue2common.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" - -/* Normal SSSE3 shufti */ - -static really_inline -const u8 *JOIN(MATCH_ALGO, fwdBlock)(m128 mask_lo, m128 mask_hi, m128 chars, - const u8 *buf, const m128 low4bits, - const m128 zeroes, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - // negate first 16 bits - u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes) ^ 0xFFFF; - return (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])(buf, z -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); -} - -/* - * 16-byte pipeline, for smaller scans - */ -static -const u8 *JOIN(MATCH_ALGO, shuftiPipeline16)(m128 mask_lo, m128 mask_hi, - const u8 *buf, const u8 *buf_end, - const m128 low4bits, - const m128 zeroes, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8* ptr, *last_buf; - u32 last_res; - - // pipeline prologue: scan first 16 bytes - m128 data = load128(buf); - u32 z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF; - last_buf = buf; - last_res = z; - buf += 16; - - // now, start the pipeline! - assert((size_t)buf % 16 == 0); - for (; buf + 15 < buf_end; buf += 16) { - // scan more data - data = load128(buf); - z = block(mask_lo, mask_hi, data, low4bits, zeroes) ^ 0xFFFF; - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_buf = buf; - last_res = z; - } - assert(buf <= buf_end && buf >= buf_end - 16); - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - return NULL; -} - -/* - * 32-byte pipeline, for bigger scans - */ -static -const u8 *JOIN(MATCH_ALGO, shuftiPipeline32)(m128 mask_lo, m128 mask_hi, - const u8 *buf, const u8 *buf_end, - const m128 low4bits, - const m128 zeroes, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8* ptr, *last_buf; - u32 res; - - // pipeline prologue: scan first 32 bytes - m128 data1 = load128(buf); - u32 z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF; - m128 data2 = load128(buf + 16); - u32 z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF; - - // store the results - u32 last_res = z1 | (z2 << 16); - last_buf = buf; - buf += 32; - - - // now, start the pipeline! 
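 // Software pipelining: each iteration computes the shufti block result for the current 32 bytes while running the branchy match confirm on the previous iteration's result, hiding the confirm latency behind the next loads.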
- assert((size_t)buf % 16 == 0); - for (; buf + 31 < buf_end; buf += 32) { - // scan more data - data1 = load128(buf); - z1 = block(mask_lo, mask_hi, data1, low4bits, zeroes) ^ 0xFFFF; - data2 = load128(buf + 16); - z2 = block(mask_lo, mask_hi, data2, low4bits, zeroes) ^ 0xFFFF; - res = z1 | (z2 << 16); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_res = res; - last_buf = buf; - } - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - // if we still have some data left, scan it too - for (; buf + 15 < buf_end; buf += 16) { - m128 chars = load128(buf); - ptr = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf, - low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - } - assert(buf <= buf_end && buf >= buf_end - 16); - - return NULL; -} - -const u8 *JOIN(MATCH_ALGO, shuftiExec)(m128 mask_lo, m128 mask_hi, - const u8 *buf, - const u8 *buf_end, u8 run_len -#ifdef MULTIACCEL_DOUBLE - , u8 run_len2 -#endif - ) { - assert(buf && buf_end); - assert(buf < buf_end); - - // Slow path for small cases. - if (buf_end - buf < 16) { - return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, - buf, buf_end); - } - - const m128 zeroes = zeroes128(); - const m128 low4bits = _mm_set1_epi8(0xf); - const u8 *rv; - - size_t min = (size_t)buf % 16; - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. - m128 chars = loadu128(buf); - rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, buf, - low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - buf += (16 - min); - - // if we have enough data, run bigger pipeline; otherwise run smaller one - if (buf_end - buf >= 128) { - rv = JOIN(MATCH_ALGO, shuftiPipeline32)(mask_lo, mask_hi, - buf, buf_end, low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(rv)) { - return rv; - } - } else if (buf_end - buf >= 16){ - rv = JOIN(MATCH_ALGO, shuftiPipeline16)(mask_lo, mask_hi, - buf, buf_end, low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(rv)) { - return rv; - } - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. - chars = loadu128(buf_end - 16); - rv = JOIN(MATCH_ALGO, fwdBlock)(mask_lo, mask_hi, chars, - buf_end - 16, low4bits, zeroes, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - - return buf_end; -} diff --git a/src/nfa/multitruffle.c b/src/nfa/multitruffle.c deleted file mode 100644 index 381bda936..000000000 --- a/src/nfa/multitruffle.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
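The Pipeline16/Pipeline32 helpers in this family are software pipelines: the SIMD mask for block N is computed before the branchy match confirmation of block N-1 runs, so the two overlap and the confirm latency is hidden. A minimal sketch of that control structure; scan_block() and confirm() are hypothetical stand-ins for the real block()/match_funcs calls:

#include <stdint.h>

uint32_t scan_block(const uint8_t *buf);                /* hypothetical */
const uint8_t *confirm(const uint8_t *buf, uint32_t z); /* hypothetical */

/* assumes buf_end - buf >= 16 */
static const uint8_t *pipeline_sketch(const uint8_t *buf,
                                      const uint8_t *buf_end) {
    /* prologue: scan the first block, but defer its confirmation */
    uint32_t last_z = scan_block(buf);
    const uint8_t *last_buf = buf;

    for (buf += 16; buf + 15 < buf_end; buf += 16) {
        uint32_t z = scan_block(buf);                 /* issue block N   */
        const uint8_t *p = confirm(last_buf, last_z); /* confirm N-1     */
        if (p) {
            return p;
        }
        last_buf = buf;
        last_z = z;
    }

    /* epilogue: confirm the final in-flight block */
    return confirm(last_buf, last_z);
}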
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "ue2common.h" - -#include "multitruffle.h" -#include "util/bitutils.h" -#include "util/simd_utils.h" - -#include "multiaccel_common.h" - -#if !defined(__AVX2__) - -#define MATCH_ALGO long_ -#include "multiaccel_long.h" -#include "multitruffle_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO longgrab_ -#include "multiaccel_longgrab.h" -#include "multitruffle_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shift_ -#include "multiaccel_shift.h" -#include "multitruffle_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shiftgrab_ -#include "multiaccel_shiftgrab.h" -#include "multitruffle_sse.h" -#undef MATCH_ALGO - -#define MULTIACCEL_DOUBLE - -#define MATCH_ALGO doubleshift_ -#include "multiaccel_doubleshift.h" -#include "multitruffle_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO doubleshiftgrab_ -#include "multiaccel_doubleshiftgrab.h" -#include "multitruffle_sse.h" -#undef MATCH_ALGO - -#undef MULTIACCEL_DOUBLE - -#else - -#define MATCH_ALGO long_ -#include "multiaccel_long.h" -#include "multitruffle_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO longgrab_ -#include "multiaccel_longgrab.h" -#include "multitruffle_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shift_ -#include "multiaccel_shift.h" -#include "multitruffle_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shiftgrab_ -#include "multiaccel_shiftgrab.h" -#include "multitruffle_avx2.h" -#undef MATCH_ALGO - -#define MULTIACCEL_DOUBLE - -#define MATCH_ALGO doubleshift_ -#include "multiaccel_doubleshift.h" -#include "multitruffle_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO doubleshiftgrab_ -#include "multiaccel_doubleshiftgrab.h" -#include "multitruffle_avx2.h" -#undef MATCH_ALGO - -#undef MULTIACCEL_DOUBLE - -#endif diff --git a/src/nfa/multitruffle.h b/src/nfa/multitruffle.h deleted file mode 100644 index 8703b5ca3..000000000 --- a/src/nfa/multitruffle.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. 
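multitruffle.c above (like multishufti.c and multivermicelli.c below) stamps out one specialised matcher per acceleration scheme by re-including a template header with a different MATCH_ALGO prefix each time; JOIN() in those headers token-pastes the prefix onto every function name. A compilable miniature of the same pattern with illustrative names; the real code re-includes a header between #define/#undef rather than using a function-defining macro:

#include <stdio.h>

#define JOIN_(a, b) a##b
#define JOIN(a, b) JOIN_(a, b)

/* one "template" body, stamped out once per prefix */
#define DEFINE_EXEC(prefix)                       \
    static void JOIN(prefix, exec)(void) {        \
        printf("%s\n", #prefix "exec");           \
    }

DEFINE_EXEC(long_)  /* defines long_exec()  */
DEFINE_EXEC(shift_) /* defines shift_exec() */

int main(void) {
    long_exec();
    shift_exec();
    return 0;
}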
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef MULTITRUFFLE_H -#define MULTITRUFFLE_H - -/** \file - * \brief Multitruffle: multibyte version of Truffle. - * - * Utilises the SSSE3 pshufb shuffle instruction - */ - -#include "util/simd_types.h" - -#ifdef __cplusplus -extern "C" -{ -#endif - -const u8 *long_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end, const u8 run_len); - -const u8 *longgrab_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end, const u8 run_len); - -const u8 *shift_truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end, const u8 run_len); - -const u8 *shiftgrab_truffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, const u8 *buf, - const u8 *buf_end, const u8 run_len); - -const u8 *doubleshift_truffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, const u8 *buf, - const u8 *buf_end, const u8 run_len, - const u8 run2_len); - -const u8 *doubleshiftgrab_truffleExec(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, const u8 *buf, - const u8 *buf_end, const u8 run_len, - const u8 run2_len); - -#ifdef __cplusplus -} -#endif - - -#endif /* MULTITRUFFLE_H */ diff --git a/src/nfa/multitruffle_avx2.h b/src/nfa/multitruffle_avx2.h deleted file mode 100644 index e52db5fc9..000000000 --- a/src/nfa/multitruffle_avx2.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Matches a byte in a charclass using three shuffles - */ - -#include "config.h" -#include "ue2common.h" -#include "multiaccel_common.h" - -/* - * include "block" function - */ -#include "truffle_common.h" - -/* - * single-byte truffle fwd match function, should only be defined when not - * compiling multiaccel - */ -static really_inline -const u8 *JOIN(MATCH_ALGO, fwdBlock)(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, - m256 v, const u8 *buf, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - u64a z = (u64a) block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); - return (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len])(buf, z ^ 0xFFFFFFFF -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); -} - -const u8 *JOIN(MATCH_ALGO, truffleExec)(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - DEBUG_PRINTF("run_len %zu\n", buf_end - buf); - const m256 wide_clear = set2x128(shuf_mask_lo_highclear); - const m256 wide_set = set2x128(shuf_mask_lo_highset); - - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - if (buf_end - buf < 32) { - return truffleMini(wide_clear, wide_set, buf, buf_end); - } - - size_t min = (size_t)buf % 32; - assert(buf_end - buf >= 32); - - // Preconditioning: most of the time our buffer won't be aligned. - m256 chars = loadu256(buf); - rv = JOIN(MATCH_ALGO, fwdBlock)(wide_clear, wide_set, chars, buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - buf += (32 - min); - - const u8 *last_block = buf_end - 32; - while (buf < last_block) { - m256 lchars = load256(buf); - rv = JOIN(MATCH_ALGO, fwdBlock)(wide_clear, wide_set, lchars, - buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - buf += 32; - } - - // Use an unaligned load to mop up the last 32 bytes and get an accurate - // picture to buf_end. 
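Truffle, being deleted here in its multibyte form, covers an arbitrary set of 256 byte values with just two 16-byte tables. Going by the block() helper in truffle_common.h, the per-byte predicate can be modelled in scalar code as below; this is a sketch for orientation, not the library's implementation:

#include <stdint.h>

/* Scalar model of the truffle test: the byte's low nibble indexes one of
 * two bucket tables (bit 7 selects highclear vs highset), and bits 4..6
 * pick the bucket bit that must be set for a match. */
static inline int truffle_byte_matches(const uint8_t highclear[16],
                                       const uint8_t highset[16], uint8_t c) {
    const uint8_t *table = (c & 0x80) ? highset : highclear;
    return (table[c & 0xf] >> ((c >> 4) & 0x7)) & 1;
}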
- assert(buf <= buf_end && buf >= buf_end - 32); - chars = loadu256(buf_end - 32); - rv = JOIN(MATCH_ALGO, fwdBlock)(wide_clear, wide_set, chars, - buf_end - 32, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - - return buf_end; -} diff --git a/src/nfa/multitruffle_sse.h b/src/nfa/multitruffle_sse.h deleted file mode 100644 index b287e4fc4..000000000 --- a/src/nfa/multitruffle_sse.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "config.h" -#include "ue2common.h" -#include "multiaccel_common.h" - -/* - * include "block" function - */ -#include "truffle_common.h" - -/* - * single-byte truffle fwd match function, should only be defined when not - * compiling multiaccel - */ - -static really_inline -const u8 *JOIN(MATCH_ALGO, fwdBlock)(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - m128 v, const u8 *buf, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v) ^ 0xFFFF; - return (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len])(buf, z -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); -} - -/* - * 16-byte pipeline, for smaller scans - */ -static -const u8 *JOIN(MATCH_ALGO, trufflePipeline16)(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8* ptr, *last_buf; - u32 last_res; - - // pipeline prologue: scan first 16 bytes - m128 data = load128(buf); - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data) ^ 0xFFFF; - last_buf = buf; - last_res = z; - buf += 16; - - // now, start the pipeline! 
- assert((size_t)buf % 16 == 0); - for (; buf + 15 < buf_end; buf += 16) { - // scan more data - data = load128(buf); - z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data) ^ 0xFFFF; - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_buf = buf; - last_res = z; - } - assert(buf <= buf_end && buf >= buf_end - 16); - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - return NULL; -} - -/* - * 32-byte pipeline, for bigger scans - */ -static -const u8 *JOIN(MATCH_ALGO, trufflePipeline32)(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8* ptr, *last_buf; - u32 res; - - // pipeline prologue: scan first 32 bytes - m128 data1 = load128(buf); - u32 z1 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data1) ^ 0xFFFF; - m128 data2 = load128(buf + 16); - u32 z2 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data2) ^ 0xFFFF; - - // store the results - u32 last_res = z1 | (z2 << 16); - last_buf = buf; - buf += 32; - - - // now, start the pipeline! - assert((size_t)buf % 16 == 0); - for (; buf + 31 < buf_end; buf += 32) { - // scan more data - data1 = load128(buf); - z1 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data1) ^ 0xFFFF; - data2 = load128(buf + 16); - z2 = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, data2) ^ 0xFFFF; - res = z1 | (z2 << 16); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_res = res; - last_buf = buf; - } - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - // if we still have some data left, scan it too - for (; buf + 15 < buf_end; buf += 16) { - m128 chars = load128(buf); - ptr = JOIN(MATCH_ALGO, fwdBlock)(shuf_mask_lo_highclear, shuf_mask_lo_highset, - chars, buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - } - assert(buf <= buf_end && buf >= buf_end - 16); - - return NULL; -} - -const u8 *JOIN(MATCH_ALGO, truffleExec)(m128 shuf_mask_lo_highclear, - m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end, const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - DEBUG_PRINTF("run_len %zu\n", buf_end - buf); - - assert(buf && buf_end); - assert(buf < buf_end); - const u8 *rv; - - if (buf_end - buf < 16) { - return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, buf_end); - } - - size_t min = (size_t)buf % 16; - assert(buf_end - buf >= 16); - - // Preconditioning: most of the time our buffer won't be aligned. 
- m128 chars = loadu128(buf); - rv = JOIN(MATCH_ALGO, fwdBlock)(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - buf += (16 - min); - - // if we have enough data, run bigger pipeline; otherwise run smaller one - if (buf_end - buf >= 128) { - rv = JOIN(MATCH_ALGO, trufflePipeline32)(shuf_mask_lo_highclear, shuf_mask_lo_highset, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(rv)) { - return rv; - } - } else if (buf_end - buf >= 16){ - rv = JOIN(MATCH_ALGO, trufflePipeline16)(shuf_mask_lo_highclear, shuf_mask_lo_highset, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(rv)) { - return rv; - } - } - - // Use an unaligned load to mop up the last 16 bytes and get an accurate - // picture to buf_end. - chars = loadu128(buf_end - 16); - rv = JOIN(MATCH_ALGO, fwdBlock)(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, - buf_end - 16, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (rv) { - return rv; - } - - return buf_end; -} diff --git a/src/nfa/multivermicelli.c b/src/nfa/multivermicelli.c deleted file mode 100644 index ab6d2cf21..000000000 --- a/src/nfa/multivermicelli.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
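The *Exec wrappers in these files all walk the buffer the same way: one unaligned load covers the ragged head, aligned loads (or the pipelines) do the bulk, and a final unaligned load ending exactly at buf_end re-covers the tail, overlapping already-scanned bytes instead of dropping to a byte loop. A skeleton of that scheme, with hypothetical block_unaligned()/block_aligned() helpers returning a match pointer or NULL:

#include <stdint.h>

const uint8_t *block_unaligned(const uint8_t *buf); /* hypothetical */
const uint8_t *block_aligned(const uint8_t *buf);   /* hypothetical */

/* assumes buf_end - buf >= 16 */
static const uint8_t *scan_skeleton(const uint8_t *buf,
                                    const uint8_t *buf_end) {
    const uint8_t *rv;

    /* head: one unaligned load, then round buf up to alignment */
    if ((rv = block_unaligned(buf)) != NULL) {
        return rv;
    }
    buf += 16 - ((uintptr_t)buf % 16);

    /* bulk: aligned loads only */
    for (; buf + 16 <= buf_end; buf += 16) {
        if ((rv = block_aligned(buf)) != NULL) {
            return rv;
        }
    }

    /* tail: overlapping unaligned load ending exactly at buf_end */
    if ((rv = block_unaligned(buf_end - 16)) != NULL) {
        return rv;
    }
    return buf_end; /* no match */
}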
- */ - -#include "config.h" -#include "ue2common.h" - -#include "multivermicelli.h" - -#include "multiaccel_common.h" - -#if !defined(__AVX2__) - -#define MATCH_ALGO long_ -#include "multiaccel_long.h" -#include "multivermicelli_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO longgrab_ -#include "multiaccel_longgrab.h" -#include "multivermicelli_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shift_ -#include "multiaccel_shift.h" -#include "multivermicelli_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shiftgrab_ -#include "multiaccel_shiftgrab.h" -#include "multivermicelli_sse.h" -#undef MATCH_ALGO - -#define MULTIACCEL_DOUBLE - -#define MATCH_ALGO doubleshift_ -#include "multiaccel_doubleshift.h" -#include "multivermicelli_sse.h" -#undef MATCH_ALGO - -#define MATCH_ALGO doubleshiftgrab_ -#include "multiaccel_doubleshiftgrab.h" -#include "multivermicelli_sse.h" -#undef MATCH_ALGO - -#undef MULTIACCEL_DOUBLE - -#else - -#define MATCH_ALGO long_ -#include "multiaccel_long.h" -#include "multivermicelli_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO longgrab_ -#include "multiaccel_longgrab.h" -#include "multivermicelli_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shift_ -#include "multiaccel_shift.h" -#include "multivermicelli_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO shiftgrab_ -#include "multiaccel_shiftgrab.h" -#include "multivermicelli_avx2.h" -#undef MATCH_ALGO - -#define MULTIACCEL_DOUBLE - -#define MATCH_ALGO doubleshift_ -#include "multiaccel_doubleshift.h" -#include "multivermicelli_avx2.h" -#undef MATCH_ALGO - -#define MATCH_ALGO doubleshiftgrab_ -#include "multiaccel_doubleshiftgrab.h" -#include "multivermicelli_avx2.h" -#undef MATCH_ALGO - -#undef MULTIACCEL_DOUBLE - -#endif diff --git a/src/nfa/multivermicelli_avx2.h b/src/nfa/multivermicelli_avx2.h deleted file mode 100644 index 9081aa3fc..000000000 --- a/src/nfa/multivermicelli_avx2.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "util/bitutils.h" -#include "util/simd_utils.h" -#include "util/unaligned.h" - -#include "multiaccel_common.h" - -static really_inline -const u8 *JOIN(MATCH_ALGO, vermUnalignNocase)(m256 chars, - const u8 *buf, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - m256 casemask = set32x8(CASE_CLEAR); - const u8 *ptr; - m256 data = loadu256(buf); - u32 z = movemask256(eq256(chars, and256(casemask, data))); - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (buf, z -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - return NULL; -} - -static really_inline -const u8 *JOIN(MATCH_ALGO, vermUnalign)(m256 chars, - const u8 *buf, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8 *ptr; - - m256 data = loadu256(buf); - u32 z = movemask256(eq256(chars, data)); - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (buf, z -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - return NULL; -} - -/* - * 32-byte pipeline - */ -static really_inline -const u8 *JOIN(MATCH_ALGO, vermPipeline)(m256 chars, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8* ptr, *last_buf; - u32 last_res; - - // pipeline prologue: scan first 32 bytes - m256 data = load256(buf); - u32 z = movemask256(eq256(chars, data)); - last_res = z; - last_buf = buf; - buf += 32; - - // now, start the pipeline! - assert((size_t)buf % 32 == 0); - for (; buf + 31 < buf_end; buf += 32) { - // scan more data - data = load256(buf); - z = movemask256(eq256(chars, data)); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_buf = buf; - last_res = z; - } - assert(buf <= buf_end && buf >= buf_end - 32); - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - return NULL; -} - -/* - * 32-byte caseless pipeline - */ -static really_inline -const u8 *JOIN(MATCH_ALGO, vermPipelineNocase)(m256 chars, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - m256 casemask = set32x8(CASE_CLEAR); - const u8* ptr, *last_buf; - u32 last_res; - - // pipeline prologue: scan first 32 bytes - m256 data = load256(buf); - u32 z = movemask256(eq256(chars, and256(casemask, data))); - last_res = z; - last_buf = buf; - buf += 32; - - - // now, start the pipeline! 
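Vermicelli is the simplest of these accelerators: splat the wanted character across a vector, compare for equality and take a movemask. A self-contained SSE2 sketch of a single 16-byte block (the real code hands the mask to run-length confirm functions rather than taking ctz directly); builds with GCC or Clang:

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

/* returns a pointer to the first occurrence of c in buf[0..15], or NULL */
static const uint8_t *verm_block16(uint8_t c, const uint8_t *buf) {
    __m128i chars = _mm_set1_epi8((char)c);
    __m128i data  = _mm_loadu_si128((const __m128i *)buf);
    int z = _mm_movemask_epi8(_mm_cmpeq_epi8(chars, data));
    return z ? buf + __builtin_ctz((unsigned)z) : NULL;
}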
- assert((size_t)buf % 32 == 0); - for (; buf + 31 < buf_end; buf += 32) { - // scan more data - data = load256(buf); - z = movemask256(eq256(chars, and256(casemask, data))); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_buf = buf; - last_res = z; - } - assert(buf <= buf_end && buf >= buf_end - 32); - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - return NULL; -} - -const u8 *JOIN(MATCH_ALGO, vermicelliExec)(char c, char nocase, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - const u8 *ptr; - - // Handle small scans. - if (buf_end - buf < 32) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf; - } - - m256 chars = set32x8(c); /* nocase already uppercase */ - - uintptr_t min = (uintptr_t)buf % 32; - - if (min) { - ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars, - buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ) : JOIN(MATCH_ALGO, vermUnalign)(chars, - buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - buf += 32 - min; - } - - if (buf_end - buf >= 32){ - ptr = nocase ? JOIN(MATCH_ALGO, vermPipelineNocase)(chars, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ) : JOIN(MATCH_ALGO, vermPipeline)(chars, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - } - - // final unaligned scan - ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars, - buf_end - 32, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ) : JOIN(MATCH_ALGO, vermUnalign)(chars, - buf_end - 32, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - - // run our pipeline - return ptr ? ptr : buf_end; -} diff --git a/src/nfa/multivermicelli_sse.h b/src/nfa/multivermicelli_sse.h deleted file mode 100644 index cdacd2c43..000000000 --- a/src/nfa/multivermicelli_sse.h +++ /dev/null @@ -1,452 +0,0 @@ -/* - * Copyright (c) 2015, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
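The nocase paths above lean on ASCII layout: the pattern byte is stored pre-uppercased and each data byte is ANDed with CASE_CLEAR, which folds 'a'..'z' onto 'A'..'Z' in a single operation. A scalar model, assuming the conventional value of the constant:

#include <stdint.h>

#define CASE_CLEAR_MODEL 0xdf /* assumed: clears the ASCII case bit 0x20 */

/* caseless compare as in the small-scan loops above; upper_c must already
 * be uppercase, and c should be a letter for the fold to be meaningful */
static inline int eq_nocase_model(uint8_t data, uint8_t upper_c) {
    return (uint8_t)(data & CASE_CLEAR_MODEL) == upper_c;
}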
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "util/bitutils.h" -#include "util/simd_utils.h" -#include "util/unaligned.h" - -#define VERM_BOUNDARY 16 -#define VERM_TYPE m128 -#define VERM_SET_FN set16x8 - -#include "multiaccel_common.h" - -static really_inline -const u8 *JOIN(MATCH_ALGO, vermUnalignNocase)(m128 chars, - const u8 *buf, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - m128 casemask = set16x8(CASE_CLEAR); - const u8 *ptr; - m128 data = loadu128(buf); - u32 z = movemask128(eq128(chars, and128(casemask, data))); - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (buf, z -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - return NULL; -} - -static really_inline -const u8 *JOIN(MATCH_ALGO, vermUnalign)(m128 chars, - const u8 *buf, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8 *ptr; - - m128 data = loadu128(buf); - u32 z = movemask128(eq128(chars, data)); - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (buf, z -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - return NULL; -} - -/* - * 16-byte pipeline, for smaller scans - */ -static -const u8 *JOIN(MATCH_ALGO, vermPipeline16)(m128 chars, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8* ptr, *last_buf; - u32 last_res; - - // pipeline prologue: scan first 16 bytes - m128 data = load128(buf); - u32 z = movemask128(eq128(chars, data)); - last_buf = buf; - last_res = z; - buf += 16; - - // now, start the pipeline! - assert((size_t)buf % 16 == 0); - for (; buf + 15 < buf_end; buf += 16) { - // scan more data - data = load128(buf); - z = movemask128(eq128(chars, data)); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_buf = buf; - last_res = z; - } - assert(buf <= buf_end && buf >= buf_end - 16); - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - return NULL; -} - -/* - * 16-byte pipeline, for smaller scans - */ -static -const u8 *JOIN(MATCH_ALGO, vermPipeline16Nocase)(m128 chars, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - m128 casemask = set16x8(CASE_CLEAR); - const u8* ptr, *last_buf; - u32 last_res; - - // pipeline prologue: scan first 16 bytes - m128 data = load128(buf); - u32 z = movemask128(eq128(chars, and128(casemask, data))); - last_buf = buf; - last_res = z; - buf += 16; - - // now, start the pipeline! 
- assert((size_t)buf % 16 == 0); - for (; buf + 15 < buf_end; buf += 16) { - // scan more data - data = load128(buf); - z = movemask128(eq128(chars, and128(casemask, data))); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_buf = buf; - last_res = z; - } - assert(buf <= buf_end && buf >= buf_end - 16); - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 32)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - return NULL; -} - -/* - * 32-byte pipeline, for bigger scans - */ -static -const u8 *JOIN(MATCH_ALGO, vermPipeline32)(m128 chars, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - const u8* ptr, *last_buf; - u32 res; - - // pipeline prologue: scan first 32 bytes - m128 data1 = load128(buf); - u32 z1 = movemask128(eq128(chars, data1)); - m128 data2 = load128(buf + 16); - u32 z2 = movemask128(eq128(chars, data2)); - - // store the results - u32 last_res = z1 | (z2 << VERM_BOUNDARY); - last_buf = buf; - buf += 32; - - - // now, start the pipeline! - assert((size_t)buf % 16 == 0); - for (; buf + 31 < buf_end; buf += 32) { - // scan more data - data1 = load128(buf); - z1 = movemask128(eq128(chars, data1)); - data2 = load128(buf + 16); - z2 = movemask128(eq128(chars, data2)); - res = z1 | (z2 << 16); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_res = res; - last_buf = buf; - } - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - // if we still have some data left, scan it too - if (buf + 15 < buf_end) { - return JOIN(MATCH_ALGO, vermPipeline16)(chars, buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - } - assert(buf <= buf_end && buf >= buf_end - 16); - - return NULL; -} - -/* - * 32-byte caseless pipeline, for bigger scans - */ -static -const u8 *JOIN(MATCH_ALGO, vermPipeline32Nocase)(m128 chars, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - m128 casemask = set16x8(CASE_CLEAR); - const u8* ptr, *last_buf; - u32 last_res; - - // pipeline prologue: scan first 32 bytes - m128 data1 = load128(buf); - u32 z1 = movemask128(eq128(chars, and128(casemask, data1))); - m128 data2 = load128(buf + 16); - u32 z2 = movemask128(eq128(chars, and128(casemask, data2))); - u32 z = z1 | (z2 << VERM_BOUNDARY); - - last_res = z; - last_buf = buf; - buf += 32; - - // now, start the pipeline! 
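The 32-byte pipelines halve the number of confirm calls by gluing two 16-byte movemasks into one 32-bit value, as in z1 | (z2 << 16) above; the confirm functions selected from the ..._64 tables then treat the pair as a single block. In isolation:

#include <stdint.h>

/* bit i of the result corresponds to buf[i] for i in 0..31 */
static inline uint32_t combine_masks16(uint16_t z1, uint16_t z2) {
    return (uint32_t)z1 | ((uint32_t)z2 << 16);
}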
- assert((size_t)buf % 16 == 0); - for (; buf + 31 < buf_end; buf += 32) { - // scan more data - data1 = load128(buf); - z1 = movemask128(eq128(chars, and128(casemask, data1))); - data2 = load128(buf + 16); - z2 = movemask128(eq128(chars, and128(casemask, data2))); - z = z1 | (z2 << 16); - - // do a comparison on previous result - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - last_res = z; - last_buf = buf; - } - - // epilogue: compare final results - ptr = (*JOIN4(MATCH_ALGO, match_funcs, _, 64)[run_len]) - (last_buf, last_res -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - - // if we still have some data left, scan it too - if (buf + 15 < buf_end) { - return JOIN(MATCH_ALGO, vermPipeline16Nocase)(chars, buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - } - assert(buf <= buf_end && buf >= buf_end - 16); - - return NULL; -} - -const u8 *JOIN(MATCH_ALGO, vermicelliExec)(char c, char nocase, - const u8 *buf, - const u8 *buf_end, - const u8 run_len -#ifdef MULTIACCEL_DOUBLE - , const u8 run_len2 -#endif - ) { - DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", - nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); - assert(buf < buf_end); - - const u8 *ptr; - - // Handle small scans. - if (buf_end - buf < VERM_BOUNDARY) { - for (; buf < buf_end; buf++) { - char cur = (char)*buf; - if (nocase) { - cur &= CASE_CLEAR; - } - if (cur == c) { - break; - } - } - return buf; - } - - VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ - - uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; - - if (min) { - ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars, - buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ) : JOIN(MATCH_ALGO, vermUnalign)(chars, - buf, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - buf += VERM_BOUNDARY - min; - } - - // if we have enough data, run bigger pipeline; otherwise run smaller one - if (buf_end - buf >= 128) { - ptr = nocase ? JOIN(MATCH_ALGO, vermPipeline32Nocase)(chars, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ) : JOIN(MATCH_ALGO, vermPipeline32)(chars, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - } else if (buf_end - buf >= 16){ - ptr = nocase ? JOIN(MATCH_ALGO, vermPipeline16Nocase)(chars, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ) : JOIN(MATCH_ALGO, vermPipeline16)(chars, - buf, buf_end, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - if (unlikely(ptr)) { - return ptr; - } - } - - // final unaligned scan - ptr = nocase ? JOIN(MATCH_ALGO, vermUnalignNocase)(chars, - buf_end - VERM_BOUNDARY, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ) : JOIN(MATCH_ALGO, vermUnalign)(chars, - buf_end - VERM_BOUNDARY, run_len -#ifdef MULTIACCEL_DOUBLE - , run_len2 -#endif - ); - - // run our pipeline - return ptr ? 
ptr : buf_end; -} diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 3103cd297..9185ccdd7 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -401,7 +401,7 @@ const char *NFATraits<SHENG_NFA>::name = "Sheng"; template<> struct NFATraits { UNUSED static const char *name; static const NFACategory category = NFA_OTHER; - static const u32 stateAlign = 32; + static const u32 stateAlign = 64; static const bool fast = true; static const nfa_dispatch_fn has_accel; static const nfa_dispatch_fn has_repeats; diff --git a/src/nfa/rdfa_merge.cpp b/src/nfa/rdfa_merge.cpp index 45457555c..50e9b62a0 100644 --- a/src/nfa/rdfa_merge.cpp +++ b/src/nfa/rdfa_merge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,7 @@ #include "util/report_manager.h" #include "util/ue2_containers.h" +#include <algorithm> #include <queue> using namespace std; @@ -135,6 +136,10 @@ class Automaton_Merge { } } + // Sort so that our alphabet mapping isn't dependent on the order of + // rdfas passed in. + sort(esets.begin(), esets.end()); + alphasize = buildAlphabetFromEquivSets(esets, alpha, unalpha); } diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index fc3e54aa8..9552fe15d 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -58,7 +58,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, while (likely(cur_buf != end)) { const u8 c = *cur_buf; const m128 shuffle_mask = masks[c]; - cur_state = pshufb(shuffle_mask, cur_state); + cur_state = pshufb_m128(shuffle_mask, cur_state); const u8 tmp = movd(cur_state); DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ?
c : '?'); diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index 2561e52d3..740322010 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -100,19 +100,19 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, const u8 c4 = *b4; const m128 shuffle_mask1 = masks[c1]; - cur_state = pshufb(shuffle_mask1, cur_state); + cur_state = pshufb_m128(shuffle_mask1, cur_state); const u8 a1 = movd(cur_state); const m128 shuffle_mask2 = masks[c2]; - cur_state = pshufb(shuffle_mask2, cur_state); + cur_state = pshufb_m128(shuffle_mask2, cur_state); const u8 a2 = movd(cur_state); const m128 shuffle_mask3 = masks[c3]; - cur_state = pshufb(shuffle_mask3, cur_state); + cur_state = pshufb_m128(shuffle_mask3, cur_state); const u8 a3 = movd(cur_state); const m128 shuffle_mask4 = masks[c4]; - cur_state = pshufb(shuffle_mask4, cur_state); + cur_state = pshufb_m128(shuffle_mask4, cur_state); const u8 a4 = movd(cur_state); DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp index 53f2c1318..c4094cedc 100644 --- a/src/nfa/shengcompile.cpp +++ b/src/nfa/shengcompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -450,16 +450,15 @@ bool has_accel_sheng(const NFA *) { return true; /* consider the sheng region as accelerated */ } -aligned_unique_ptr<NFA> shengCompile(raw_dfa &raw, - const CompileContext &cc, - const ReportManager &rm, - set<dstate_id_t> *accel_states) { +bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + set<dstate_id_t> *accel_states) { if (!cc.grey.allowSheng) { DEBUG_PRINTF("Sheng is not allowed!\n"); return nullptr; } - sheng_build_strat strat(raw, rm); + sheng_build_strat strat(raw, rm, only_accel_init); dfa_info info(strat); DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); @@ -508,7 +507,7 @@ aligned_unique_ptr<NFA> shengCompile(raw_dfa &raw, DEBUG_PRINTF("NFA: %u, aux: %u, reports: %u, accel: %u, total: %u\n", nfa_size, total_aux, total_reports, total_accel, total_size); - aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size); + auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size); populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, reports_offset, accel_offset, total_size, total_size - sizeof(NFA)); diff --git a/src/nfa/shengcompile.h b/src/nfa/shengcompile.h index 873b7c758..9885cd16f 100644 --- a/src/nfa/shengcompile.h +++ b/src/nfa/shengcompile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,12 +26,12 @@ * POSSIBILITY OF SUCH DAMAGE.
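The sheng hunks above are part of this release's renaming of SIMD helpers (pshufb to pshufb_m128 and friends). The underlying trick is unchanged: with at most 16 DFA states, the transition-table row for an input byte fits in one 128-bit register, so PSHUFB performs next = table[byte][state] in a single instruction. A scalar model of one step and of the 4-byte unroll in sheng_impl4.h:

#include <stdint.h>

/* masks[c] models the 16-entry shuffle table for input byte c; like
 * pshufb, only the low 4 bits of the state select an entry. */
static inline uint8_t sheng_step(const uint8_t masks[256][16],
                                 uint8_t state, uint8_t c) {
    return masks[c][state & 0xf];
}

/* 4-way unroll: still one dependent chain of table lookups, but the byte
 * loads and the per-step accept checks can overlap it. */
static inline uint8_t sheng_step4(const uint8_t masks[256][16],
                                  uint8_t state, const uint8_t *p) {
    state = sheng_step(masks, state, p[0]);
    state = sheng_step(masks, state, p[1]);
    state = sheng_step(masks, state, p[2]);
    state = sheng_step(masks, state, p[3]);
    return state;
}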
*/ -#ifndef SHENGCOMPILE_H_ -#define SHENGCOMPILE_H_ +#ifndef SHENGCOMPILE_H +#define SHENGCOMPILE_H #include "accel_dfa_build_strat.h" #include "rdfa.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/charreach.h" #include "util/ue2_containers.h" @@ -45,8 +45,9 @@ struct raw_dfa; class sheng_build_strat : public accel_dfa_build_strat { public: - sheng_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in) - : accel_dfa_build_strat(rm_in), rdfa(rdfa_in) {} + sheng_build_strat(raw_dfa &rdfa_in, const ReportManager &rm_in, + bool only_accel_init_in) + : accel_dfa_build_strat(rm_in, only_accel_init_in), rdfa(rdfa_in) {} raw_dfa &get_raw() const override { return rdfa; } std::unique_ptr<raw_report_info> gatherReports( std::vector<u32> &reports /* out */, @@ -62,9 +63,9 @@ class sheng_build_strat : public accel_dfa_build_strat { raw_dfa &rdfa; }; -aligned_unique_ptr<NFA> -shengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm, - std::set<dstate_id_t> *accel_states = nullptr); +bytecode_ptr<NFA> shengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + std::set<dstate_id_t> *accel_states = nullptr); struct sheng_escape_info { CharReach outs; @@ -77,4 +78,4 @@ bool has_accel_sheng(const NFA *nfa); } // namespace ue2 -#endif /* SHENGCOMPILE_H_ */ +#endif /* SHENGCOMPILE_H */ diff --git a/src/nfa/shufti.c b/src/nfa/shufti.c index d68b1b047..09ffc0cf9 100644 --- a/src/nfa/shufti.c +++ b/src/nfa/shufti.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,11 +34,57 @@ #include "shufti.h" #include "ue2common.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/simd_utils.h" #include "util/unaligned.h" -#include "shufti_common.h" +#ifdef DEBUG +#include <ctype.h> + +#define DUMP_MSK(_t) \ +static UNUSED \ +void dumpMsk##_t(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7-j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ +} \ +static UNUSED \ +void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c",c); \ + else \ + printf("."); \ + } \ +} + +#endif + +/** \brief Naive byte-by-byte implementation. */ +static really_inline +const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + for (; buf < buf_end; ++buf) { + u8 c = *buf; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf; +} /** \brief Naive byte-by-byte implementation.
*/ static really_inline @@ -55,9 +101,33 @@ const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, return buf_end; } -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) /* Normal SSSE3 shufti */ +#ifdef DEBUG +DUMP_MSK(128) +#endif + +#define GET_LO_4(chars) and128(chars, low4bits) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) + +static really_inline +u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, + const m128 compare) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + return movemask128(eq128(t, compare)); +} + static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { if (unlikely(z != 0xffff)) { @@ -149,8 +219,8 @@ const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { static really_inline const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, const m128 low4bits, const m128 zeroes) { - m128 c_lo = pshufb(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb(mask_hi, GET_HI_4(chars)); + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); m128 t = and128(c_lo, c_hi); #ifdef DEBUG @@ -219,8 +289,8 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, const m128 ones) { m128 chars_lo = GET_LO_4(chars); m128 chars_hi = GET_HI_4(chars); - m128 c_lo = pshufb(mask1_lo, chars_lo); - m128 c_hi = pshufb(mask1_hi, chars_hi); + m128 c_lo = pshufb_m128(mask1_lo, chars_lo); + m128 c_hi = pshufb_m128(mask1_hi, chars_hi); m128 t = or128(c_lo, c_hi); #ifdef DEBUG @@ -231,8 +301,8 @@ const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); #endif - m128 c2_lo = pshufb(mask2_lo, chars_lo); - m128 c2_hi = pshufb(mask2_hi, chars_hi); + m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); + m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); #ifdef DEBUG @@ -290,13 +360,41 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, return buf_end; } -#else // AVX2 - 256 wide shuftis +#elif !defined(HAVE_AVX512) +// AVX2 - 256 wide shuftis + +#ifdef DEBUG +DUMP_MSK(256) +#endif + +#define GET_LO_4(chars) and256(chars, low4bits) +#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) + +static really_inline +u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, + const m256 compare) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + return movemask256(eq256(t, compare)); +} static really_inline const u8 *firstMatch(const u8 *buf, u32 z) { + DEBUG_PRINTF("z 0x%08x\n", z); if (unlikely(z != 0xffffffff)) { u32 pos = ctz32(~z); assert(pos < 32); + DEBUG_PRINTF("match @ pos %u\n", pos); return 
buf + pos; } else { return NULL; // no match @@ -309,7 +407,7 @@ const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); - m256 c_shuf = vpshufb(mask, c); + m256 c_shuf = pshufb_m256(mask, c); m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); // the upper 32-bits can't match u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); @@ -418,8 +516,8 @@ const u8 *lastMatch(const u8 *buf, u32 z) { static really_inline const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, const m256 low4bits, const m256 zeroes) { - m256 c_lo = vpshufb(mask_lo, GET_LO_4(chars)); - m256 c_hi = vpshufb(mask_hi, GET_HI_4(chars)); + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); m256 t = and256(c_lo, c_hi); #ifdef DEBUG @@ -440,7 +538,7 @@ const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); - m256 c_shuf = vpshufb(mask, c); + m256 c_shuf = pshufb_m256(mask, c); m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); // the upper 32-bits can't match u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); @@ -532,8 +630,8 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, DEBUG_PRINTF("buf %p\n", buf); m256 chars_lo = GET_LO_4(chars); m256 chars_hi = GET_HI_4(chars); - m256 c_lo = vpshufb(mask1_lo, chars_lo); - m256 c_hi = vpshufb(mask1_hi, chars_hi); + m256 c_lo = pshufb_m256(mask1_lo, chars_lo); + m256 c_hi = pshufb_m256(mask1_hi, chars_hi); m256 t = or256(c_lo, c_hi); #ifdef DEBUG @@ -544,8 +642,8 @@ const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); #endif - m256 c2_lo = vpshufb(mask2_lo, chars_lo); - m256 c2_hi = vpshufb(mask2_hi, chars_hi); + m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); + m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); #ifdef DEBUG @@ -564,8 +662,8 @@ const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, // do the hi and lo shuffles in the one avx register m256 c = combine2x128(rshift64_m128(chars, 4), chars); c = and256(c, low4bits); - m256 c_shuf1 = vpshufb(mask1, c); - m256 c_shuf2 = rshift128_m256(vpshufb(mask2, c), 1); + m256 c_shuf1 = pshufb_m256(mask1, c); + m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1); m256 t0 = or256(c_shuf1, c_shuf2); m128 t = or128(movdq_hi(t0), cast256to128(t0)); // the upper 32-bits can't match @@ -602,6 +700,7 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, const u8 *buf, const u8 *buf_end) { /* we should always have at least 16 bytes */ assert(buf_end - buf >= 16); + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); if (buf_end - buf < 32) { return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, @@ -652,4 +751,347 @@ const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, return buf_end; } -#endif //AVX2 +#else // defined(HAVE_AVX512) + +#ifdef DEBUG +DUMP_MSK(512) +#endif + +static really_inline +u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, + const m512 compare) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); + 
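A convention worth noting in the rewritten shufti.c: block() compares the shuffled result against zero, so the returned mask has a bit set for every byte that does not match; all-ones means no match in the block, and the firstMatch helpers take a trailing-zero count of the inverted mask. In scalar form:

#include <stdint.h>

/* z uses the inverted convention (bit set == no match at that offset);
 * all_ones is 0xffff, 0xffffffff or ~0ULL depending on vector width */
static inline int first_match_pos(uint64_t z, uint64_t all_ones) {
    return (z != all_ones) ? (int)__builtin_ctzll(~z) : -1; /* -1: none */
}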
+#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + return eq512mask(t, compare); +} +static really_inline +const u8 *firstMatch64(const u8 *buf, u64a z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = ctz64(~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 64); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch64(buf, z); +} + +static really_inline +const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { + DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + // load mask + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_maskz_m512(k, buf); + + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + // reuse the load mask to indicate valid bytes + return firstMatch64(buf, z | ~k); +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const m512 low4bits = set64x8(0xf); + const m512 zeroes = zeroes512(); + const m512 wide_mask_lo = set4x128(mask_lo); + const m512 wide_mask_hi = set4x128(mask_hi); + const u8 *rv; + + // small cases. + if (buf_end - buf <= 64) { + rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, + zeroes); + return rv ? rv : buf_end; + } + + assert(buf_end - buf >= 64); + + // Preconditioning: most of the time our buffer won't be aligned. + if ((uintptr_t)buf % 64) { + rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, + ROUNDUP_PTR(buf, 64), low4bits, zeroes); + if (rv) { + return rv; + } + buf = ROUNDUP_PTR(buf, 64); + } + + const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64); + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, + zeroes); + if (rv) { + return rv; + } + buf += 64; + } + + if (buf == buf_end) { + goto done; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. 
+ assert(buf <= buf_end && buf >= buf_end - 64); + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits, + zeroes); + if (rv) { + return rv; + } +done: + return buf_end; +} + +static really_inline +const u8 *lastMatch64(const u8 *buf, u64a z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = clz64(~z); + DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); + return buf + (63 - pos); + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { + DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + // load mask + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_maskz_m512(k, buf); + + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + // reuse the load mask to indicate valid bytes + return lastMatch64(buf, z | ~k); +} + +static really_inline +const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + u64a z = eq512mask(t, zeroes); + return lastMatch64(buf, z); +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end); + assert(buf && buf_end); + assert(buf < buf_end); + + const m512 low4bits = set64x8(0xf); + const m512 zeroes = zeroes512(); + const m512 wide_mask_lo = set4x128(mask_lo); + const m512 wide_mask_hi = set4x128(mask_hi); + const u8 *rv; + + if (buf_end - buf < 64) { + rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, + zeroes); + return rv ? rv : buf - 1; + } + + if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) { + // peel off unaligned portion + assert(buf_end - buf >= 64); + DEBUG_PRINTF("start\n"); + rv = rshortShufti512(wide_mask_lo, wide_mask_hi, + ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits, + zeroes); + if (rv) { + return rv; + } + buf_end = ROUNDDOWN_PTR(buf_end, 64); + } + + const u8 *last_block = ROUNDUP_PTR(buf, 64); + while (buf_end > last_block) { + buf_end -= 64; + m512 lchars = load512(buf_end); + rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, + zeroes); + if (rv) { + return rv; + } + } + if (buf_end == buf) { + goto done; + } + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf. 
+ m512 chars = loadu512(buf); + rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } +done: + return buf - 1; +} + +static really_inline +const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, + m512 chars, const u8 *buf, const m512 low4bits, + const m512 ones, __mmask64 k) { + DEBUG_PRINTF("buf %p %.64s\n", buf, buf); + m512 chars_lo = and512(chars, low4bits); + m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); + m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); + m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); + m512 t = or512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); + m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); + m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); + +#ifdef DEBUG + DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); +#endif + u64a z = eq512mask(t2, ones); + + return firstMatch64(buf, z | ~k); +} + +static really_inline +const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, + m512 mask2_hi, const u8 *buf, const u8 *buf_end, + const m512 low4bits, const m512 ones) { + DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_mask_m512(ones, k, buf); + + const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, + low4bits, ones, k); + + return rv; +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + /* we should always have at least 16 bytes */ + assert(buf_end - buf >= 16); + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + + const m512 ones = ones512(); + const m512 low4bits = set64x8(0xf); + const m512 wide_mask1_lo = set4x128(mask1_lo); + const m512 wide_mask1_hi = set4x128(mask1_hi); + const m512 wide_mask2_lo = set4x128(mask2_lo); + const m512 wide_mask2_hi = set4x128(mask2_hi); + const u8 *rv; + + if (buf_end - buf <= 64) { + rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, buf, buf_end, low4bits, ones); + DEBUG_PRINTF("rv %p\n", rv); + return rv ? rv : buf_end; + } + + // Preconditioning: most of the time our buffer won't be aligned. + if ((uintptr_t)buf % 64) { + rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64), + low4bits, ones); + if (rv) { + return rv; + } + + buf = ROUNDUP_PTR(buf, 64); + } + + const u8 *last_block = buf_end - 64; + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, lchars, buf, low4bits, ones, ~0); + if (rv) { + return rv; + } + buf += 64; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. 
+ m512 chars = loadu512(buf_end - 64); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + chars, buf_end - 64, low4bits, ones, ~0); + if (rv) { + return rv; + } + + return buf_end; +} +#endif diff --git a/src/nfa/shufti_common.h b/src/nfa/shufti_common.h deleted file mode 100644 index e63ad27af..000000000 --- a/src/nfa/shufti_common.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef SHUFTI_COMMON_H_ -#define SHUFTI_COMMON_H_ - -#include "ue2common.h" - -#include "util/bitutils.h" -#include "util/simd_utils.h" -#include "util/unaligned.h" - -/* - * Common stuff for all versions of shufti (single, multi and multidouble) - */ - -/** \brief Naive byte-by-byte implementation. 
*/ -static really_inline -const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, - const u8 *buf_end) { - assert(buf < buf_end); - - for (; buf < buf_end; ++buf) { - u8 c = *buf; - if (lo[c & 0xf] & hi[c >> 4]) { - break; - } - } - return buf; -} - -#ifdef DEBUG -#include - -#define DUMP_MSK(_t) \ -static UNUSED \ -void dumpMsk##_t(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - for (int j = 0; j < 8; j++) { \ - if ((c >> (7-j)) & 0x1) \ - printf("1"); \ - else \ - printf("0"); \ - } \ - printf(" "); \ - } \ -} \ -static UNUSED \ -void dumpMsk##_t##AsChars(m##_t msk) { \ - u8 * mskAsU8 = (u8 *)&msk; \ - for (unsigned i = 0; i < sizeof(msk); i++) { \ - u8 c = mskAsU8[i]; \ - if (isprint(c)) \ - printf("%c",c); \ - else \ - printf("."); \ - } \ -} - -#endif - -#if !defined(__AVX2__) - -#ifdef DEBUG -DUMP_MSK(128) -#endif - -#define GET_LO_4(chars) and128(chars, low4bits) -#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) - -static really_inline -u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, - const m128 compare) { - m128 c_lo = pshufb(mask_lo, GET_LO_4(chars)); - m128 c_hi = pshufb(mask_hi, GET_HI_4(chars)); - m128 t = and128(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); -#endif - return movemask128(eq128(t, compare)); -} - -#else - -#ifdef DEBUG -DUMP_MSK(256) -#endif - -#define GET_LO_4(chars) and256(chars, low4bits) -#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) - -static really_inline -u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, - const m256 compare) { - m256 c_lo = vpshufb(mask_lo, GET_LO_4(chars)); - m256 c_hi = vpshufb(mask_hi, GET_HI_4(chars)); - m256 t = and256(c_lo, c_hi); - -#ifdef DEBUG - DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); - DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); - DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); - DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); - DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); -#endif - - return movemask256(eq256(t, compare)); -} - -#endif - - -#endif /* SHUFTI_COMMON_H_ */ diff --git a/src/nfa/tamaramacompile.cpp b/src/nfa/tamaramacompile.cpp index c28caacbe..1a6e8beff 100644 --- a/src/nfa/tamaramacompile.cpp +++ b/src/nfa/tamaramacompile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,9 +26,9 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file - * \brief Tamarama: container engine for exclusive engines, - * compiler code. +/** + * \file + * \brief Tamarama: container engine for exclusive engines, compiler code. */ #include "config.h" @@ -111,8 +111,9 @@ void copyInSubnfas(const char *base_offset, NFA &nfa, * returns via out_top_remap, a mapping indicating how tops in the subengines in * relate to the tamarama's tops. 
 */
-aligned_unique_ptr<NFA> buildTamarama(const TamaInfo &tamaInfo, const u32 queue,
-                                      map<pair<const NFA *, u32>, u32> &out_top_remap) {
+bytecode_ptr<NFA>
+buildTamarama(const TamaInfo &tamaInfo, const u32 queue,
+              map<pair<const NFA *, u32>, u32> &out_top_remap) {
     vector<u32> top_base;
     remapTops(tamaInfo, top_base, out_top_remap);
@@ -133,7 +134,7 @@ aligned_unique_ptr<NFA> buildTamarama(const TamaInfo &tamaInfo, const u32 queue,
     // use subSize as a sentinel value for no active subengines,
     // so add one to subSize here
     u32 activeIdxSize = calcPackedBytes(subSize + 1);
-    aligned_unique_ptr<NFA> nfa = aligned_zmalloc_unique<NFA>(total_size);
+    auto nfa = make_zeroed_bytecode_ptr<NFA>(total_size);
     nfa->type = verify_u8(TAMARAMA_NFA);
     nfa->length = verify_u32(total_size);
     nfa->queueIndex = queue;
@@ -148,7 +149,7 @@ aligned_unique_ptr<NFA> buildTamarama(const TamaInfo &tamaInfo, const u32 queue,
     copy_bytes(ptr, top_base);
     ptr += byte_length(top_base);
-    u32 *offsets = (u32*)ptr;
+    u32 *offsets = (u32 *)ptr;
     char *sub_nfa_offset = ptr + sizeof(u32) * subSize;
     copyInSubnfas(base_offset, *nfa, tamaInfo, offsets, sub_nfa_offset,
                   activeIdxSize);
diff --git a/src/nfa/tamaramacompile.h b/src/nfa/tamaramacompile.h
index 048b966b8..7fcea3ec8 100644
--- a/src/nfa/tamaramacompile.h
+++ b/src/nfa/tamaramacompile.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -26,15 +26,16 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
-/** \file
- * \brief Tamarama: container engine for exclusive engines, compiler code.
+/**
+ * \file
+ * \brief Tamarama: container engine for exclusive engines, compiler code.
  */
 #ifndef NFA_TAMARAMACOMPILE_H
 #define NFA_TAMARAMACOMPILE_H
 #include "ue2common.h"
-#include "util/alloc.h"
+#include "util/bytecode_ptr.h"
 #include <map>
 #include <set>
@@ -45,7 +46,7 @@
 struct NFA;
 namespace ue2 {
 /**
- * \brief A TamaProto that contains top remapping and reports info
+ * \brief A TamaProto that contains top remapping and reports info.
  */
 struct TamaProto {
     void add(const NFA *n, const u32 id, const u32 top,
@@ -59,7 +60,7 @@ struct TamaProto {
 };
 /**
- * \brief Contruction info for a Tamarama engine:
+ * \brief Construction info for a Tamarama engine:
  *  contains at least two subengines.
  *
  * A TamaInfo is converted into a single NFA, with each top triggering a
@@ -70,7 +71,7 @@ struct TamaInfo {
     static constexpr size_t max_occupancy = 65536; // arbitrary limit
     /** \brief Add a new subengine. */
-    void add(NFA* sub, const std::set<u32> &top);
+    void add(NFA *sub, const std::set<u32> &top);
     /** \brief All the subengines */
     std::vector<NFA *> subengines;
@@ -86,9 +87,10 @@ std::set<ReportID> all_reports(const TamaProto &proto);
  * returns via out_top_remap, a mapping indicating how tops in the subengines in
  * relate to the tamarama's tops.
  */
-ue2::aligned_unique_ptr<NFA> buildTamarama(const TamaInfo &tamaInfo,
-                                           const u32 queue,
-                                           std::map<std::pair<const NFA *, u32>, u32> &out_top_remap);
+bytecode_ptr<NFA>
+buildTamarama(const TamaInfo &tamaInfo, const u32 queue,
+              std::map<std::pair<const NFA *, u32>, u32> &out_top_remap);
+
 } // namespace ue2
 #endif // NFA_TAMARAMACOMPILE_H
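The Tamarama bytecode stores the index of the currently active subengine in a packed field; the `calcPackedBytes(subSize + 1)` call above sizes that field so that `subSize` itself can act as the "no active subengine" sentinel. A minimal re-derivation of that sizing computation (the helper below is a sketch, not the real `calcPackedBytes`):

```cpp
#include <cassert>
#include <cstdint>

// Values in [0, max_value] need ceil(bits(max_value) / 8) bytes; the builder
// passes subSize + 1 so the sentinel value fits too.
static uint32_t packedBytes(uint32_t max_value) {
    uint32_t bits = 0;
    while ((1ULL << bits) <= max_value) {
        bits++; // count bits needed to represent max_value
    }
    return (bits + 7) / 8; // round up to whole bytes
}

int main() {
    assert(packedBytes(2) == 1);   // two subengines + sentinel: one byte
    assert(packedBytes(255) == 1); // still fits in a byte
    assert(packedBytes(256) == 2); // 257 distinct values need two bytes
    return 0;
}
```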
diff --git a/src/nfa/truffle.c b/src/nfa/truffle.c
index 1eff269ab..be6b312cf 100644
--- a/src/nfa/truffle.c
+++ b/src/nfa/truffle.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -33,12 +33,11 @@
 #include "ue2common.h"
 #include "truffle.h"
+#include "util/arch.h"
 #include "util/bitutils.h"
 #include "util/simd_utils.h"
 
-#include "truffle_common.h"
-
-#if !defined(__AVX2__)
+#if !defined(HAVE_AVX2)
 
 static really_inline
 const u8 *lastMatch(const u8 *buf, u32 z) {
@@ -51,6 +50,57 @@ const u8 *lastMatch(const u8 *buf, u32 z) {
     return NULL; // no match
 }
 
+static really_inline
+const u8 *firstMatch(const u8 *buf, u32 z) {
+    if (unlikely(z != 0xffff)) {
+        u32 pos = ctz32(~z & 0xffff);
+        assert(pos < 16);
+        return buf + pos;
+    }
+
+    return NULL; // no match
+}
+
+static really_inline
+u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) {
+
+    m128 highconst = _mm_set1_epi8(0x80);
+    m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201);
+
+    // and now do the real work
+    m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v);
+    m128 t1 = xor128(v, highconst);
+    m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1);
+    m128 t2 = andnot128(highconst, rshift64_m128(v, 4));
+    m128 shuf3 = pshufb_m128(shuf_mask_hi, t2);
+    m128 tmp = and128(or128(shuf1, shuf2), shuf3);
+    m128 tmp2 = eq128(tmp, zeroes128());
+    u32 z = movemask128(tmp2);
+
+    return z;
+}
+
+static
+const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
+                      const u8 *buf, const u8 *buf_end) {
+    uintptr_t len = buf_end - buf;
+    assert(len < 16);
+
+    m128 chars = zeroes128();
+    memcpy(&chars, buf, len);
+
+    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
+    // can't be these bytes in z
+    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
+    const u8 *rv = firstMatch(buf, z | mask);
+
+    if (rv) {
+        return rv;
+    } else {
+        return buf_end;
+    }
+}
+
 static really_inline
 const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset,
                    m128 v, const u8 *buf) {
@@ -124,7 +174,7 @@ const u8 *truffleRevMini(m128 shuf_mask_lo_highclear,
     m128 chars = zeroes128();
     memcpy(&chars, buf, len);
 
-    u32 mask = (0xFFFF >> (16 - len)) ^ 0xFFFF;
+    u32 mask = (0xffff >> (16 - len)) ^ 0xffff;
     u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
     const u8 *rv = lastMatch(buf, z | mask);
 
@@ -181,7 +231,9 @@ const u8 *rtruffleExec(m128 shuf_mask_lo_highclear,
     return buf - 1;
 }
 
-#else
+#elif !defined(HAVE_AVX512)
+
+// AVX2
 
 static really_inline
 const u8 *lastMatch(const u8 *buf, u32 z) {
@@ -194,6 +246,57 @@ const u8 *lastMatch(const u8 *buf, u32 z) {
     return NULL; // no match
 }
 
+static really_inline
+const u8 *firstMatch(const u8 *buf, u32 z) {
+    if (unlikely(z != 0xffffffff)) {
+        u32 pos = ctz32(~z);
+        assert(pos < 32);
+        return buf + pos;
+    }
+
+    return NULL; // no match
+}
+
+static really_inline
+u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) {
+
+    m256 highconst = _mm256_set1_epi8(0x80);
+    m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201);
+
+    // and now do the real work
+
m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v); + m256 t1 = xor256(v, highconst); + m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1); + m256 t2 = andnot256(highconst, rshift64_m256(v, 4)); + m256 shuf3 = pshufb_m256(shuf_mask_hi, t2); + m256 tmp = and256(or256(shuf1, shuf2), shuf3); + m256 tmp2 = eq256(tmp, zeroes256()); + u32 z = movemask256(tmp2); + + return z; +} + +static +const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 32); + + m256 chars = zeroes256(); + memcpy(&chars, buf, len); + + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + // can't be these bytes in z + u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; + const u8 *rv = firstMatch(buf, z | mask); + + if (rv) { + return rv; + } else { + return buf_end; + } +} + static really_inline const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v, const u8 *buf) { @@ -265,7 +368,7 @@ const u8 *truffleRevMini(m256 shuf_mask_lo_highclear, m256 chars = zeroes256(); memcpy(&chars, buf, len); - u32 mask = (0xFFFFFFFF >> (32 - len)) ^ 0xFFFFFFFF; + u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); const u8 *rv = lastMatch(buf, z | mask); @@ -322,4 +425,184 @@ const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, return buf - 1; } +#else // AVX512 + +static really_inline +const u8 *lastMatch(const u8 *buf, u64a z) { + if (unlikely(z != ~0ULL)) { + u64a pos = clz64(~z); + assert(pos < 64); + return buf + (63 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u64a z) { + if (unlikely(z != ~0ULL)) { + u64a pos = ctz64(~z); + assert(pos < 64); + DEBUG_PRINTF("pos %llu\n", pos); + return buf + pos; + } + + return NULL; // no match +} + +static really_inline +u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) { + m512 highconst = set64x8(0x80); + m512 shuf_mask_hi = set8x64(0x8040201008040201); + + // and now do the real work + m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v); + m512 t1 = xor512(v, highconst); + m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1); + m512 t2 = andnot512(highconst, rshift64_m512(v, 4)); + m512 shuf3 = pshufb_m512(shuf_mask_hi, t2); + m512 tmp = and512(or512(shuf1, shuf2), shuf3); + u64a z = eq512mask(tmp, zeroes512()); + + return z; +} + +static really_inline +const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len <= 64); + + __mmask64 mask = (~0ULL) >> (64 - len); + + m512 chars = loadu_maskz_m512(mask, buf); + + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + + const u8 *rv = firstMatch(buf, z | ~mask); + + return rv; +} + +static really_inline +const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + m512 v, const u8 *buf) { + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 *revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + m512 v, const u8 *buf) { + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + const m512 wide_clear = 
set4x128(shuf_mask_lo_highclear); + const m512 wide_set = set4x128(shuf_mask_lo_highset); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf <= 64) { + rv = truffleMini(wide_clear, wide_set, buf, buf_end); + return rv ? rv : buf_end; + } + + assert(buf_end - buf >= 64); + if ((uintptr_t)buf % 64) { + // Preconditioning: most of the time our buffer won't be aligned. + rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64)); + if (rv) { + return rv; + } + buf = ROUNDUP_PTR(buf, 64); + } + const u8 *last_block = buf_end - 64; + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock(wide_clear, wide_set, lchars, buf); + if (rv) { + return rv; + } + buf += 64; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 64); + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64); + if (rv) { + return rv; + } + return buf_end; +} + +static really_inline +const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 64); + + __mmask64 mask = (~0ULL) >> (64 - len); + m512 chars = loadu_maskz_m512(mask, buf); + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z); + const u8 *rv = lastMatch(buf, z | ~mask); + + if (rv) { + return rv; + } + return buf - 1; +} + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + const m512 wide_clear = set4x128(shuf_mask_lo_highclear); + const m512 wide_set = set4x128(shuf_mask_lo_highset); + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 64) { + return truffleRevMini(wide_clear, wide_set, buf, buf_end); + } + + assert(buf_end - buf >= 64); + + // Preconditioning: most of the time our buffer won't be aligned. + m512 chars = loadu512(buf_end - 64); + rv = revBlock(wide_clear, wide_set, chars, buf_end - 64); + if (rv) { + return rv; + } + buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64); + + const u8 *last_block = buf + 64; + while (buf_end > last_block) { + buf_end -= 64; + m512 lchars = load512(buf_end); + rv = revBlock(wide_clear, wide_set, lchars, buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + chars = loadu512(buf); + rv = revBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + return buf - 1; +} + #endif diff --git a/src/nfa/truffle_common.h b/src/nfa/truffle_common.h deleted file mode 100644 index 7368e550d..000000000 --- a/src/nfa/truffle_common.h +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef TRUFFLE_COMMON_H_ -#define TRUFFLE_COMMON_H_ - -#include "util/bitutils.h" -#include "util/simd_utils.h" - -/* - * Common stuff for all versions of truffle (single, multi and multidouble) - */ -#if !defined(__AVX2__) - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffff)) { - u32 pos = ctz32(~z & 0xffff); - assert(pos < 16); - return buf + pos; - } - - return NULL; // no match -} - -static really_inline -u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { - - m128 highconst = _mm_set1_epi8(0x80); - m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); - - // and now do the real work - m128 shuf1 = pshufb(shuf_mask_lo_highclear, v); - m128 t1 = xor128(v, highconst); - m128 shuf2 = pshufb(shuf_mask_lo_highset, t1); - m128 t2 = andnot128(highconst, rshift64_m128(v, 4)); - m128 shuf3 = pshufb(shuf_mask_hi, t2); - m128 tmp = and128(or128(shuf1, shuf2), shuf3); - m128 tmp2 = eq128(tmp, zeroes128()); - u32 z = movemask128(tmp2); - - return z; -} - -static -const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 16); - - m128 chars = zeroes128(); - memcpy(&chars, buf, len); - - u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); - // can't be these bytes in z - u32 mask = (0xFFFF >> (16 - len)) ^ 0xFFFF; - const u8 *rv = firstMatch(buf, z| mask); - - if (rv) { - return rv; - } else { - return buf_end; - } -} - -#else - -static really_inline -const u8 *firstMatch(const u8 *buf, u32 z) { - if (unlikely(z != 0xffffffff)) { - u32 pos = ctz32(~z); - assert(pos < 32); - return buf + pos; - } - - return NULL; // no match -} - -static really_inline -u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { - - m256 highconst = _mm256_set1_epi8(0x80); - m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); - - // and now do the real work - m256 shuf1 = vpshufb(shuf_mask_lo_highclear, v); - m256 t1 = xor256(v, highconst); - m256 shuf2 = vpshufb(shuf_mask_lo_highset, t1); - m256 t2 = andnot256(highconst, rshift64_m256(v, 4)); - m256 shuf3 = vpshufb(shuf_mask_hi, t2); - m256 tmp = and256(or256(shuf1, shuf2), shuf3); - m256 tmp2 = eq256(tmp, zeroes256()); - u32 z = movemask256(tmp2); - - return z; -} - -static -const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, - const u8 *buf, const u8 *buf_end) { - uintptr_t len = buf_end - buf; - assert(len < 32); - - m256 chars = 
zeroes256();
-    memcpy(&chars, buf, len);
-
-    u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars);
-    // can't be these bytes in z
-    u32 mask = (0xFFFFFFFF >> (32 - len)) ^ 0xFFFFFFFF;
-    const u8 *rv = firstMatch(buf, z | mask);
-
-    if (rv) {
-        return rv;
-    } else {
-        return buf_end;
-    }
-}
-
-#endif
-
-#endif /* TRUFFLE_COMMON_H_ */
diff --git a/src/nfagraph/ng.cpp b/src/nfagraph/ng.cpp
index dff9c7e87..8b247c74b 100644
--- a/src/nfagraph/ng.cpp
+++ b/src/nfagraph/ng.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -27,10 +27,11 @@
 */
 
 /** \file
- * \brief NG, NGHolder, NGWrapper and graph handling.
+ * \brief NG and graph handling.
 */
 
-#include "grey.h"
 #include "ng.h"
+
+#include "grey.h"
 #include "ng_anchored_acyclic.h"
 #include "ng_anchored_dots.h"
 #include "ng_asserts.h"
@@ -41,6 +42,7 @@
 #include "ng_equivalence.h"
 #include "ng_extparam.h"
 #include "ng_fixed_width.h"
+#include "ng_fuzzy.h"
 #include "ng_haig.h"
 #include "ng_literal_component.h"
 #include "ng_literal_decorated.h"
@@ -52,7 +54,6 @@
 #include "ng_region.h"
 #include "ng_region_redundancy.h"
 #include "ng_reports.h"
-#include "ng_rose.h"
 #include "ng_sep.h"
 #include "ng_small_literal_set.h"
 #include "ng_som.h"
@@ -62,6 +63,7 @@
 #include "ng_util.h"
 #include "ng_width.h"
 #include "ue2common.h"
+#include "compiler/compiler.h"
 #include "nfa/goughcompile.h"
 #include "rose/rose_build.h"
 #include "smallwrite/smallwrite_build.h"
@@ -100,16 +102,16 @@ NG::~NG() {
 * \throw CompileError if SOM cannot be supported for the component.
 */
 static
-bool addComponentSom(NG &ng, NGHolder &g, const NGWrapper &w,
+bool addComponentSom(NG &ng, NGHolder &g, const ExpressionInfo &expr,
                      const som_type som, const u32 comp_id) {
     DEBUG_PRINTF("doing som\n");
-    dumpComponent(g, "03_presom", w.expressionIndex, comp_id, ng.cc.grey);
+    dumpComponent(g, "03_presom", expr.index, comp_id, ng.cc.grey);
     assert(hasCorrectlyNumberedVertices(g));
-    assert(allMatchStatesHaveReports(w));
+    assert(allMatchStatesHaveReports(g));
 
     // First, we try the "SOM chain" support in ng_som.cpp.
-    sombe_rv rv = doSom(ng, g, w, comp_id, som);
+    sombe_rv rv = doSom(ng, g, expr, comp_id, som);
     if (rv == SOMBE_HANDLED_INTERNAL) {
         return false;
     } else if (rv == SOMBE_HANDLED_ALL) {
@@ -118,7 +120,7 @@ bool addComponentSom(NG &ng, NGHolder &g, const NGWrapper &w,
     assert(rv == SOMBE_FAIL);
 
     /* Next, Sombe style approaches */
-    rv = doSomWithHaig(ng, g, w, comp_id, som);
+    rv = doSomWithHaig(ng, g, expr, comp_id, som);
     if (rv == SOMBE_HANDLED_INTERNAL) {
         return false;
     } else if (rv == SOMBE_HANDLED_ALL) {
@@ -132,7 +134,7 @@ bool addComponentSom(NG &ng, NGHolder &g, const NGWrapper &w,
     vector<vector<CharReach>> triggers; /* empty for outfix */
 
     assert(g.kind == NFA_OUTFIX);
-    dumpComponent(g, "haig", w.expressionIndex, comp_id, ng.cc.grey);
+    dumpComponent(g, "haig", expr.index, comp_id, ng.cc.grey);
     makeReportsSomPass(ng.rm, g);
     auto haig = attemptToBuildHaig(g, som, ng.ssm.somPrecision(), triggers,
                                    ng.cc.grey);
@@ -145,7 +147,7 @@ bool addComponentSom(NG &ng, NGHolder &g, const NGWrapper &w,
 
     /* Our various strategies for supporting SOM for this pattern have failed.
* Provide a generic pattern not supported/too large return value as it is * unclear what the meaning of a specific SOM error would be */ - throw CompileError(w.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); assert(0); // unreachable return false; @@ -200,27 +202,35 @@ void reduceGraph(NGHolder &g, som_type som, bool utf8, } static -bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, - const u32 comp_id) { +bool addComponent(NG &ng, NGHolder &g, const ExpressionInfo &expr, + const som_type som, const u32 comp_id) { const CompileContext &cc = ng.cc; assert(hasCorrectlyNumberedVertices(g)); DEBUG_PRINTF("expr=%u, comp=%u: %zu vertices, %zu edges\n", - w.expressionIndex, comp_id, num_vertices(g), num_edges(g)); + expr.index, comp_id, num_vertices(g), num_edges(g)); - dumpComponent(g, "01_begin", w.expressionIndex, comp_id, ng.cc.grey); + dumpComponent(g, "01_begin", expr.index, comp_id, ng.cc.grey); - assert(allMatchStatesHaveReports(w)); + assert(allMatchStatesHaveReports(g)); - reduceGraph(g, som, w.utf8, cc); + reduceExtendedParams(g, ng.rm, som); + reduceGraph(g, som, expr.utf8, cc); - dumpComponent(g, "02_reduced", w.expressionIndex, comp_id, ng.cc.grey); + dumpComponent(g, "02_reduced", expr.index, comp_id, ng.cc.grey); // There may be redundant regions that we can remove if (cc.grey.performGraphSimplification) { removeRegionRedundancy(g, som); } + // We might be done at this point: if we've run out of vertices, we can + // stop processing. + if (num_vertices(g) == N_SPECIALS) { + DEBUG_PRINTF("all vertices claimed\n"); + return true; + } + // "Short Exhaustible Passthrough" patterns always become outfixes. if (!som && isSEP(g, ng.rm, cc.grey)) { DEBUG_PRINTF("graph is SEP\n"); @@ -231,12 +241,12 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, // Start Of Match handling. if (som) { - if (addComponentSom(ng, g, w, som, comp_id)) { + if (addComponentSom(ng, g, expr, som, comp_id)) { return true; } } - assert(allMatchStatesHaveReports(w)); + assert(allMatchStatesHaveReports(g)); if (splitOffAnchoredAcyclic(*ng.rose, g, cc)) { return true; @@ -251,15 +261,11 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, return true; } - if (doViolet(*ng.rose, g, w.prefilter, cc)) { + if (doViolet(*ng.rose, g, expr.prefilter, false, ng.rm, cc)) { return true; } - if (splitOffRose(*ng.rose, g, w.prefilter, cc)) { - return true; - } - - if (splitOffPuffs(*ng.rose, ng.rm, g, w.prefilter, cc)) { + if (splitOffPuffs(*ng.rose, ng.rm, g, expr.prefilter, cc)) { return true; } @@ -272,26 +278,7 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, return true; } - if (doViolet(*ng.rose, g, w.prefilter, cc)) { - return true; - } - - if (splitOffRose(*ng.rose, g, w.prefilter, cc)) { - return true; - } - - // A final pass at cyclic redundancy and Rose - // TODO: investigate - coverage results suggest that this never succeeds? - if (cc.grey.performGraphSimplification) { - if (removeCyclicPathRedundancy(g) || - removeCyclicDominated(g, som)) { - if (handleFixedWidth(*ng.rose, g, cc.grey)) { - return true; - } - } - } - - if (finalChanceRose(*ng.rose, g, w.prefilter, cc)) { + if (doViolet(*ng.rose, g, expr.prefilter, true, ng.rm, cc)) { return true; } @@ -306,7 +293,7 @@ bool addComponent(NG &ng, NGHolder &g, const NGWrapper &w, const som_type som, // Returns true if all components have been added. 
static
-bool processComponents(NG &ng, NGWrapper &w,
+bool processComponents(NG &ng, ExpressionInfo &expr,
                        deque<unique_ptr<NGHolder>> &g_comp,
                        const som_type som) {
     const u32 num_components = g_comp.size();
@@ -316,7 +303,7 @@ bool processComponents(NG &ng, NGWrapper &w,
         if (!g_comp[i]) {
             continue;
         }
-        if (addComponent(ng, *g_comp[i], w, som, i)) {
+        if (addComponent(ng, *g_comp[i], expr, som, i)) {
             g_comp[i].reset();
             continue;
         }
@@ -336,40 +323,65 @@ bool processComponents(NG &ng, NGWrapper &w,
     return false;
 }
 
-bool NG::addGraph(NGWrapper &w) {
+bool NG::addGraph(ExpressionInfo &expr, unique_ptr<NGHolder> g_ptr) {
+    assert(g_ptr);
+    NGHolder &g = *g_ptr;
+
     // remove reports that aren't on vertices connected to accept.
-    clearReports(w);
+    clearReports(g);
 
-    som_type som = w.som;
-    if (som && isVacuous(w)) {
-        throw CompileError(w.expressionIndex, "Start of match is not "
+    som_type som = expr.som;
+    if (som && isVacuous(g)) {
+        throw CompileError(expr.index, "Start of match is not "
                            "currently supported for patterns which match an "
                            "empty buffer.");
     }
 
-    dumpDotWrapper(w, "01_initial", cc.grey);
-    assert(allMatchStatesHaveReports(w));
+    dumpDotWrapper(g, expr, "01_initial", cc.grey);
+    assert(allMatchStatesHaveReports(g));
 
     /* ensure utf8 starts at cp boundary */
-    ensureCodePointStart(rm, w);
-    resolveAsserts(rm, w);
+    ensureCodePointStart(rm, g, expr);
 
-    dumpDotWrapper(w, "02_post_assert_resolve", cc.grey);
-    assert(allMatchStatesHaveReports(w));
+    if (can_never_match(g)) {
+        throw CompileError(expr.index, "Pattern can never match.");
+    }
 
-    pruneUseless(w);
-    pruneEmptyVertices(w);
+    // validate graph's suitability for fuzzing before resolving asserts
+    validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey);
 
-    if (can_never_match(w)) {
-        throw CompileError(w.expressionIndex, "Pattern can never match.");
+    resolveAsserts(rm, g, expr);
+    dumpDotWrapper(g, expr, "02_post_assert_resolve", cc.grey);
+    assert(allMatchStatesHaveReports(g));
+
+    make_fuzzy(g, expr.edit_distance, cc.grey);
+    dumpDotWrapper(g, expr, "02a_post_fuzz", cc.grey);
+
+    pruneUseless(g);
+    pruneEmptyVertices(g);
+
+    if (can_never_match(g)) {
+        throw CompileError(expr.index, "Pattern can never match.");
     }
 
-    optimiseVirtualStarts(w); /* good for som */
+    optimiseVirtualStarts(g); /* good for som */
+
+    propagateExtendedParams(g, expr, rm);
+    reduceExtendedParams(g, rm, som);
+
+    // We may have removed all the edges to accept, in which case this
+    // expression cannot match.
+    if (can_never_match(g)) {
+        throw CompileError(expr.index, "Extended parameter constraints can not "
+                                       "be satisfied for any match from this "
+                                       "expression.");
+    }
 
-    handleExtendedParams(rm, w, cc);
-    if (w.min_length) {
-        // We have a minimum length constraint, which we currently use SOM to
-        // satisfy.
+    if (any_of_in(all_reports(g), [&](ReportID id) {
+            return rm.getReport(id).minLength;
+        })) {
+        // We have at least one report with a minimum length constraint, which
+        // we currently use SOM to satisfy.
         som = SOM_LEFT;
         ssm.somPrecision(8);
     }
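The hunk above replaces the old `w.min_length` flag check with a per-report query: a minimum-length constraint can only be enforced if the start of each match is known, so the presence of any such report forces SOM tracking. A reduced sketch of the invariant being enforced (plain C++, not Hyperscan internals):

```cpp
#include <cstdint>

// A match can only be tested against min_length if both of its endpoints
// are known; the "from" field is what SOM (start of match) tracking supplies.
struct MatchSketch {
    uint64_t from; // start of match, requires SOM
    uint64_t to;   // end of match, always available
};

static bool satisfiesMinLength(const MatchSketch &m, uint64_t min_length) {
    return m.to - m.from >= min_length;
}
```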
@@ -381,98 +393,104 @@ bool NG::addGraph(NGWrapper &w) {
 
     // first, we can perform graph work that can be done on an individual
     // expression basis.
-    if (w.utf8) {
-        relaxForbiddenUtf8(w);
+    if (expr.utf8) {
+        relaxForbiddenUtf8(g, expr);
     }
 
-    if (w.highlander && !w.min_length && !w.min_offset) {
+    if (all_of_in(all_reports(g), [&](ReportID id) {
+            const auto &report = rm.getReport(id);
+            return report.ekey != INVALID_EKEY && !report.minLength &&
+                   !report.minOffset;
+        })) {
         // In highlander mode: if we don't have constraints on our reports that
         // may prevent us accepting our first match (i.e. extended params) we
         // can prune the other out-edges of all vertices connected to accept.
-        pruneHighlanderAccepts(w, rm);
+        // TODO: shift the report checking down into pruneHighlanderAccepts()
+        // to allow us to handle the parts we can in mixed cases.
+        pruneHighlanderAccepts(g, rm);
     }
 
-    dumpDotWrapper(w, "02b_fairly_early", cc.grey);
+    dumpDotWrapper(g, expr, "02b_fairly_early", cc.grey);
 
     // If we're a vacuous pattern, we can handle this early.
-    if (splitOffVacuous(boundary, rm, w)) {
+    if (splitOffVacuous(boundary, rm, g, expr)) {
         DEBUG_PRINTF("split off vacuous\n");
     }
 
     // We might be done at this point: if we've run out of vertices, we can
     // stop processing.
-    if (num_vertices(w) == N_SPECIALS) {
+    if (num_vertices(g) == N_SPECIALS) {
         DEBUG_PRINTF("all vertices claimed by vacuous handling\n");
         return true;
     }
 
     // Now that vacuous edges have been removed, update the min width exclusive
     // of boundary reports.
-    minWidth = min(minWidth, findMinWidth(w));
+    minWidth = min(minWidth, findMinWidth(g));
 
     // Add the pattern to the small write builder.
-    smwr->add(w);
+    smwr->add(g, expr);
 
     if (!som) {
-        removeSiblingsOfStartDotStar(w);
+        removeSiblingsOfStartDotStar(g);
     }
 
-    dumpDotWrapper(w, "03_early", cc.grey);
+    dumpDotWrapper(g, expr, "03_early", cc.grey);
 
     // Perform a reduction pass to merge sibling character classes together.
     if (cc.grey.performGraphSimplification) {
-        removeRedundancy(w, som);
-        prunePathsRedundantWithSuccessorOfCyclics(w, som);
+        removeRedundancy(g, som);
+        prunePathsRedundantWithSuccessorOfCyclics(g, som);
     }
 
-    dumpDotWrapper(w, "04_reduced", cc.grey);
+    dumpDotWrapper(g, expr, "04_reduced", cc.grey);
 
     // If we've got some literals that span the graph from start to accept, we
     // can split them off into Rose from here.
     if (!som) {
-        if (splitOffLiterals(*this, w)) {
+        if (splitOffLiterals(*this, g)) {
             DEBUG_PRINTF("some vertices claimed by literals\n");
         }
     }
 
     // We might be done at this point: if we've run out of vertices, we can
     // stop processing.
-    if (num_vertices(w) == N_SPECIALS) {
+    if (num_vertices(g) == N_SPECIALS) {
         DEBUG_PRINTF("all vertices claimed before calc components\n");
         return true;
     }
 
-    // Split the graph into a set of connected components.
+    // Split the graph into a set of connected components and process those.
+    // Note: this invalidates g_ptr.
 
-    deque<unique_ptr<NGHolder>> g_comp = calcComponents(w);
+    auto g_comp = calcComponents(std::move(g_ptr), cc.grey);
     assert(!g_comp.empty());
 
     if (!som) {
-        for (u32 i = 0; i < g_comp.size(); i++) {
-            assert(g_comp[i]);
-            reformLeadingDots(*g_comp[i]);
+        for (auto &gc : g_comp) {
+            assert(gc);
+            reformLeadingDots(*gc);
         }
 
-        recalcComponents(g_comp);
+        recalcComponents(g_comp, cc.grey);
     }
 
-    if (processComponents(*this, w, g_comp, som)) {
+    if (processComponents(*this, expr, g_comp, som)) {
         return true;
     }
 
     // If we're in prefiltering mode, we can run the prefilter reductions and
     // have another shot at accepting the graph.
- if (cc.grey.prefilterReductions && w.prefilter) { - for (u32 i = 0; i < g_comp.size(); i++) { - if (!g_comp[i]) { + if (cc.grey.prefilterReductions && expr.prefilter) { + for (auto &gc : g_comp) { + if (!gc) { continue; } - - prefilterReductions(*g_comp[i], cc); + prefilterReductions(*gc, cc); } - if (processComponents(*this, w, g_comp, som)) { + if (processComponents(*this, expr, g_comp, som)) { return true; } } @@ -482,7 +500,7 @@ bool NG::addGraph(NGWrapper &w) { if (g_comp[i]) { DEBUG_PRINTF("could not compile component %u with %zu vertices\n", i, num_vertices(*g_comp[i])); - throw CompileError(w.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } } @@ -491,63 +509,60 @@ bool NG::addGraph(NGWrapper &w) { } /** \brief Used from SOM mode to add an arbitrary NGHolder as an engine. */ -bool NG::addHolder(NGHolder &w) { - DEBUG_PRINTF("adding holder of %zu states\n", num_vertices(w)); - assert(allMatchStatesHaveReports(w)); - assert(hasCorrectlyNumberedVertices(w)); +bool NG::addHolder(NGHolder &g) { + DEBUG_PRINTF("adding holder of %zu states\n", num_vertices(g)); + assert(allMatchStatesHaveReports(g)); + assert(hasCorrectlyNumberedVertices(g)); /* We don't update the global minWidth here as we care about the min width * of the whole pattern - not a just a prefix of it. */ bool prefilter = false; - //dumpDotComp(comp, w, *this, 20, "prefix_init"); + //dumpDotComp(comp, g, *this, 20, "prefix_init"); som_type som = SOM_NONE; /* the prefixes created by the SOM code do not themselves track som */ bool utf8 = false; // handling done earlier - reduceGraph(w, som, utf8, cc); + reduceGraph(g, som, utf8, cc); // There may be redundant regions that we can remove if (cc.grey.performGraphSimplification) { - removeRegionRedundancy(w, som); + removeRegionRedundancy(g, som); } // "Short Exhaustible Passthrough" patterns always become outfixes. 
- if (isSEP(w, rm, cc.grey)) { + if (isSEP(g, rm, cc.grey)) { DEBUG_PRINTF("graph is SEP\n"); - if (rose->addOutfix(w)) { + if (rose->addOutfix(g)) { return true; } } - if (splitOffAnchoredAcyclic(*rose, w, cc)) { + if (splitOffAnchoredAcyclic(*rose, g, cc)) { return true; } - if (handleSmallLiteralSets(*rose, w, cc) - || handleFixedWidth(*rose, w, cc.grey)) { + if (handleSmallLiteralSets(*rose, g, cc) + || handleFixedWidth(*rose, g, cc.grey)) { return true; } - if (handleDecoratedLiterals(*rose, w, cc)) { + if (handleDecoratedLiterals(*rose, g, cc)) { return true; } - if (splitOffRose(*rose, w, prefilter, cc)) { + if (doViolet(*rose, g, prefilter, false, rm, cc)) { return true; } - if (splitOffPuffs(*rose, rm, w, prefilter, cc)) { + if (splitOffPuffs(*rose, rm, g, prefilter, cc)) { return true; } - if (splitOffRose(*rose, w, prefilter, cc)) { - return true; - } - if (finalChanceRose(*rose, w, prefilter, cc)) { + if (doViolet(*rose, g, prefilter, true, rm, cc)) { return true; } DEBUG_PRINTF("trying for outfix\n"); - if (rose->addOutfix(w)) { + if (rose->addOutfix(g)) { DEBUG_PRINTF("ok\n"); return true; } @@ -602,24 +617,4 @@ bool NG::addLiteral(const ue2_literal &literal, u32 expr_index, return true; } -NGWrapper::NGWrapper(unsigned int ei, bool highlander_in, bool utf8_in, - bool prefilter_in, som_type som_in, ReportID r, - u64a min_offset_in, u64a max_offset_in, u64a min_length_in) - : expressionIndex(ei), reportId(r), highlander(highlander_in), - utf8(utf8_in), prefilter(prefilter_in), som(som_in), - min_offset(min_offset_in), max_offset(max_offset_in), - min_length(min_length_in) { - // All special nodes/edges are added in NGHolder's constructor. - DEBUG_PRINTF("built %p: expr=%u report=%u%s%s%s%s " - "min_offset=%llu max_offset=%llu min_length=%llu\n", - this, expressionIndex, reportId, - highlander ? " highlander" : "", - utf8 ? " utf8" : "", - prefilter ? " prefilter" : "", - (som != SOM_NONE) ? " som" : "", - min_offset, max_offset, min_length); -} - -NGWrapper::~NGWrapper() {} - } // namespace ue2 diff --git a/src/nfagraph/ng.h b/src/nfagraph/ng.h index 4aa6a7dc7..a5a5c235a 100644 --- a/src/nfagraph/ng.h +++ b/src/nfagraph/ng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,7 +27,7 @@ */ /** \file - * \brief NG, NGHolder, NGWrapper declarations. + * \brief NG declaration. 
 */
 
 #ifndef NG_H
@@ -42,6 +42,7 @@
 #include "util/compile_context.h"
 #include "util/depth.h"
 #include "util/graph.h"
+#include "util/noncopyable.h"
 #include "util/report_manager.h"
 #include "util/ue2_containers.h"
 
@@ -51,41 +52,16 @@
 #include <utility>
 #include <vector>
 
-#include <boost/core/noncopyable.hpp>
-
 namespace ue2 {
 
 struct CompileContext;
 struct ue2_literal;
 
-class NGWrapper : public NGHolder {
-public:
-    NGWrapper(unsigned int expressionIndex, bool highlander, bool utf8,
-              bool prefilter, const som_type som, ReportID rid, u64a min_offset,
-              u64a max_offset, u64a min_length);
-
-    ~NGWrapper() override;
-
-    /** index of the expression represented by this graph, used
-     *  - down the track in error handling
-     *  - identifying parts of an expression in highlander mode
-     */
-    const unsigned int expressionIndex;
-
-    const ReportID reportId; /**< user-visible report id */
-    const bool highlander; /**< user-specified single match only */
-    const bool utf8; /**< UTF-8 mode */
-    const bool prefilter; /**< prefiltering mode */
-    const som_type som; /**< SOM type requested */
-    u64a min_offset; /**< extparam min_offset value */
-    u64a max_offset; /**< extparam max_offset value */
-    u64a min_length; /**< extparam min_length value */
-};
-
+class ExpressionInfo;
 class RoseBuild;
 class SmallWriteBuild;
 
-class NG : boost::noncopyable {
+class NG : noncopyable {
 public:
     NG(const CompileContext &in_cc, size_t num_patterns,
        unsigned in_somPrecision);
@@ -93,14 +69,14 @@ class NG : boost::noncopyable {
 
     /** \brief Consumes a pattern, returns false or throws a CompileError
      * exception if the graph cannot be consumed. */
-    bool addGraph(NGWrapper &w);
+    bool addGraph(ExpressionInfo &expr, std::unique_ptr<NGHolder> g_ptr);
 
     /** \brief Consumes a graph, cut-down version of addGraph for use by SOM
      * processing. */
     bool addHolder(NGHolder &h);
 
-    /** \brief Adds a literal to Rose, used by literal shortcut passes (instead of
-     * using \ref addGraph) */
+    /** \brief Adds a literal to Rose, used by literal shortcut passes (instead
+     * of using \ref addGraph) */
     bool addLiteral(const ue2_literal &lit, u32 expr_index, u32 external_report,
                     bool highlander, som_type som);
 
@@ -127,7 +103,8 @@ class NG : boost::noncopyable {
 *
 * Shared with the small write compiler.
 */
-void reduceGraph(NGHolder &g, som_type som, bool utf8, const CompileContext &cc);
+void reduceGraph(NGHolder &g, som_type som, bool utf8,
+                 const CompileContext &cc);
 
 } // namespace ue2
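`NG` now derives from a project-local `noncopyable` rather than `boost::noncopyable`, which is what lets the Boost include be dropped from this header. A minimal sketch of what such a mixin looks like (an assumed shape; the real `util/noncopyable.h` may differ in detail):

```cpp
#include <type_traits>

namespace ue2_sketch {

// Deleting the copy operations in a base class disables them for every
// class that inherits from it, with no runtime cost.
class noncopyable {
protected:
    noncopyable() = default;
    ~noncopyable() = default;
    noncopyable(const noncopyable &) = delete;
    noncopyable &operator=(const noncopyable &) = delete;
};

} // namespace ue2_sketch

class NGLike : ue2_sketch::noncopyable {}; // hypothetical user, as NG above

static_assert(!std::is_copy_constructible<NGLike>::value,
              "copies are disabled via the mixin");
```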
diff --git a/src/nfagraph/ng_anchored_dots.cpp b/src/nfagraph/ng_anchored_dots.cpp
index ed9c7f486..9a13376d1 100644
--- a/src/nfagraph/ng_anchored_dots.cpp
+++ b/src/nfagraph/ng_anchored_dots.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -208,7 +208,7 @@ void reformAnchoredRepeatsComponent(NGHolder &g,
 
     /* get bounds */
     depth min;
-    depth max = 1;
+    depth max(1);
 
     if (selfLoop) {
         // A self-loop indicates that this is a '.+' or '.*'
@@ -229,9 +229,9 @@ void reformAnchoredRepeatsComponent(NGHolder &g,
             }
         }
 
-        min = 0;
+        min = depth(0);
     } else {
-        min = 1;
+        min = depth(1);
     }
 
     *startBegin = min;
@@ -326,8 +326,8 @@ void reformUnanchoredRepeatsComponent(NGHolder &g,
     }
 
     /* get bounds */
-    depth min = 1;
-    depth max = 1;
+    depth min(1);
+    depth max(1);
 
     if (selfLoop) {
         // A self-loop indicates that this is a '.+' or '.*'
@@ -349,7 +349,7 @@ void reformUnanchoredRepeatsComponent(NGHolder &g,
             DEBUG_PRINTF("min greater than one, skipping\n");
             return;
         }
-        min = 0;
+        min = depth(0);
     }
 
     *startBegin += min;
@@ -502,7 +502,7 @@ void collapseVariableDotRepeat(NGHolder &g, NFAVertex start,
               startEnd->str().c_str());
 
     if (start == g.start && startEnd->is_infinite()) {
-        *startEnd = dots.size();
+        *startEnd = depth(dots.size());
     } else if (startEnd->is_finite()) {
         *startEnd += dots.size();
     }
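The `depth max = 1;` to `depth max(1);` changes above read as a consequence of `depth`'s integer constructor being (or becoming) `explicit`: copy-initialisation from a plain integer stops compiling, while direct-initialisation still works. A reduced illustration of the distinction:

```cpp
#include <cstdint>

// Stand-in for ue2::depth, assuming only that its converting constructor
// is marked explicit.
class DepthSketch {
public:
    explicit DepthSketch(uint32_t v) : val(v) {}
private:
    uint32_t val;
};

void demo() {
    DepthSketch a(1);       // OK: direct-initialisation
    // DepthSketch b = 1;   // ill-formed: explicit constructor
    a = DepthSketch(0);     // assignment must also name the type
}
```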
diff --git a/src/nfagraph/ng_asserts.cpp b/src/nfagraph/ng_asserts.cpp
index c2f0d68f1..8812afadb 100644
--- a/src/nfagraph/ng_asserts.cpp
+++ b/src/nfagraph/ng_asserts.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,7 @@
 #include "ng_prune.h"
 #include "ng_redundancy.h"
 #include "ng_util.h"
+#include "compiler/compiler.h"
 #include "parser/position.h" // for POS flags
 #include "util/bitutils.h" // for findAndClearLSB_32
 #include "util/boundary_reports.h"
@@ -184,43 +185,45 @@ void findSplitters(const NGHolder &g, const vector<NFAEdge> &asserts,
 }
 
 static
-void setReportId(ReportManager &rm, NGWrapper &g, NFAVertex v, s32 adj) {
+void setReportId(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
+                 NFAVertex v, s32 adj) {
     // Don't try and set the report ID of a special vertex.
     assert(!is_special(v, g));
 
     // If there's a report set already, we're replacing it.
     g[v].reports.clear();
 
-    Report ir = rm.getBasicInternalReport(g, adj);
+    Report ir = rm.getBasicInternalReport(expr, adj);
 
     g[v].reports.insert(rm.getInternalId(ir));
     DEBUG_PRINTF("set report id for vertex %zu, adj %d\n", g[v].index, adj);
 }
 
 static
-NFAVertex makeClone(ReportManager &rm, NGWrapper &g, NFAVertex v,
-                    const CharReach &cr_mask) {
+NFAVertex makeClone(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
+                    NFAVertex v, const CharReach &cr_mask) {
     NFAVertex clone = clone_vertex(g, v);
     g[clone].char_reach &= cr_mask;
     clone_out_edges(g, v, clone);
     clone_in_edges(g, v, clone);
 
     if (v == g.startDs) {
-        if (g.utf8) {
+        if (expr.utf8) {
             g[clone].char_reach &= ~UTF_START_CR;
         }
 
         DEBUG_PRINTF("marked as virt\n");
         g[clone].assert_flags = POS_FLAG_VIRTUAL_START;
 
-        setReportId(rm, g, clone, 0);
+        setReportId(rm, g, expr, clone, 0);
     }
 
     return clone;
 }
 
 static
-void splitVertex(ReportManager &rm, NGWrapper &g, NFAVertex v, bool ucp) {
+void splitVertex(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
+                 NFAVertex v, bool ucp) {
     assert(v != g.start);
     assert(v != g.accept);
     assert(v != g.acceptEod);
@@ -232,14 +235,14 @@ void splitVertex(ReportManager &rm, NGWrapper &g, NFAVertex v, bool ucp) {
     auto has_no_assert = [&g](const NFAEdge &e) { return !g[e].assert_flags; };
 
     // Split v into word/nonword vertices with only asserting out-edges.
-    NFAVertex w_out = makeClone(rm, g, v, cr_word);
-    NFAVertex nw_out = makeClone(rm, g, v, cr_nonword);
+    NFAVertex w_out = makeClone(rm, g, expr, v, cr_word);
+    NFAVertex nw_out = makeClone(rm, g, expr, v, cr_nonword);
     remove_out_edge_if(w_out, has_no_assert, g);
     remove_out_edge_if(nw_out, has_no_assert, g);
 
     // Split v into word/nonword vertices with only asserting in-edges.
-    NFAVertex w_in = makeClone(rm, g, v, cr_word);
-    NFAVertex nw_in = makeClone(rm, g, v, cr_nonword);
+    NFAVertex w_in = makeClone(rm, g, expr, v, cr_word);
+    NFAVertex nw_in = makeClone(rm, g, expr, v, cr_nonword);
     remove_in_edge_if(w_in, has_no_assert, g);
     remove_in_edge_if(nw_in, has_no_assert, g);
@@ -250,7 +253,8 @@ void splitVertex(ReportManager &rm, NGWrapper &g, NFAVertex v, bool ucp) {
 }
 
 static
-void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) {
+void resolveEdges(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr,
+                  set<NFAEdge> *dead) {
     for (const auto &e : edges_range(g)) {
         u32 flags = g[e].assert_flags;
         if (!flags) {
@@ -363,7 +367,7 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) {
         } else if (v_w) {
             /* need to add a word byte */
             NFAVertex vv = add_vertex(g);
-            setReportId(rm, g, vv, -1);
+            setReportId(rm, g, expr, vv, -1);
             g[vv].char_reach = CHARREACH_WORD;
             add_edge(vv, g.accept, g);
             g[e].assert_flags = 0;
@@ -372,7 +376,7 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) {
         } else {
             /* need to add a non word byte or see eod */
             NFAVertex vv = add_vertex(g);
-            setReportId(rm, g, vv, -1);
+            setReportId(rm, g, expr, vv, -1);
             g[vv].char_reach = CHARREACH_NONWORD;
             add_edge(vv, g.accept, g);
             g[e].assert_flags = 0;
@@ -416,7 +420,7 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) {
         } else if (v_w) {
             /* need to add a word byte */
             NFAVertex vv = add_vertex(g);
-            setReportId(rm, g, vv, -1);
+            setReportId(rm, g, expr, vv, -1);
             g[vv].char_reach = CHARREACH_WORD_UCP_PRE;
             add_edge(vv, g.accept, g);
             g[e].assert_flags = 0;
@@ -425,7 +429,7 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set<NFAEdge> *dead) {
         } else {
             /* need to add a non word byte or see eod */
             NFAVertex vv = add_vertex(g);
-            setReportId(rm, g, vv, -1);
+            setReportId(rm, g, expr, vv, -1);
             g[vv].char_reach = CHARREACH_NONWORD_UCP_PRE;
             add_edge(vv, g.accept, g);
             g[e].assert_flags = 0;
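The `resolveEdges()` hunks above materialise end-of-pattern word-boundary asserts as a real one-byte vertex (`CHARREACH_WORD` or `CHARREACH_NONWORD`) in front of accept. For reference, the word-character test those classes encode is the usual `\w` = `[0-9A-Za-z_]` (ASCII sketch; the real `CharReach` is a full 256-bit class):

```cpp
#include <cctype>

// A byte counts as a "word" character for \b purposes if it is
// alphanumeric or an underscore.
static bool isWordByte(unsigned char c) {
    return c == '_' || std::isalnum(c) != 0;
}
```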
setReportId(rm, g, expr, vv, -1); g[vv].char_reach = CHARREACH_NONWORD_UCP_PRE; add_edge(vv, g.accept, g); g[e].assert_flags = 0; @@ -450,7 +454,8 @@ void resolveEdges(ReportManager &rm, NGWrapper &g, set *dead) { } } -void resolveAsserts(ReportManager &rm, NGWrapper &g) { +void resolveAsserts(ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr) { vector asserts = getAsserts(g); if (asserts.empty()) { return; @@ -460,20 +465,20 @@ void resolveAsserts(ReportManager &rm, NGWrapper &g) { map to_split_ucp; /* by index, for determinism */ findSplitters(g, asserts, &to_split, &to_split_ucp); if (to_split.size() + to_split_ucp.size() > MAX_CLONED_VERTICES) { - throw CompileError(g.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } for (const auto &m : to_split) { assert(!contains(to_split_ucp, m.first)); - splitVertex(rm, g, m.second, false); + splitVertex(rm, g, expr, m.second, false); } for (const auto &m : to_split_ucp) { - splitVertex(rm, g, m.second, true); + splitVertex(rm, g, expr, m.second, true); } set dead; - resolveEdges(rm, g, &dead); + resolveEdges(rm, g, expr, &dead); remove_edges(dead, g); renumber_vertices(g); @@ -485,15 +490,16 @@ void resolveAsserts(ReportManager &rm, NGWrapper &g) { clearReports(g); } -void ensureCodePointStart(ReportManager &rm, NGWrapper &g) { +void ensureCodePointStart(ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr) { /* In utf8 mode there is an implicit assertion that we start at codepoint * boundaries. Assert resolution handles the badness coming from asserts. * The only other source of trouble is startDs->accept connections. */ NFAEdge orig = edge(g.startDs, g.accept, g); - if (g.utf8 && orig) { - DEBUG_PRINTF("rectifying %u\n", g.reportId); - Report ir = rm.getBasicInternalReport(g); + if (expr.utf8 && orig) { + DEBUG_PRINTF("rectifying %u\n", expr.report); + Report ir = rm.getBasicInternalReport(expr); ReportID rep = rm.getInternalId(ir); NFAVertex v_a = add_vertex(g); diff --git a/src/nfagraph/ng_asserts.h b/src/nfagraph/ng_asserts.h index 8183490ac..2534f5714 100644 --- a/src/nfagraph/ng_asserts.h +++ b/src/nfagraph/ng_asserts.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,12 +36,14 @@ namespace ue2 { struct BoundaryReports; -class NGWrapper; +class ExpressionInfo; +class NGHolder; class ReportManager; -void resolveAsserts(ReportManager &rm, NGWrapper &g); +void resolveAsserts(ReportManager &rm, NGHolder &g, const ExpressionInfo &expr); -void ensureCodePointStart(ReportManager &rm, NGWrapper &g); +void ensureCodePointStart(ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr); } // namespace ue2 diff --git a/src/nfagraph/ng_builder.cpp b/src/nfagraph/ng_builder.cpp index 4ca0b37e4..60f667f49 100644 --- a/src/nfagraph/ng_builder.cpp +++ b/src/nfagraph/ng_builder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,11 +28,13 @@ /** \file * \brief: NFA Graph Builder: used by Glushkov construction to construct an - * NGWrapper from a parsed expression. + * NGHolder from a parsed expression. 
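The `ensureCodePointStart()` rework above preserves the invariant that UTF-8 matching begins on code point boundaries. As a byte-level reminder of the underlying rule (an illustrative helper, not a Hyperscan API): a byte starts a code point unless it is a `0b10xxxxxx` continuation byte.

```cpp
#include <cstdio>

// Illustrative helper (not a Hyperscan API): a byte begins a UTF-8 code
// point unless it is a continuation byte of the form 0b10xxxxxx.
static bool isCodePointStart(unsigned char b) {
    return (b & 0xC0) != 0x80;
}

int main() {
    const unsigned char snowman[] = {0xE2, 0x98, 0x83}; // U+2603, 3 bytes
    for (unsigned char b : snowman) {
        std::printf("0x%02X starts a code point: %s\n", b,
                    isCodePointStart(b) ? "yes" : "no");
    }
    return 0;
}
```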
*/ + +#include "ng_builder.h" + #include "grey.h" #include "ng.h" -#include "ng_builder.h" #include "ng_util.h" #include "ue2common.h" #include "compiler/compiler.h" // for ParsedExpression @@ -79,7 +81,7 @@ class NFABuilderImpl : public NFABuilder { void cloneRegion(Position first, Position last, unsigned posOffset) override; - unique_ptr getGraph() override; + BuiltExpression getGraph() override; private: /** fetch a vertex given its Position ID. */ @@ -94,8 +96,11 @@ class NFABuilderImpl : public NFABuilder { /** \brief Greybox: used for resource limits. */ const Grey &grey; - /** \brief Underlying NGWrapper graph. */ - unique_ptr graph; + /** \brief Underlying graph. */ + unique_ptr graph; + + /** \brief Underlying expression info. */ + ExpressionInfo expr; /** \brief mapping from position to vertex. Use \ref getVertex for access. * */ @@ -108,12 +113,9 @@ class NFABuilderImpl : public NFABuilder { } // namespace NFABuilderImpl::NFABuilderImpl(ReportManager &rm_in, const Grey &grey_in, - const ParsedExpression &expr) - : rm(rm_in), grey(grey_in), - graph(ue2::make_unique( - expr.index, expr.highlander, expr.utf8, expr.prefilter, expr.som, - expr.id, expr.min_offset, expr.max_offset, expr.min_length)), - vertIdx(N_SPECIALS) { + const ParsedExpression &parsed) + : rm(rm_in), grey(grey_in), graph(ue2::make_unique()), + expr(parsed.expr), vertIdx(N_SPECIALS) { // Reserve space for a reasonably-sized NFA id2vertex.reserve(64); @@ -150,7 +152,7 @@ void NFABuilderImpl::addVertex(Position pos) { (*graph)[v].index = pos; } -unique_ptr NFABuilderImpl::getGraph() { +BuiltExpression NFABuilderImpl::getGraph() { DEBUG_PRINTF("built graph has %zu vertices and %zu edges\n", num_vertices(*graph), num_edges(*graph)); @@ -161,13 +163,13 @@ unique_ptr NFABuilderImpl::getGraph() { throw CompileError("Pattern too large."); } - return move(graph); + return { expr, move(graph) }; } void NFABuilderImpl::setNodeReportID(Position pos, int offsetAdjust) { - Report ir = rm.getBasicInternalReport(*graph, offsetAdjust); + Report ir = rm.getBasicInternalReport(expr, offsetAdjust); DEBUG_PRINTF("setting report id on %u = (%u, %d, %u)\n", - pos, graph->reportId, offsetAdjust, ir.ekey); + pos, expr.report, offsetAdjust, ir.ekey); NFAVertex v = getVertex(pos); auto &reports = (*graph)[v].reports; diff --git a/src/nfagraph/ng_builder.h b/src/nfagraph/ng_builder.h index 5bd95ba9d..9f71b6223 100644 --- a/src/nfagraph/ng_builder.h +++ b/src/nfagraph/ng_builder.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,7 +28,7 @@ /** \file * \brief: NFA Graph Builder: used by Glushkov construction to construct an - * NGWrapper from a parsed expression. + * NGHolder from a parsed expression. */ #ifndef NG_BUILDER_H @@ -37,22 +37,22 @@ #include "ue2common.h" #include "parser/position.h" +#include "util/noncopyable.h" #include -#include namespace ue2 { class CharReach; -class NGWrapper; class ReportManager; +struct BuiltExpression; struct CompileContext; class ParsedExpression; /** \brief Abstract builder interface. Use \ref makeNFABuilder to construct * one. Used by GlushkovBuildState. 
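The new `getGraph()` returning a `BuiltExpression` pairs the `ExpressionInfo` with the built holder in a single value, moving the graph out of the builder. A loose sketch of that hand-off pattern, with hypothetical stand-in types:

```cpp
#include <memory>
#include <utility>

// Hypothetical stand-ins for ExpressionInfo and NGHolder; the point is the
// hand-off of metadata plus a moved unique_ptr in one aggregate return.
struct Info { unsigned index; };
struct Holder {};

struct Built {
    Info info;
    std::unique_ptr<Holder> graph;
};

class Builder {
public:
    Built getGraph() {
        // After this, the builder's graph member is null: the builder is
        // spent, as the NFABuilder documentation above warns.
        return {info, std::move(graph)};
    }

private:
    Info info{7};
    std::unique_ptr<Holder> graph{new Holder()};
};

int main() {
    Builder b;
    Built built = b.getGraph();
    return built.graph ? 0 : 1; // graph travelled with the metadata
}
```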
*/ -class NFABuilder : boost::noncopyable { +class NFABuilder : noncopyable { public: virtual ~NFABuilder(); @@ -83,10 +83,10 @@ class NFABuilder : boost::noncopyable { unsigned posOffset) = 0; /** - * \brief Returns the built NGWrapper graph. + * \brief Returns the built NGHolder graph and ExpressionInfo. * Note that this builder cannot be used after this call. */ - virtual std::unique_ptr getGraph() = 0; + virtual BuiltExpression getGraph() = 0; }; /** Construct a usable NFABuilder. */ diff --git a/src/nfagraph/ng_calc_components.cpp b/src/nfagraph/ng_calc_components.cpp index da6775e44..bfe73eb27 100644 --- a/src/nfagraph/ng_calc_components.cpp +++ b/src/nfagraph/ng_calc_components.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -55,6 +55,7 @@ #include "ng_prune.h" #include "ng_undirected.h" #include "ng_util.h" +#include "grey.h" #include "ue2common.h" #include "util/graph_range.h" #include "util/make_unique.h" @@ -63,6 +64,7 @@ #include #include +#include using namespace std; @@ -218,66 +220,43 @@ vector findShellEdges(const NGHolder &g, return shell_edges; } -static -void removeVertices(const flat_set &verts, NFAUndirectedGraph &ug, - ue2::unordered_map &old2new, - ue2::unordered_map &new2old) { - for (auto v : verts) { - assert(contains(old2new, v)); - auto uv = old2new.at(v); - clear_vertex(uv, ug); - remove_vertex(uv, ug); - old2new.erase(v); - new2old.erase(uv); - } -} - -static -void renumberVertices(NFAUndirectedGraph &ug) { - u32 vertexIndex = 0; - for (auto uv : vertices_range(ug)) { - put(boost::vertex_index, ug, uv, vertexIndex++); - } -} - /** * Common code called by calc- and recalc- below. Splits the given holder into * one or more connected components, adding them to the comps deque. */ static -void splitIntoComponents(const NGHolder &g, deque> &comps, +void splitIntoComponents(unique_ptr g, + deque> &comps, const depth &max_head_depth, const depth &max_tail_depth, bool *shell_comp) { - DEBUG_PRINTF("graph has %zu vertices\n", num_vertices(g)); + DEBUG_PRINTF("graph has %zu vertices\n", num_vertices(*g)); assert(shell_comp); *shell_comp = false; // Compute "shell" head and tail subgraphs. - vector depths; - calcDepths(g, depths); - auto head_shell = findHeadShell(g, depths, max_head_depth); - auto tail_shell = findTailShell(g, depths, max_tail_depth); + auto depths = calcBidiDepths(*g); + auto head_shell = findHeadShell(*g, depths, max_head_depth); + auto tail_shell = findTailShell(*g, depths, max_tail_depth); for (auto v : head_shell) { tail_shell.erase(v); } - if (head_shell.size() + tail_shell.size() + N_SPECIALS >= num_vertices(g)) { + if (head_shell.size() + tail_shell.size() + N_SPECIALS >= + num_vertices(*g)) { DEBUG_PRINTF("all in shell component\n"); - comps.push_back(cloneHolder(g)); + comps.push_back(std::move(g)); *shell_comp = true; return; } - vector shell_edges = findShellEdges(g, head_shell, tail_shell); + vector shell_edges = findShellEdges(*g, head_shell, tail_shell); DEBUG_PRINTF("%zu vertices in head, %zu in tail, %zu shell edges\n", head_shell.size(), tail_shell.size(), shell_edges.size()); - NFAUndirectedGraph ug; ue2::unordered_map old2new; - - createUnGraph(g, true, true, ug, old2new); + auto ug = createUnGraph(*g, true, true, old2new); // Construct reverse mapping. 
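The switch above from `boost::noncopyable` to a `util/noncopyable.h` base presumably drops a Boost dependency in favour of C++11 deleted members. Such a base is typically shaped like this sketch (the real header may differ):

```cpp
// A minimal sketch of what a util/noncopyable.h replacement for
// boost::noncopyable usually looks like in C++11.
namespace ue2 {

class noncopyable {
protected:
    noncopyable() = default;
    ~noncopyable() = default;
    noncopyable(const noncopyable &) = delete;
    noncopyable &operator=(const noncopyable &) = delete;
};

} // namespace ue2

// Deriving disables copying for the whole class:
class BuilderLike : ue2::noncopyable {};

int main() {
    BuilderLike a;
    // BuilderLike b = a; // error: copy constructor is deleted
    (void)a;
    return 0;
}
```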
ue2::unordered_map new2old; @@ -285,20 +264,26 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, new2old.emplace(m.second, m.first); } - // Remove shells from undirected graph and renumber so we have dense - // vertex indices. - removeVertices(head_shell, ug, old2new, new2old); - removeVertices(tail_shell, ug, old2new, new2old); - renumberVertices(ug); + // Filter shell vertices from undirected graph. + unordered_set shell_undir_vertices; + for (auto v : head_shell) { + shell_undir_vertices.insert(old2new.at(v)); + } + for (auto v : tail_shell) { + shell_undir_vertices.insert(old2new.at(v)); + } + auto filtered_ug = boost::make_filtered_graph( + ug, boost::keep_all(), make_bad_vertex_filter(&shell_undir_vertices)); + // Actually run the connected components algorithm. map split_components; const u32 num = connected_components( - ug, boost::make_assoc_property_map(split_components)); + filtered_ug, boost::make_assoc_property_map(split_components)); assert(num > 0); if (num == 1 && shell_edges.empty()) { DEBUG_PRINTF("single component\n"); - comps.push_back(cloneHolder(g)); + comps.push_back(std::move(g)); return; } @@ -313,7 +298,7 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, assert(contains(new2old, uv)); NFAVertex v = new2old.at(uv); verts[c].push_back(v); - DEBUG_PRINTF("vertex %zu is in comp %u\n", g[v].index, c); + DEBUG_PRINTF("vertex %zu is in comp %u\n", (*g)[v].index, c); } ue2::unordered_map v_map; // temp map for fillHolder @@ -328,12 +313,12 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, auto gc = ue2::make_unique(); v_map.clear(); - fillHolder(gc.get(), g, vv, &v_map); + fillHolder(gc.get(), *g, vv, &v_map); // Remove shell edges, which will get their own component. for (const auto &e : shell_edges) { - auto cu = v_map.at(source(e, g)); - auto cv = v_map.at(target(e, g)); + auto cu = v_map.at(source(e, *g)); + auto cv = v_map.at(target(e, *g)); assert(edge(cu, cv, *gc).second); remove_edge(cu, cv, *gc); } @@ -352,7 +337,7 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, auto gc = ue2::make_unique(); v_map.clear(); - fillHolder(gc.get(), g, vv, &v_map); + fillHolder(gc.get(), *g, vv, &v_map); pruneUseless(*gc); DEBUG_PRINTF("shell edge component %zu has %zu vertices\n", @@ -374,33 +359,39 @@ void splitIntoComponents(const NGHolder &g, deque> &comps, })); } -deque> calcComponents(const NGHolder &g) { +deque> calcComponents(unique_ptr g, + const Grey &grey) { deque> comps; // For trivial cases, we needn't bother running the full // connected_components algorithm. 
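Rather than physically deleting the shell vertices and renumbering, the new code runs `connected_components` over a `boost::filtered_graph` view that simply hides them. A self-contained example of that approach (the `NotInSet` predicate is illustrative, standing in for `make_bad_vertex_filter()`):

```cpp
#include <iostream>
#include <map>
#include <unordered_set>

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/connected_components.hpp>
#include <boost/graph/filtered_graph.hpp>
#include <boost/property_map/property_map.hpp>

using UGraph = boost::adjacency_list<boost::vecS, boost::vecS,
                                     boost::undirectedS>;
using UVertex = boost::graph_traits<UGraph>::vertex_descriptor;

// Vertex predicate that hides a set of "shell" vertices from the view.
struct NotInSet {
    const std::unordered_set<UVertex> *bad;
    bool operator()(UVertex v) const { return bad->find(v) == bad->end(); }
};

int main() {
    UGraph g(5);
    boost::add_edge(0, 1, g);
    boost::add_edge(2, 3, g);
    boost::add_edge(3, 4, g);

    std::unordered_set<UVertex> shell{2}; // pretend vertex 2 is "shell"
    auto fg = boost::make_filtered_graph(g, boost::keep_all(),
                                         NotInSet{&shell});

    std::map<UVertex, int> comp;
    int num = boost::connected_components(
        fg, boost::make_assoc_property_map(comp));
    std::cout << num << " components\n"; // 2: {0,1} and {3,4}; 2 is hidden
    return 0;
}
```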
- if (isAlternationOfClasses(g)) { - comps.push_back(cloneHolder(g)); + if (!grey.calcComponents || isAlternationOfClasses(*g)) { + comps.push_back(std::move(g)); return comps; } bool shell_comp = false; - splitIntoComponents(g, comps, MAX_HEAD_SHELL_DEPTH, MAX_TAIL_SHELL_DEPTH, - &shell_comp); + splitIntoComponents(std::move(g), comps, depth(MAX_HEAD_SHELL_DEPTH), + depth(MAX_TAIL_SHELL_DEPTH), &shell_comp); if (shell_comp) { DEBUG_PRINTF("re-running on shell comp\n"); assert(!comps.empty()); - auto sc = move(comps.back()); + auto sc = std::move(comps.back()); comps.pop_back(); - splitIntoComponents(*sc, comps, 0, 0, &shell_comp); + splitIntoComponents(std::move(sc), comps, depth(0), depth(0), + &shell_comp); } DEBUG_PRINTF("finished; split into %zu components\n", comps.size()); return comps; } -void recalcComponents(deque> &comps) { +void recalcComponents(deque> &comps, const Grey &grey) { + if (!grey.calcComponents) { + return; + } + deque> out; for (auto &gc : comps) { @@ -409,14 +400,13 @@ void recalcComponents(deque> &comps) { } if (isAlternationOfClasses(*gc)) { - out.push_back(move(gc)); + out.push_back(std::move(gc)); continue; } - auto gc_comps = calcComponents(*gc); - for (auto &elem : gc_comps) { - out.push_back(move(elem)); - } + auto gc_comps = calcComponents(std::move(gc), grey); + out.insert(end(out), std::make_move_iterator(begin(gc_comps)), + std::make_move_iterator(end(gc_comps))); } // Replace comps with our recalculated list. diff --git a/src/nfagraph/ng_calc_components.h b/src/nfagraph/ng_calc_components.h index e68c81fcc..1bcdc5f81 100644 --- a/src/nfagraph/ng_calc_components.h +++ b/src/nfagraph/ng_calc_components.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,12 +39,15 @@ namespace ue2 { class NGHolder; +struct Grey; bool isAlternationOfClasses(const NGHolder &g); -std::deque> calcComponents(const NGHolder &g); +std::deque> +calcComponents(std::unique_ptr g, const Grey &grey); -void recalcComponents(std::deque> &comps); +void recalcComponents(std::deque> &comps, + const Grey &grey); } // namespace ue2 diff --git a/src/nfagraph/ng_depth.cpp b/src/nfagraph/ng_depth.cpp index 63e0e46b7..67a6b27b4 100644 --- a/src/nfagraph/ng_depth.cpp +++ b/src/nfagraph/ng_depth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief NFA graph vertex depth calculations. 
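`recalcComponents()` now splices the recalculated components into the output with move iterators instead of a hand-written loop. The idiom in isolation:

```cpp
#include <deque>
#include <iostream>
#include <iterator>
#include <memory>
#include <string>

int main() {
    std::deque<std::unique_ptr<std::string>> src, out;
    src.push_back(std::make_unique<std::string>("a"));
    src.push_back(std::make_unique<std::string>("b"));

    // Move-append in one call, as recalcComponents() now does with gc_comps.
    out.insert(end(out), std::make_move_iterator(begin(src)),
               std::make_move_iterator(end(src)));

    std::cout << out.size() << " elements moved\n"; // 2
    return 0;
}
```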
*/ #include "ng_depth.h" @@ -123,34 +124,35 @@ struct StartFilter { } // namespace -template +template static -void findLoopReachable(const GraphT &g, - const typename GraphT::vertex_descriptor srcVertex, - vector &deadNodes) { - typedef typename GraphT::edge_descriptor EdgeT; - typedef typename GraphT::vertex_descriptor VertexT; - typedef set EdgeSet; +vector findLoopReachable(const Graph &g, + const typename Graph::vertex_descriptor src) { + vector deadNodes(num_vertices(g)); + + using Edge = typename Graph::edge_descriptor; + using Vertex = typename Graph::vertex_descriptor; + using EdgeSet = set; EdgeSet deadEdges; BackEdges be(deadEdges); - depth_first_search(g, visitor(be).root_vertex(srcVertex)); + depth_first_search(g, visitor(be).root_vertex(src)); auto af = make_bad_edge_filter(&deadEdges); auto acyclic_g = make_filtered_graph(g, af); - vector topoOrder; /* actually reverse topological order */ + vector topoOrder; /* actually reverse topological order */ topoOrder.reserve(deadNodes.size()); topological_sort(acyclic_g, back_inserter(topoOrder)); for (const auto &e : deadEdges) { - u32 srcIdx = g[source(e, g)].index; + size_t srcIdx = g[source(e, g)].index; if (srcIdx != NODE_START_DOTSTAR) { deadNodes[srcIdx] = true; } } - for (VertexT v : reverse(topoOrder)) { + for (auto v : reverse(topoOrder)) { for (const auto &e : in_edges_range(v, g)) { if (deadNodes[g[source(e, g)].index]) { deadNodes[g[v].index] = true; @@ -158,6 +160,8 @@ void findLoopReachable(const GraphT &g, } } } + + return deadNodes; } template @@ -269,12 +273,11 @@ void calcAndStoreDepth(const Graph &g, } } -void calcDepths(const NGHolder &g, std::vector &depths) { +vector calcDepths(const NGHolder &g) { assert(hasCorrectlyNumberedVertices(g)); const size_t numVertices = num_vertices(g); - depths.clear(); - depths.resize(numVertices); + vector depths(numVertices); vector dMin; vector dMax; @@ -282,8 +285,7 @@ void calcDepths(const NGHolder &g, std::vector &depths) { * create a filtered graph for max depth calculations: all nodes/edges * reachable from a loop need to be removed */ - vector deadNodes(numVertices); - findLoopReachable(g, g.start, deadNodes); + auto deadNodes = findLoopReachable(g, g.start); DEBUG_PRINTF("doing start\n"); calcAndStoreDepth(g, g.start, deadNodes, dMin, dMax, depths, @@ -291,14 +293,15 @@ void calcDepths(const NGHolder &g, std::vector &depths) { DEBUG_PRINTF("doing startds\n"); calcAndStoreDepth(g, g.startDs, deadNodes, dMin, dMax, depths, &NFAVertexDepth::fromStartDotStar); + + return depths; } -void calcDepths(const NGHolder &g, std::vector &depths) { +vector calcRevDepths(const NGHolder &g) { assert(hasCorrectlyNumberedVertices(g)); const size_t numVertices = num_vertices(g); - depths.clear(); - depths.resize(numVertices); + vector depths(numVertices); vector dMin; vector dMax; @@ -312,8 +315,7 @@ void calcDepths(const NGHolder &g, std::vector &depths) { * create a filtered graph for max depth calculations: all nodes/edges * reachable from a loop need to be removed */ - vector deadNodes(numVertices); - findLoopReachable(rg, g.acceptEod, deadNodes); + auto deadNodes = findLoopReachable(rg, g.acceptEod); DEBUG_PRINTF("doing accept\n"); calcAndStoreDepth( @@ -324,14 +326,15 @@ void calcDepths(const NGHolder &g, std::vector &depths) { calcAndStoreDepth( rg, g.acceptEod, deadNodes, dMin, dMax, depths, &NFAVertexRevDepth::toAcceptEod); + + return depths; } -void calcDepths(const NGHolder &g, vector &depths) { +vector calcBidiDepths(const NGHolder &g) { 
assert(hasCorrectlyNumberedVertices(g)); const size_t numVertices = num_vertices(g); - depths.clear(); - depths.resize(numVertices); + vector depths(numVertices); vector dMin; vector dMax; @@ -339,8 +342,7 @@ void calcDepths(const NGHolder &g, vector &depths) { * create a filtered graph for max depth calculations: all nodes/edges * reachable from a loop need to be removed */ - vector deadNodes(numVertices); - findLoopReachable(g, g.start, deadNodes); + auto deadNodes = findLoopReachable(g, g.start); DEBUG_PRINTF("doing start\n"); calcAndStoreDepth( @@ -354,8 +356,7 @@ void calcDepths(const NGHolder &g, vector &depths) { /* Now go backwards */ typedef reverse_graph RevNFAGraph; const RevNFAGraph rg(g); - deadNodes.assign(numVertices, false); - findLoopReachable(rg, g.acceptEod, deadNodes); + deadNodes = findLoopReachable(rg, g.acceptEod); DEBUG_PRINTF("doing accept\n"); calcAndStoreDepth( @@ -366,26 +367,27 @@ void calcDepths(const NGHolder &g, vector &depths) { calcAndStoreDepth( rg, g.acceptEod, deadNodes, dMin, dMax, depths, &NFAVertexBidiDepth::toAcceptEod); + + return depths; } -void calcDepthsFrom(const NGHolder &g, const NFAVertex src, - vector &depths) { +vector calcDepthsFrom(const NGHolder &g, const NFAVertex src) { assert(hasCorrectlyNumberedVertices(g)); const size_t numVertices = num_vertices(g); - vector deadNodes(numVertices); - findLoopReachable(g, g.start, deadNodes); + auto deadNodes = findLoopReachable(g, g.start); vector dMin, dMax; calcDepthFromSource(g, src, deadNodes, dMin, dMax); - depths.clear(); - depths.resize(numVertices); + vector depths(numVertices); for (auto v : vertices_range(g)) { - u32 idx = g[v].index; + auto idx = g[v].index; depths.at(idx) = getDepths(idx, dMin, dMax); } + + return depths; } } // namespace ue2 diff --git a/src/nfagraph/ng_depth.h b/src/nfagraph/ng_depth.h index 16231ea1e..36cca87e8 100644 --- a/src/nfagraph/ng_depth.h +++ b/src/nfagraph/ng_depth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,23 +26,22 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief NFA graph vertex depth calculations. */ -#ifndef STRUCTURAL_ANALYSIS_H -#define STRUCTURAL_ANALYSIS_H +#ifndef NG_DEPTH_H +#define NG_DEPTH_H -#include "nfagraph/ng_holder.h" #include "ue2common.h" +#include "nfagraph/ng_holder.h" #include "util/depth.h" #include namespace ue2 { -class NGHolder; - /** * \brief Encapsulates min/max depths relative to the start and startDs * vertices. @@ -72,28 +71,29 @@ struct NFAVertexBidiDepth { }; /** - * \brief Calculate depths from start and startDs. - * Fills the vector \p depths (indexed by \p vertex_index). + * \brief Calculate depths from start and startDs. Returns them in a vector, + * indexed by vertex index. */ -void calcDepths(const NGHolder &g, std::vector &depths); +std::vector calcDepths(const NGHolder &g); /** - * \brief Calculate depths to accept and acceptEod. - * Fills the vector \p depths (indexed by \p vertex_index). + * \brief Calculate depths to accept and acceptEod. Returns them in a vector, + * indexed by vertex index. */ -void calcDepths(const NGHolder &g, std::vector &depths); +std::vector calcRevDepths(const NGHolder &g); /** - * \brief Calculate depths to/from all special vertices. - * Fills the vector \p depths (indexed by \p vertex_index). 
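`calcRevDepths()` and the reverse half of `calcBidiDepths()` run forward algorithms over a `boost::reverse_graph` adaptor rather than materialising a reversed copy. In isolation (illustrative graph type; the adaptor needs a bidirectional graph):

```cpp
#include <iostream>

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/reverse_graph.hpp>

using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::bidirectionalS>;

int main() {
    Graph g(3);
    boost::add_edge(0, 1, g);
    boost::add_edge(1, 2, g);

    // Zero-copy view with all edges flipped.
    boost::reverse_graph<Graph> rg(g);

    std::cout << "out_degree(2, g)  = " << boost::out_degree(2, g) << '\n';  // 0
    std::cout << "out_degree(2, rg) = " << boost::out_degree(2, rg) << '\n'; // 1
    return 0;
}
```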
+ * \brief Calculate depths to/from all special vertices. Returns them in a + * vector, indexed by vertex index. */ -void calcDepths(const NGHolder &g, std::vector &depths); +std::vector calcBidiDepths(const NGHolder &g); -/** Calculate the (min, max) depths from the given \p src to every vertex in - * the graph and return them in a vector, indexed by \p vertex_index. */ -void calcDepthsFrom(const NGHolder &g, const NFAVertex src, - std::vector &depths); +/** + * \brief Calculate the (min, max) depths from the given \p src to every vertex + * in the graph and return them in a vector, indexed by \p vertex_index. + */ +std::vector calcDepthsFrom(const NGHolder &g, const NFAVertex src); } // namespace ue2 -#endif +#endif // NG_DEPTH_H diff --git a/src/nfagraph/ng_dominators.cpp b/src/nfagraph/ng_dominators.cpp index d01af9947..50536b760 100644 --- a/src/nfagraph/ng_dominators.cpp +++ b/src/nfagraph/ng_dominators.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -62,8 +62,8 @@ unordered_map calcDominators(const Graph &g, vector vertices_by_dfnum(num_verts, Graph::null_vertex()); // Output map. - unordered_map doms; - auto dom_map = make_assoc_property_map(doms); + vector doms(num_verts, Graph::null_vertex()); + auto dom_map = make_iterator_property_map(doms.begin(), index_map); boost_ue2::lengauer_tarjan_dominator_tree(g, source, index_map, dfnum_map, parent_map, vertices_by_dfnum, @@ -71,10 +71,12 @@ unordered_map calcDominators(const Graph &g, /* Translate back to an NFAVertex map */ unordered_map doms2; - for (const auto &e : doms) { - NFAVertex f(e.first); - NFAVertex s(e.second); - doms2[f] = s; + doms2.reserve(num_verts); + for (auto v : vertices_range(g)) { + auto dom_of_v = doms[g[v].index]; + if (dom_of_v) { + doms2.emplace(v, dom_of_v); + } } return doms2; } diff --git a/src/nfagraph/ng_dump.cpp b/src/nfagraph/ng_dump.cpp index fc840f251..094d24015 100644 --- a/src/nfagraph/ng_dump.cpp +++ b/src/nfagraph/ng_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,24 +35,25 @@ #include "config.h" -#include "ng_dump.h" +#include "nfagraph/ng_dump.h" -#include "hwlm/hwlm_build.h" -#include "ng.h" -#include "ng_util.h" -#include "parser/position.h" +#include "hs_compile.h" /* for HS_MODE_* flags */ #include "ue2common.h" +#include "compiler/compiler.h" +#include "hwlm/hwlm_build.h" #include "nfa/accel.h" #include "nfa/nfa_internal.h" // for MO_INVALID_IDX -#include "smallwrite/smallwrite_dump.h" +#include "nfagraph/ng.h" +#include "nfagraph/ng_util.h" +#include "parser/position.h" #include "rose/rose_build.h" #include "rose/rose_internal.h" +#include "smallwrite/smallwrite_dump.h" #include "util/bitutils.h" #include "util/dump_charclass.h" #include "util/report.h" #include "util/report_manager.h" #include "util/ue2string.h" -#include "hs_compile.h" /* for HS_MODE_* flags */ #include #include @@ -287,13 +288,13 @@ void dumpGraphImpl(const char *name, const GraphT &g, // manual instantiation of templated dumpGraph above. 
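The `ng_dominators.cpp` hunk above swaps an `unordered_map`-backed property map for a vector indexed by `vertex_index`, a common BGL optimisation when vertex indices are dense. The idiom, shown here with `connected_components` for brevity rather than the dominator tree:

```cpp
#include <iostream>
#include <vector>

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/connected_components.hpp>
#include <boost/property_map/property_map.hpp>

using Graph = boost::adjacency_list<boost::vecS, boost::vecS,
                                    boost::undirectedS>;

int main() {
    Graph g(4);
    boost::add_edge(0, 1, g);
    boost::add_edge(2, 3, g);

    // Vector-backed property map keyed by vertex_index: O(1) access with
    // no hashing, which is the point of the ng_dominators.cpp change.
    std::vector<int> comp(boost::num_vertices(g));
    auto pmap = boost::make_iterator_property_map(
        comp.begin(), boost::get(boost::vertex_index, g));

    int num = boost::connected_components(g, pmap);
    std::cout << num << " components\n"; // 2
    return 0;
}
```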
template void dumpGraphImpl(const char *, const NGHolder &); -void dumpDotWrapperImpl(const NGWrapper &nw, const char *name, - const Grey &grey) { +void dumpDotWrapperImpl(const NGHolder &g, const ExpressionInfo &expr, + const char *name, const Grey &grey) { if (grey.dumpFlags & Grey::DUMP_INT_GRAPH) { stringstream ss; - ss << grey.dumpPath << "Expr_" << nw.expressionIndex << "_" << name << ".dot"; + ss << grey.dumpPath << "Expr_" << expr.index << "_" << name << ".dot"; DEBUG_PRINTF("dumping dot graph to '%s'\n", ss.str().c_str()); - dumpGraphImpl(ss.str().c_str(), nw); + dumpGraphImpl(ss.str().c_str(), g); } } diff --git a/src/nfagraph/ng_dump.h b/src/nfagraph/ng_dump.h index b20d9f1be..077f07cef 100644 --- a/src/nfagraph/ng_dump.h +++ b/src/nfagraph/ng_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,7 +48,7 @@ namespace ue2 { class NGHolder; class NG; -class NGWrapper; +class ExpressionInfo; class ReportManager; // Implementations for stubs below -- all have the suffix "Impl". @@ -61,7 +61,8 @@ void dumpGraphImpl(const char *name, const GraphT &g); template void dumpGraphImpl(const char *name, const GraphT &g, const ReportManager &rm); -void dumpDotWrapperImpl(const NGWrapper &w, const char *name, const Grey &grey); +void dumpDotWrapperImpl(const NGHolder &g, const ExpressionInfo &expr, + const char *name, const Grey &grey); void dumpComponentImpl(const NGHolder &g, const char *name, u32 expr, u32 comp, const Grey &grey); @@ -88,10 +89,10 @@ static inline void dumpGraph(UNUSED const char *name, UNUSED const GraphT &g) { // Stubs which call through to dump code if compiled in. UNUSED static inline -void dumpDotWrapper(UNUSED const NGWrapper &w, UNUSED const char *name, - UNUSED const Grey &grey) { +void dumpDotWrapper(UNUSED const NGHolder &g, UNUSED const ExpressionInfo &expr, + UNUSED const char *name, UNUSED const Grey &grey) { #ifdef DUMP_SUPPORT - dumpDotWrapperImpl(w, name, grey); + dumpDotWrapperImpl(g, expr, name, grey); #endif } diff --git a/src/nfagraph/ng_equivalence.cpp b/src/nfagraph/ng_equivalence.cpp index 32a392a6d..438e5ea8a 100644 --- a/src/nfagraph/ng_equivalence.cpp +++ b/src/nfagraph/ng_equivalence.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -63,10 +63,10 @@ class VertexInfo; struct VertexInfoPtrCmp { // for flat_set bool operator()(const VertexInfo *a, const VertexInfo *b) const; - // for unordered_set - size_t operator()(const VertexInfo *a) const; }; +using VertexInfoSet = flat_set; + /** Precalculated (and maintained) information about a vertex. 
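The dump path built above is of the form `<dumpPath>Expr_<index>_<name>.dot`. A minimal version of that flow with plain BGL (illustrative path and graph, not Hyperscan's dump machinery):

```cpp
#include <fstream>
#include <sstream>

#include <boost/graph/adjacency_list.hpp>
#include <boost/graph/graphviz.hpp>

// Minimal sketch of the dump flow: build a "Expr_<index>_<name>.dot" path
// and write the graph in Graphviz dot format.
int main() {
    boost::adjacency_list<> g(3);
    boost::add_edge(0, 1, g);
    boost::add_edge(1, 2, g);

    std::ostringstream ss;
    ss << "/tmp/" << "Expr_" << 0 << "_" << "final" << ".dot";

    std::ofstream os(ss.str());
    boost::write_graphviz(os, g); // digraph with default numeric labels
    return 0;
}
```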
*/ class VertexInfo { public: @@ -74,8 +74,8 @@ class VertexInfo { : v(v_in), vert_index(g[v].index), cr(g[v].char_reach), equivalence_class(~0), vertex_flags(g[v].assert_flags) {} - flat_set pred; //!< predecessors of this vertex - flat_set succ; //!< successors of this vertex + VertexInfoSet pred; //!< predecessors of this vertex + VertexInfoSet succ; //!< successors of this vertex NFAVertex v; size_t vert_index; CharReach cr; @@ -86,21 +86,11 @@ class VertexInfo { unsigned vertex_flags; }; -} - -typedef ue2::unordered_set VertexInfoSet; - // compare two vertex info pointers on their vertex index bool VertexInfoPtrCmp::operator()(const VertexInfo *a, const VertexInfo *b) const { return a->vert_index < b->vert_index; } -// provide a "hash" for vertex info pointer by returning its vertex index -size_t VertexInfoPtrCmp::operator()(const VertexInfo *a) const { - return a->vert_index; -} - -namespace { // to avoid traversing infomap each time we need to check the class during // partitioning, we will cache the information pertaining to a particular class @@ -133,7 +123,7 @@ class ClassInfo { friend size_t hash_value(const ClassInfo &c) { size_t val = 0; - boost::hash_combine(val, boost::hash_range(begin(c.rs), end(c.rs))); + boost::hash_combine(val, c.rs); boost::hash_combine(val, c.vertex_flags); boost::hash_combine(val, c.cr); boost::hash_combine(val, c.adjacent_cr); @@ -342,9 +332,9 @@ vector partitionGraph(vector> &infos, vector rdepths; if (eq == LEFT_EQUIVALENCE) { - calcDepths(g, depths); + depths = calcDepths(g); } else { - calcDepths(g, rdepths); + rdepths = calcRevDepths(g); } // partition the graph based on CharReach diff --git a/src/nfagraph/ng_expr_info.cpp b/src/nfagraph/ng_expr_info.cpp index b43c7fd1f..5f5bbea74 100644 --- a/src/nfagraph/ng_expr_info.cpp +++ b/src/nfagraph/ng_expr_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,8 +27,8 @@ */ /** \file - * \brief Code for discovering properties of an NGWrapper used by - * hs_expression_info. + * \brief Code for discovering properties of an NFA graph used by + * hs_expression_info(). 
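`VertexInfoPtrCmp` orders `VertexInfo` pointers by their vertex index rather than by address, which keeps `flat_set` iteration deterministic between runs (addresses vary; indices do not). The pattern in miniature:

```cpp
#include <iostream>
#include <set>
#include <vector>

// Illustrative mirror of VertexInfoPtrCmp: ordering pointers by a stable
// payload index (not by address) keeps iteration order deterministic,
// which matters for reproducible compiles.
struct Node {
    size_t index;
};

struct NodePtrCmp {
    bool operator()(const Node *a, const Node *b) const {
        return a->index < b->index;
    }
};

int main() {
    std::vector<Node> storage{{2}, {0}, {1}};
    std::set<const Node *, NodePtrCmp> s;
    for (const auto &n : storage) {
        s.insert(&n);
    }
    for (const Node *n : s) {
        std::cout << n->index << ' '; // always: 0 1 2
    }
    std::cout << '\n';
    return 0;
}
```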
*/ #include "ng_expr_info.h" @@ -37,10 +37,14 @@ #include "ng_asserts.h" #include "ng_depth.h" #include "ng_edge_redundancy.h" +#include "ng_extparam.h" +#include "ng_fuzzy.h" #include "ng_holder.h" +#include "ng_prune.h" #include "ng_reports.h" #include "ng_util.h" #include "ue2common.h" +#include "compiler/expression_info.h" #include "parser/position.h" // for POS flags #include "util/boundary_reports.h" #include "util/compile_context.h" @@ -58,42 +62,42 @@ namespace ue2 { /* get rid of leading \b and multiline ^ vertices */ static -void removeLeadingVirtualVerticesFromRoot(NGWrapper &w, NFAVertex root) { +void removeLeadingVirtualVerticesFromRoot(NGHolder &g, NFAVertex root) { vector victims; - for (auto v : adjacent_vertices_range(root, w)) { - if (w[v].assert_flags & POS_FLAG_VIRTUAL_START) { + for (auto v : adjacent_vertices_range(root, g)) { + if (g[v].assert_flags & POS_FLAG_VIRTUAL_START) { DEBUG_PRINTF("(?m)^ vertex or leading \\[bB] vertex\n"); victims.push_back(v); } } for (auto u : victims) { - for (auto v : adjacent_vertices_range(u, w)) { - add_edge_if_not_present(root, v, w); + for (auto v : adjacent_vertices_range(u, g)) { + add_edge_if_not_present(root, v, g); } } - remove_vertices(victims, w); + remove_vertices(victims, g); } static -void checkVertex(const ReportManager &rm, const NGWrapper &w, NFAVertex v, +void checkVertex(const ReportManager &rm, const NGHolder &g, NFAVertex v, const vector &depths, DepthMinMax &info) { - if (is_any_accept(v, w)) { + if (is_any_accept(v, g)) { return; } - if (is_any_start(v, w)) { - info.min = 0; + if (is_any_start(v, g)) { + info.min = depth(0); info.max = max(info.max, depth(0)); return; } - u32 idx = w[v].index; + u32 idx = g[v].index; assert(idx < depths.size()); const DepthMinMax &d = depths.at(idx); - for (ReportID report_id : w[v].reports) { + for (ReportID report_id : g[v].reports) { const Report &report = rm.getReport(report_id); assert(report.type == EXTERNAL_CALLBACK); @@ -118,7 +122,7 @@ void checkVertex(const ReportManager &rm, const NGWrapper &w, NFAVertex v, rd.max = min(rd.max, max_offset); } - DEBUG_PRINTF("vertex %zu report %u: %s\n", w[v].index, report_id, + DEBUG_PRINTF("vertex %zu report %u: %s\n", g[v].index, report_id, rd.str().c_str()); info = unionDepthMinMax(info, rd); @@ -126,8 +130,8 @@ void checkVertex(const ReportManager &rm, const NGWrapper &w, NFAVertex v, } static -bool hasOffsetAdjust(const ReportManager &rm, const NGWrapper &w) { - for (const auto &report_id : all_reports(w)) { +bool hasOffsetAdjust(const ReportManager &rm, const NGHolder &g) { + for (const auto &report_id : all_reports(g)) { if (rm.getReport(report_id).offsetAdjust) { return true; } @@ -135,28 +139,61 @@ bool hasOffsetAdjust(const ReportManager &rm, const NGWrapper &w) { return false; } -void fillExpressionInfo(ReportManager &rm, NGWrapper &w, hs_expr_info *info) { +void fillExpressionInfo(ReportManager &rm, const CompileContext &cc, + NGHolder &g, ExpressionInfo &expr, + hs_expr_info *info) { assert(info); + // remove reports that aren't on vertices connected to accept. + clearReports(g); + + assert(allMatchStatesHaveReports(g)); + + /* + * Note: the following set of analysis passes / transformations should + * match those in NG::addGraph(). 
+ */ + /* ensure utf8 starts at cp boundary */ - ensureCodePointStart(rm, w); - resolveAsserts(rm, w); - optimiseVirtualStarts(w); + ensureCodePointStart(rm, g, expr); + + if (can_never_match(g)) { + throw CompileError(expr.index, "Pattern can never match."); + } + + // validate graph's suitability for fuzzing + validate_fuzzy_compile(g, expr.edit_distance, expr.utf8, cc.grey); + + resolveAsserts(rm, g, expr); + assert(allMatchStatesHaveReports(g)); + + // fuzz graph - this must happen before any transformations are made + make_fuzzy(g, expr.edit_distance, cc.grey); + + pruneUseless(g); + pruneEmptyVertices(g); + + if (can_never_match(g)) { + throw CompileError(expr.index, "Pattern can never match."); + } + + optimiseVirtualStarts(g); + + propagateExtendedParams(g, expr, rm); - removeLeadingVirtualVerticesFromRoot(w, w.start); - removeLeadingVirtualVerticesFromRoot(w, w.startDs); + removeLeadingVirtualVerticesFromRoot(g, g.start); + removeLeadingVirtualVerticesFromRoot(g, g.startDs); - vector depths; - calcDepthsFrom(w, w.start, depths); + auto depths = calcDepthsFrom(g, g.start); DepthMinMax d; - for (auto u : inv_adjacent_vertices_range(w.accept, w)) { - checkVertex(rm, w, u, depths, d); + for (auto u : inv_adjacent_vertices_range(g.accept, g)) { + checkVertex(rm, g, u, depths, d); } - for (auto u : inv_adjacent_vertices_range(w.acceptEod, w)) { - checkVertex(rm, w, u, depths, d); + for (auto u : inv_adjacent_vertices_range(g.acceptEod, g)) { + checkVertex(rm, g, u, depths, d); } if (d.max.is_finite()) { @@ -170,9 +207,9 @@ void fillExpressionInfo(ReportManager &rm, NGWrapper &w, hs_expr_info *info) { info->min_width = UINT_MAX; } - info->unordered_matches = hasOffsetAdjust(rm, w); - info->matches_at_eod = can_match_at_eod(w); - info->matches_only_at_eod = can_only_match_at_eod(w); + info->unordered_matches = hasOffsetAdjust(rm, g); + info->matches_at_eod = can_match_at_eod(g); + info->matches_only_at_eod = can_only_match_at_eod(g); } } // namespace ue2 diff --git a/src/nfagraph/ng_expr_info.h b/src/nfagraph/ng_expr_info.h index dcc5a419f..f9bd68093 100644 --- a/src/nfagraph/ng_expr_info.h +++ b/src/nfagraph/ng_expr_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,7 +27,7 @@ */ /** \file - * \brief Code for discovering properties of an NGWrapper used by + * \brief Code for discovering properties of an expression used by * hs_expression_info. */ @@ -36,14 +36,15 @@ struct hs_expr_info; -#include "ue2common.h" - namespace ue2 { -class NGWrapper; +class ExpressionInfo; +class NGHolder; class ReportManager; +struct CompileContext; -void fillExpressionInfo(ReportManager &rm, NGWrapper &w, hs_expr_info *info); +void fillExpressionInfo(ReportManager &rm, const CompileContext &cc, + NGHolder &g, ExpressionInfo &expr, hs_expr_info *info); } // namespace ue2 diff --git a/src/nfagraph/ng_extparam.cpp b/src/nfagraph/ng_extparam.cpp index a504ac50a..bc7f81efd 100644 --- a/src/nfagraph/ng_extparam.cpp +++ b/src/nfagraph/ng_extparam.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,28 +26,32 @@ * POSSIBILITY OF SUCH DAMAGE. 
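For context, the results computed by `fillExpressionInfo()` surface through the public `hs_expression_info()` API. A usage sketch, assuming the default allocator is in use (in which case the returned structure is released with `free()`):

```cpp
#include <cstdio>
#include <cstdlib>

#include <hs/hs.h>

int main() {
    hs_expr_info_t *info = nullptr;
    hs_compile_error_t *err = nullptr;

    if (hs_expression_info("foo.{10,20}bar", 0, &info, &err) != HS_SUCCESS) {
        std::fprintf(stderr, "error: %s\n", err->message);
        hs_free_compile_error(err);
        return 1;
    }

    std::printf("min_width=%u max_width=%u matches_only_at_eod=%d\n",
                info->min_width, info->max_width,
                (int)info->matches_only_at_eod);
    std::free(info); // default allocator assumed
    return 0;
}
```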
*/ -/** \file +/** + * \file * \brief Propagate extended parameters to vertex reports and reduce graph if * possible. * * This code handles the propagation of the extension parameters specified by - * the user with the hs_expr_ext structure into the reports on the graph's + * the user with the \ref hs_expr_ext structure into the reports on the graph's * vertices. * * There are also some analyses that prune edges that cannot contribute to a * match given these constraints, or transform the graph in order to make a * constraint implicit. */ + +#include "ng_extparam.h" + #include "ng.h" #include "ng_depth.h" #include "ng_dump.h" -#include "ng_extparam.h" #include "ng_prune.h" #include "ng_reports.h" #include "ng_som_util.h" #include "ng_width.h" #include "ng_util.h" #include "ue2common.h" +#include "compiler/compiler.h" #include "parser/position.h" #include "util/compile_context.h" #include "util/compile_error.h" @@ -65,8 +69,28 @@ namespace ue2 { static const u32 MAX_MAXOFFSET_TO_ANCHOR = 2000; static const u32 MAX_MINLENGTH_TO_CONVERT = 2000; -/** \brief Find the (min, max) offset adjustment for the reports on a given - * vertex. */ +/** True if all the given reports have the same extparam bounds. */ +template +bool hasSameBounds(const Container &reports, const ReportManager &rm) { + assert(!reports.empty()); + + const auto &first = rm.getReport(*reports.begin()); + for (auto id : reports) { + const auto &report = rm.getReport(id); + if (report.minOffset != first.minOffset || + report.maxOffset != first.maxOffset || + report.minLength != first.minLength) { + return false; + } + } + + return true; +} + +/** + * \brief Find the (min, max) offset adjustment for the reports on a given + * vertex. + */ static pair getMinMaxOffsetAdjust(const ReportManager &rm, const NGHolder &g, NFAVertex v) { @@ -127,54 +151,76 @@ DepthMinMax findMatchLengths(const ReportManager &rm, const NGHolder &g) { return match_depths; } -/** \brief Replace the graph's reports with new reports that specify bounds. */ -static -void updateReportBounds(ReportManager &rm, NGWrapper &g, NFAVertex accept, - set &done) { +template +void replaceReports(NGHolder &g, NFAVertex accept, flat_set &seen, + Function func) { for (auto v : inv_adjacent_vertices_range(accept, g)) { - // Don't operate on g.accept itself. if (v == g.accept) { + // Don't operate on accept: the accept->acceptEod edge is stylised. assert(accept == g.acceptEod); + assert(g[v].reports.empty()); continue; } - // Don't operate on a vertex we've already done. - if (contains(done, v)) { - continue; + if (!seen.insert(v).second) { + continue; // We have already processed v. } - done.insert(v); - flat_set new_reports; auto &reports = g[v].reports; + if (reports.empty()) { + continue; + } + decltype(g[v].reports) new_reports; + for (auto id : g[v].reports) { + new_reports.insert(func(v, id)); + } + reports = std::move(new_reports); + } +} - for (auto id : reports) { - Report ir = rm.getReport(id); // make a copy - assert(!ir.hasBounds()); - - // Note that we need to cope with offset adjustment here. - - ir.minOffset = g.min_offset - ir.offsetAdjust; - if (g.max_offset == MAX_OFFSET) { - ir.maxOffset = MAX_OFFSET; - } else { - ir.maxOffset = g.max_offset - ir.offsetAdjust; - } - assert(ir.maxOffset >= ir.minOffset); +/** + * Generic function for replacing all the reports in the graph. + * + * Pass this a function that takes a vertex and a ReportID returns another + * ReportID (or the same one) to replace it with. 
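The new `hasSameBounds()` guard requires every report to agree on its extended-parameter bounds before any graph-wide transform based on those bounds is applied. Its shape, sketched with plain structs and `std::all_of`:

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Sketch of the hasSameBounds() idea: all reports must agree on their
// bounds before a bounds-driven transformation of the whole graph is safe.
struct Bounds {
    unsigned long long minOffset, maxOffset, minLength;
};

static bool hasSameBounds(const std::vector<Bounds> &reports) {
    assert(!reports.empty());
    const Bounds &first = reports.front();
    return std::all_of(reports.begin(), reports.end(),
                       [&](const Bounds &r) {
                           return r.minOffset == first.minOffset &&
                                  r.maxOffset == first.maxOffset &&
                                  r.minLength == first.minLength;
                       });
}

int main() {
    std::vector<Bounds> same{{0, 100, 10}, {0, 100, 10}};
    std::vector<Bounds> mixed{{0, 100, 10}, {5, 100, 10}};
    assert(hasSameBounds(same));
    assert(!hasSameBounds(mixed));
    return 0;
}
```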
+ */ +template +void replaceReports(NGHolder &g, Function func) { + flat_set seen; + replaceReports(g, g.accept, seen, func); + replaceReports(g, g.acceptEod, seen, func); +} - ir.minLength = g.min_length; - if (g.min_length && !g.som) { - ir.quashSom = true; - } +/** \brief Replace the graph's reports with new reports that specify bounds. */ +static +void updateReportBounds(ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr) { + DEBUG_PRINTF("updating report bounds\n"); + replaceReports(g, [&](NFAVertex, ReportID id) { + Report report = rm.getReport(id); // make a copy + assert(!report.hasBounds()); + + // Note that we need to cope with offset adjustment here. + + report.minOffset = expr.min_offset - report.offsetAdjust; + if (expr.max_offset == MAX_OFFSET) { + report.maxOffset = MAX_OFFSET; + } else { + report.maxOffset = expr.max_offset - report.offsetAdjust; + } + assert(report.maxOffset >= report.minOffset); - DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, " - "min_length=%llu\n", - id, ir.minOffset, ir.maxOffset, ir.minLength); - new_reports.insert(rm.getInternalId(ir)); + report.minLength = expr.min_length; + if (expr.min_length && !expr.som) { + report.quashSom = true; } - DEBUG_PRINTF("swapping reports on vertex %zu\n", g[v].index); - reports.swap(new_reports); - } + DEBUG_PRINTF("id %u -> min_offset=%llu, max_offset=%llu, " + "min_length=%llu\n", id, report.minOffset, + report.maxOffset, report.minLength); + + return rm.getInternalId(report); + }); } static @@ -187,31 +233,93 @@ bool hasVirtualStarts(const NGHolder &g) { return false; } -/** If the pattern is unanchored, has a max_offset and has not asked for SOM, - * we can use that knowledge to anchor it which will limit its lifespan. Note - * that we can't use this transformation if there's a min_length, as it's - * currently handled using "sly SOM". +/** Set the min_length param for all reports to zero. */ +static +void clearMinLengthParam(NGHolder &g, ReportManager &rm) { + DEBUG_PRINTF("clearing min length\n"); + replaceReports(g, [&rm](NFAVertex, ReportID id) { + const auto &report = rm.getReport(id); + if (report.minLength) { + Report new_report = report; + new_report.minLength = 0; + return rm.getInternalId(new_report); + } + return id; + }); +} + +/** + * Set the min_offset param to zero and the max_offset param to MAX_OFFSET for + * all reports. + */ +static +void clearOffsetParams(NGHolder &g, ReportManager &rm) { + DEBUG_PRINTF("clearing min and max offset\n"); + replaceReports(g, [&rm](NFAVertex, ReportID id) { + const auto &report = rm.getReport(id); + if (report.minLength) { + Report new_report = report; + new_report.minOffset = 0; + new_report.maxOffset = MAX_OFFSET; + return rm.getInternalId(new_report); + } + return id; + }); +} + +/** + * If the pattern is unanchored, has a max_offset and has not asked for SOM, we + * can use that knowledge to anchor it which will limit its lifespan. Note that + * we can't use this transformation if there's a min_length, as it's currently + * handled using "sly SOM". * * Note that it is possible to handle graphs that have a combination of * anchored and unanchored paths, but it's too tricky for the moment. 
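`replaceReports()` above rebuilds each vertex's report set by mapping every ID through a callback and inserting the (possibly unchanged) result. Stripped of the graph machinery, the pattern is:

```cpp
#include <cassert>
#include <functional>
#include <set>
#include <utility>

// Shape of the replaceReports() helper: rebuild a report set by mapping
// each ID through a callback (illustrative types, not Hyperscan's).
using ReportID = unsigned;

static void replaceAll(std::set<ReportID> &reports,
                       const std::function<ReportID(ReportID)> &func) {
    std::set<ReportID> out;
    for (ReportID id : reports) {
        out.insert(func(id));
    }
    reports = std::move(out);
}

int main() {
    std::set<ReportID> reports{1, 2, 3};
    replaceAll(reports, [](ReportID id) { return id * 10; });
    assert(reports == (std::set<ReportID>{10, 20, 30}));
    return 0;
}
```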
*/ static -bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, - const depth &maxWidth) { - assert(!g.som); - assert(g.max_offset != MAX_OFFSET); +bool anchorPatternWithBoundedRepeat(NGHolder &g, ReportManager &rm) { + if (!isFloating(g)) { + return false; + } + + const auto &reports = all_reports(g); + if (reports.empty()) { + return false; + } + + if (any_of_in(reports, [&](ReportID id) { + const auto &report = rm.getReport(id); + return report.maxOffset == MAX_OFFSET || report.minLength || + report.offsetAdjust; + })) { + return false; + } + + if (!hasSameBounds(reports, rm)) { + DEBUG_PRINTF("mixed report bounds\n"); + return false; + } + + const depth minWidth = findMinWidth(g); + const depth maxWidth = findMaxWidth(g); + assert(minWidth <= maxWidth); assert(maxWidth.is_reachable()); + const auto &first_report = rm.getReport(*reports.begin()); + const auto min_offset = first_report.minOffset; + const auto max_offset = first_report.maxOffset; + assert(max_offset < MAX_OFFSET); + DEBUG_PRINTF("widths=[%s,%s], min/max offsets=[%llu,%llu]\n", - minWidth.str().c_str(), maxWidth.str().c_str(), g.min_offset, - g.max_offset); + minWidth.str().c_str(), maxWidth.str().c_str(), + min_offset, max_offset); - if (g.max_offset > MAX_MAXOFFSET_TO_ANCHOR) { + if (max_offset > MAX_MAXOFFSET_TO_ANCHOR) { return false; } - if (g.max_offset < minWidth) { + if (max_offset < minWidth) { assert(0); return false; } @@ -232,10 +340,10 @@ bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, u32 min_bound, max_bound; if (maxWidth.is_infinite()) { min_bound = 0; - max_bound = g.max_offset - minWidth; + max_bound = max_offset - minWidth; } else { - min_bound = g.min_offset > maxWidth ? g.min_offset - maxWidth : 0; - max_bound = g.max_offset - minWidth; + min_bound = min_offset > maxWidth ? min_offset - maxWidth : 0; + max_bound = max_offset - minWidth; } DEBUG_PRINTF("prepending ^.{%u,%u}\n", min_bound, max_bound); @@ -288,6 +396,13 @@ bool anchorPatternWithBoundedRepeat(NGWrapper &g, const depth &minWidth, renumber_vertices(g); renumber_edges(g); + if (minWidth == maxWidth) { + // For a fixed width pattern, we can retire the offsets as + // they are implicit in the graph now. + clearOffsetParams(g, rm); + } + + clearReports(g); return true; } @@ -315,7 +430,7 @@ NFAVertex findSingleCyclic(const NGHolder &g) { } static -bool hasOffsetAdjust(const ReportManager &rm, NGWrapper &g, +bool hasOffsetAdjust(const ReportManager &rm, NGHolder &g, int *adjust) { const auto &reports = all_reports(g); if (reports.empty()) { @@ -336,16 +451,27 @@ bool hasOffsetAdjust(const ReportManager &rm, NGWrapper &g, return true; } -/** If the pattern has a min_length and is of "ratchet" form with one unbounded +/** + * If the pattern has a min_length and is of "ratchet" form with one unbounded * repeat, that repeat can become a bounded repeat. 
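The bound computation in `anchorPatternWithBoundedRepeat()` works out as follows for a concrete finite-width case, with plain integers standing in for `depth` values:

```cpp
#include <cassert>

// Worked example of the anchoring arithmetic: /foo/ (width 3) with
// min_offset=5 and max_offset=10 becomes /^.{2,7}foo/, since any match
// must end in the offset window [5,10].
int main() {
    const unsigned minWidth = 3, maxWidth = 3;      // width of /foo/
    const unsigned min_offset = 5, max_offset = 10; // extparam bounds

    unsigned min_bound = min_offset > maxWidth ? min_offset - maxWidth : 0;
    unsigned max_bound = max_offset - minWidth;

    assert(min_bound == 2);
    assert(max_bound == 7);
    // i.e. the pattern behaves like /^.{2,7}foo/ with the offsets retired.
    return 0;
}
```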
* * /foo.*bar/{min_length=100} --> /foo.{94,}bar/ */ static -bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { - assert(g.min_length); +bool transformMinLengthToRepeat(NGHolder &g, ReportManager &rm) { + const auto &reports = all_reports(g); + + if (reports.empty()) { + return false; + } - if (g.min_length > MAX_MINLENGTH_TO_CONVERT) { + if (!hasSameBounds(reports, rm)) { + DEBUG_PRINTF("mixed report bounds\n"); + return false; + } + + const auto &min_length = rm.getReport(*reports.begin()).minLength; + if (!min_length || min_length > MAX_MINLENGTH_TO_CONVERT) { return false; } @@ -375,7 +501,6 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { u32 width = 0; - // Walk from the start vertex to the cyclic state and ensure we have a // chain of vertices. while (v != cyclic) { @@ -437,10 +562,10 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { DEBUG_PRINTF("width=%u, vertex %zu is cyclic\n", width, g[cyclic].index); - if (width >= g.min_length) { + if (width >= min_length) { DEBUG_PRINTF("min_length=%llu is guaranteed, as width=%u\n", - g.min_length, width); - g.min_length = 0; + min_length, width); + clearMinLengthParam(g, rm); return true; } @@ -468,7 +593,7 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { const CharReach &cr = g[cyclic].char_reach; - for (u32 i = 0; i < g.min_length - width - 1; ++i) { + for (u32 i = 0; i < min_length - width - 1; ++i) { v = add_vertex(g); g[v].char_reach = cr; @@ -485,28 +610,27 @@ bool transformMinLengthToRepeat(const ReportManager &rm, NGWrapper &g) { renumber_vertices(g); renumber_edges(g); + clearMinLengthParam(g, rm); clearReports(g); - - g.min_length = 0; return true; } static -bool hasExtParams(const NGWrapper &g) { - if (g.min_length != 0) { +bool hasExtParams(const ExpressionInfo &expr) { + if (expr.min_length != 0) { return true; } - if (g.min_offset != 0) { + if (expr.min_offset != 0) { return true; } - if (g.max_offset != MAX_OFFSET) { + if (expr.max_offset != MAX_OFFSET) { return true; } return false; } -static -depth maxDistFromStart(const NFAVertexBidiDepth &d) { +template +depth maxDistFromStart(const VertexDepth &d) { if (!d.fromStartDotStar.max.is_unreachable()) { // A path from startDs, any path, implies we can match at any offset. 
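At the regex level, the transformation's arithmetic for the documented example is straightforward: the unbounded repeat must contribute whatever `min_length` bytes the fixed parts of the pattern do not.

```cpp
#include <cassert>
#include <string>

// Regex-level view of transformMinLengthToRepeat: for the "ratchet"
// pattern /foo.*bar/ with min_length=100, the repeat must supply at least
// min_length minus the fixed bytes, giving /foo.{94,}bar/.
int main() {
    const std::string prefix = "foo", suffix = "bar";
    const unsigned long long min_length = 100;

    unsigned long long fixed = prefix.size() + suffix.size(); // 6
    unsigned long long repeat_lo = min_length - fixed;        // 94

    assert(repeat_lo == 94);
    // Resulting pattern: /foo.{94,}bar/, and min_length can be cleared.
    return 0;
}
```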
return depth::infinity(); @@ -535,7 +659,7 @@ const depth& minDistToAccept(const NFAVertexBidiDepth &d) { } static -bool isEdgePrunable(const NGWrapper &g, +bool isEdgePrunable(const NGHolder &g, const Report &report, const vector &depths, const NFAEdge &e) { const NFAVertex u = source(e, g); @@ -564,29 +688,29 @@ bool isEdgePrunable(const NGWrapper &g, const NFAVertexBidiDepth &du = depths.at(u_idx); const NFAVertexBidiDepth &dv = depths.at(v_idx); - if (g.min_offset) { + if (report.minOffset) { depth max_offset = maxDistFromStart(du) + maxDistToAccept(dv); - if (max_offset.is_finite() && max_offset < g.min_offset) { + if (max_offset.is_finite() && max_offset < report.minOffset) { DEBUG_PRINTF("max_offset=%s too small\n", max_offset.str().c_str()); return true; } } - if (g.max_offset != MAX_OFFSET) { + if (report.maxOffset != MAX_OFFSET) { depth min_offset = minDistFromStart(du) + minDistToAccept(dv); assert(min_offset.is_finite()); - if (min_offset > g.max_offset) { + if (min_offset > report.maxOffset) { DEBUG_PRINTF("min_offset=%s too large\n", min_offset.str().c_str()); return true; } } - if (g.min_length && is_any_accept(v, g)) { + if (report.minLength && is_any_accept(v, g)) { // Simple take on min_length. If we're an edge to accept and our max // dist from start is too small, we can be pruned. const depth &width = du.fromStart.max; - if (width.is_finite() && width < g.min_length) { + if (width.is_finite() && width < report.minLength) { DEBUG_PRINTF("max width %s from start too small for min_length\n", width.str().c_str()); return true; @@ -597,14 +721,25 @@ bool isEdgePrunable(const NGWrapper &g, } static -void pruneExtUnreachable(NGWrapper &g) { - vector depths; - calcDepths(g, depths); +void pruneExtUnreachable(NGHolder &g, const ReportManager &rm) { + const auto &reports = all_reports(g); + if (reports.empty()) { + return; + } + + if (!hasSameBounds(reports, rm)) { + DEBUG_PRINTF("report bounds vary\n"); + return; + } + + const auto &report = rm.getReport(*reports.begin()); + + auto depths = calcBidiDepths(g); vector dead; for (const auto &e : edges_range(g)) { - if (isEdgePrunable(g, depths, e)) { + if (isEdgePrunable(g, report, depths, e)) { DEBUG_PRINTF("pruning\n"); dead.push_back(e); } @@ -616,32 +751,45 @@ void pruneExtUnreachable(NGWrapper &g) { remove_edges(dead, g); pruneUseless(g); + clearReports(g); } -/** Remove vacuous edges in graphs where the min_offset or min_length - * constraints dictate that they can never produce a match. */ +/** + * Remove vacuous edges in graphs where the min_offset or min_length + * constraints dictate that they can never produce a match. + */ static -void pruneVacuousEdges(NGWrapper &g) { - if (!g.min_length && !g.min_offset) { - return; - } - +void pruneVacuousEdges(NGHolder &g, const ReportManager &rm) { vector dead; + auto has_min_offset = [&](NFAVertex v) { + assert(!g[v].reports.empty()); // must be reporter + return all_of_in(g[v].reports, [&](ReportID id) { + return rm.getReport(id).minOffset > 0; + }); + }; + + auto has_min_length = [&](NFAVertex v) { + assert(!g[v].reports.empty()); // must be reporter + return all_of_in(g[v].reports, [&](ReportID id) { + return rm.getReport(id).minLength > 0; + }); + }; + for (const auto &e : edges_range(g)) { const NFAVertex u = source(e, g); const NFAVertex v = target(e, g); - // Special case: Crudely remove vacuous edges from start in graphs with a - // min_offset. 
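The offset tests in `isEdgePrunable()` reduce to simple interval arithmetic on an edge u -> v: if even the longest path through the edge cannot reach `min_offset`, or the shortest path already exceeds `max_offset`, the edge can never contribute to a match. With hypothetical finite depths:

```cpp
#include <cassert>

// Plain-integer rendering of isEdgePrunable()'s offset tests (all values
// hypothetical and finite for illustration).
int main() {
    const unsigned maxDistFromStart_u = 4, maxDistToAccept_v = 3;
    const unsigned minDistFromStart_u = 2, minDistToAccept_v = 1;

    const unsigned min_offset = 10, max_offset = 20;

    bool too_short = maxDistFromStart_u + maxDistToAccept_v < min_offset;
    bool too_long = minDistFromStart_u + minDistToAccept_v > max_offset;

    assert(too_short);  // 4 + 3 = 7 < 10: the edge is prunable
    assert(!too_long);  // 2 + 1 = 3 <= 20: this test alone would keep it
    return 0;
}
```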
- if (g.min_offset && u == g.start && is_any_accept(v, g)) { + // Special case: Crudely remove vacuous edges from start in graphs with + // a min_offset. + if (u == g.start && is_any_accept(v, g) && has_min_offset(u)) { DEBUG_PRINTF("vacuous edge in graph with min_offset!\n"); dead.push_back(e); continue; } // If a min_length is set, vacuous edges can be removed. - if (g.min_length && is_any_start(u, g) && is_any_accept(v, g)) { + if (is_any_start(u, g) && is_any_accept(v, g) && has_min_length(u)) { DEBUG_PRINTF("vacuous edge in graph with min_length!\n"); dead.push_back(e); continue; @@ -652,12 +800,14 @@ void pruneVacuousEdges(NGWrapper &g) { return; } + DEBUG_PRINTF("removing %zu vacuous edges\n", dead.size()); remove_edges(dead, g); pruneUseless(g); + clearReports(g); } static -void pruneUnmatchable(NGWrapper &g, const vector &depths, +void pruneUnmatchable(NGHolder &g, const vector &depths, const ReportManager &rm, NFAVertex accept) { vector dead; @@ -668,6 +818,11 @@ void pruneUnmatchable(NGWrapper &g, const vector &depths, continue; } + if (!hasSameBounds(g[v].reports, rm)) { + continue; + } + const auto &report = rm.getReport(*g[v].reports.begin()); + u32 idx = g[v].index; DepthMinMax d = depths[idx]; // copy pair adj = getMinMaxOffsetAdjust(rm, g, v); @@ -676,16 +831,16 @@ void pruneUnmatchable(NGWrapper &g, const vector &depths, d.min += adj.first; d.max += adj.second; - if (d.max.is_finite() && d.max < g.min_length) { + if (d.max.is_finite() && d.max < report.minLength) { DEBUG_PRINTF("prune, max match length %s < min_length=%llu\n", - d.max.str().c_str(), g.min_length); + d.max.str().c_str(), report.minLength); dead.push_back(e); continue; } - if (g.max_offset != MAX_OFFSET && d.min > g.max_offset) { + if (report.maxOffset != MAX_OFFSET && d.min > report.maxOffset) { DEBUG_PRINTF("prune, min match length %s > max_offset=%llu\n", - d.min.str().c_str(), g.max_offset); + d.min.str().c_str(), report.maxOffset); dead.push_back(e); continue; } @@ -694,11 +849,15 @@ void pruneUnmatchable(NGWrapper &g, const vector &depths, remove_edges(dead, g); } -/** Remove edges to accepts that can never produce a match long enough to - * satisfy our min_length and max_offset constraints. */ +/** + * Remove edges to accepts that can never produce a match long enough to + * satisfy our min_length and max_offset constraints. 
+ */ static -void pruneUnmatchable(NGWrapper &g, const ReportManager &rm) { - if (!g.min_length) { +void pruneUnmatchable(NGHolder &g, const ReportManager &rm) { + if (!any_of_in(all_reports(g), [&](ReportID id) { + return rm.getReport(id).minLength > 0; + })) { return; } @@ -708,33 +867,19 @@ void pruneUnmatchable(NGWrapper &g, const ReportManager &rm) { pruneUnmatchable(g, depths, rm, g.acceptEod); pruneUseless(g); -} - -static -bool isUnanchored(const NGHolder &g) { - for (auto v : adjacent_vertices_range(g.start, g)) { - if (!edge(g.startDs, v, g).second) { - DEBUG_PRINTF("fail, %zu is anchored vertex\n", g[v].index); - return false; - } - } - return true; + clearReports(g); } static bool hasOffsetAdjustments(const ReportManager &rm, const NGHolder &g) { - for (auto report : all_reports(g)) { - const Report &ir = rm.getReport(report); - if (ir.offsetAdjust) { - return true; - } - } - return false; + return any_of_in(all_reports(g), [&rm](ReportID id) { + return rm.getReport(id).offsetAdjust != 0; + }); } -void handleExtendedParams(ReportManager &rm, NGWrapper &g, - UNUSED const CompileContext &cc) { - if (!hasExtParams(g)) { +void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr, + ReportManager &rm) { + if (!hasExtParams(expr)) { return; } @@ -742,132 +887,158 @@ void handleExtendedParams(ReportManager &rm, NGWrapper &g, depth maxWidth = findMaxWidth(g); bool is_anchored = !has_proper_successor(g.startDs, g) && out_degree(g.start, g); - bool has_offset_adj = hasOffsetAdjustments(rm, g); - - DEBUG_PRINTF("minWidth=%s, maxWidth=%s, anchored=%d, offset_adj=%d\n", - minWidth.str().c_str(), maxWidth.str().c_str(), is_anchored, - has_offset_adj); DepthMinMax match_depths = findMatchLengths(rm, g); DEBUG_PRINTF("match depths %s\n", match_depths.str().c_str()); - if (is_anchored && maxWidth.is_finite() && g.min_offset > maxWidth) { + if (is_anchored && maxWidth.is_finite() && expr.min_offset > maxWidth) { ostringstream oss; oss << "Expression is anchored and cannot satisfy min_offset=" - << g.min_offset << " as it can only produce matches of length " + << expr.min_offset << " as it can only produce matches of length " << maxWidth << " bytes at most."; - throw CompileError(g.expressionIndex, oss.str()); + throw CompileError(expr.index, oss.str()); } - if (minWidth > g.max_offset) { + if (minWidth > expr.max_offset) { ostringstream oss; - oss << "Expression has max_offset=" << g.max_offset << " but requires " - << minWidth << " bytes to match."; - throw CompileError(g.expressionIndex, oss.str()); + oss << "Expression has max_offset=" << expr.max_offset + << " but requires " << minWidth << " bytes to match."; + throw CompileError(expr.index, oss.str()); } - if (maxWidth.is_finite() && match_depths.max < g.min_length) { + if (maxWidth.is_finite() && match_depths.max < expr.min_length) { ostringstream oss; - oss << "Expression has min_length=" << g.min_length << " but can " + oss << "Expression has min_length=" << expr.min_length << " but can " "only produce matches of length " << match_depths.max << " bytes at most."; - throw CompileError(g.expressionIndex, oss.str()); + throw CompileError(expr.index, oss.str()); } - if (g.min_length && g.min_length <= match_depths.min) { + if (expr.min_length && expr.min_length <= match_depths.min) { DEBUG_PRINTF("min_length=%llu constraint is unnecessary\n", - g.min_length); - g.min_length = 0; + expr.min_length); + expr.min_length = 0; } - if (!hasExtParams(g)) { + if (!hasExtParams(expr)) { return; } - pruneVacuousEdges(g); - pruneUnmatchable(g, rm); 
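The satisfiability checks in `propagateExtendedParams()` are also simple interval tests, shown here with hypothetical finite widths:

```cpp
#include <cassert>

// The two CompileError conditions above, as plain comparisons.
int main() {
    // Case 1: anchored pattern whose matches are at most 12 bytes long can
    // never end at or beyond min_offset=20: CompileError.
    const unsigned maxWidth = 12, min_offset = 20;
    assert(min_offset > maxWidth);

    // Case 2: pattern needs at least 8 bytes to match, but max_offset=6:
    // CompileError as well.
    const unsigned minWidth = 8, max_offset = 6;
    assert(minWidth > max_offset);
    return 0;
}
```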
+ updateReportBounds(rm, g, expr); +} - if (!has_offset_adj) { - pruneExtUnreachable(g); - } +/** + * If the pattern is completely anchored and has a min_length set, this can + * be converted to a min_offset. + */ +static +void replaceMinLengthWithOffset(NGHolder &g, ReportManager &rm) { + if (has_proper_successor(g.startDs, g)) { + return; // not wholly anchored + } + + replaceReports(g, [&rm](NFAVertex, ReportID id) { + const auto &report = rm.getReport(id); + if (report.minLength) { + Report new_report = report; + u64a min_len_offset = report.minLength - report.offsetAdjust; + new_report.minOffset = max(report.minOffset, min_len_offset); + new_report.minLength = 0; + return rm.getInternalId(new_report); + } + return id; + }); +} + +/** + * Clear offset bounds on reports that are not needed because they're satisfied + * by vertex depth. + */ +static +void removeUnneededOffsetBounds(NGHolder &g, ReportManager &rm) { + auto depths = calcDepths(g); + + replaceReports(g, [&](NFAVertex v, ReportID id) { + const auto &d = depths.at(g[v].index); + const depth &min_depth = min(d.fromStartDotStar.min, d.fromStart.min); + const depth &max_depth = maxDistFromStart(d); + + DEBUG_PRINTF("vertex %zu has min_depth=%s, max_depth=%s\n", g[v].index, + min_depth.str().c_str(), max_depth.str().c_str()); + + Report report = rm.getReport(id); // copy + bool modified = false; + if (report.minOffset && !report.offsetAdjust && + report.minOffset <= min_depth) { + report.minOffset = 0; + modified = true; + } + if (report.maxOffset != MAX_OFFSET && max_depth.is_finite() && + report.maxOffset >= max_depth) { + report.maxOffset = MAX_OFFSET; + modified = true; + } + if (modified) { + DEBUG_PRINTF("vertex %zu, changed bounds to [%llu,%llu]\n", + g[v].index, report.minOffset, report.maxOffset); + return rm.getInternalId(report); + } - // We may have removed all the edges to accept, in which case this - // expression cannot match. - if (in_degree(g.accept, g) == 0 && in_degree(g.acceptEod, g) == 1) { - throw CompileError(g.expressionIndex, "Extended parameter " - "constraints can not be satisfied for any match from " - "this expression."); + return id; + }); +} + +void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som) { + if (!any_of_in(all_reports(g), + [&](ReportID id) { return rm.getReport(id).hasBounds(); })) { + DEBUG_PRINTF("no extparam bounds\n"); + return; } - // Remove reports on vertices without an edge to accept (which have been - // pruned above). - clearReports(g); + DEBUG_PRINTF("graph has extparam bounds\n"); - // Recalc. - minWidth = findMinWidth(g); - maxWidth = findMaxWidth(g); - is_anchored = proper_out_degree(g.startDs, g) == 0 && - out_degree(g.start, g); - has_offset_adj = hasOffsetAdjustments(rm, g); + pruneVacuousEdges(g, rm); + if (can_never_match(g)) { + return; + } - // If the pattern is completely anchored and has a min_length set, this can - // be converted to a min_offset. 
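As an aside, a minimal sketch of how these satisfiability checks and the new `edit_distance` extended parameter surface through the public API. The pattern, offsets and scan data here are hypothetical, the header path assumes a typical installed layout, and the approximate-matching case assumes a build with that feature enabled; `/^foo/` can produce matches of at most 3 bytes, so `min_offset=10` trips the anchored-width check above.

    #include <stdio.h>
    #include <string.h>

    #include <hs/hs.h>

    static int on_match(unsigned id, unsigned long long from,
                        unsigned long long to, unsigned flags, void *ctx) {
        (void)from; (void)flags; (void)ctx;
        printf("id %u matched, ending at offset %llu\n", id, to);
        return 0; /* continue scanning */
    }

    int main(void) {
        hs_database_t *db = NULL;
        hs_compile_error_t *err = NULL;

        /* Unsatisfiable: /^foo/ is anchored and at most 3 bytes wide, so
         * min_offset=10 makes compilation fail with a descriptive error. */
        hs_expr_ext_t ext;
        memset(&ext, 0, sizeof(ext));
        ext.flags = HS_EXT_FLAG_MIN_OFFSET;
        ext.min_offset = 10;

        const char *expr = "^foo";
        unsigned flags = 0;
        unsigned id = 1;
        const hs_expr_ext_t *extp = &ext;
        if (hs_compile_ext_multi(&expr, &flags, &id, &extp, 1, HS_MODE_BLOCK,
                                 NULL, &db, &err) != HS_SUCCESS) {
            printf("expected compile failure: %s\n", err->message);
            hs_free_compile_error(err);
        }

        /* Satisfiable: approximate matching via the edit_distance extended
         * parameter; "foobar" at distance 1 also matches "fooXar". */
        memset(&ext, 0, sizeof(ext));
        ext.flags = HS_EXT_FLAG_EDIT_DISTANCE;
        ext.edit_distance = 1;
        expr = "foobar";
        if (hs_compile_ext_multi(&expr, &flags, &id, &extp, 1, HS_MODE_BLOCK,
                                 NULL, &db, &err) != HS_SUCCESS) {
            printf("compile failed: %s\n", err->message);
            hs_free_compile_error(err);
            return 1;
        }

        hs_scratch_t *scratch = NULL;
        if (hs_alloc_scratch(db, &scratch) == HS_SUCCESS) {
            const char data[] = "fooXar";
            hs_scan(db, data, (unsigned)strlen(data), 0, scratch, on_match,
                    NULL);
            hs_free_scratch(scratch);
        }
        hs_free_database(db);
        return 0;
    }
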
- if (g.min_length && (g.min_offset <= g.min_length) && is_anchored) { - DEBUG_PRINTF("converting min_length to min_offset=%llu for " - "anchored case\n", g.min_length); - g.min_offset = g.min_length; - g.min_length = 0; + pruneUnmatchable(g, rm); + if (can_never_match(g)) { + return; } - if (g.min_offset && g.min_offset <= minWidth && !has_offset_adj) { - DEBUG_PRINTF("min_offset=%llu constraint is unnecessary\n", - g.min_offset); - g.min_offset = 0; + if (!hasOffsetAdjustments(rm, g)) { + pruneExtUnreachable(g, rm); + if (can_never_match(g)) { + return; + } } - if (!hasExtParams(g)) { + replaceMinLengthWithOffset(g, rm); + if (can_never_match(g)) { return; } // If the pattern has a min_length and is of "ratchet" form with one // unbounded repeat, that repeat can become a bounded repeat. // e.g. /foo.*bar/{min_length=100} --> /foo.{94,}bar/ - if (g.min_length && transformMinLengthToRepeat(rm, g)) { - DEBUG_PRINTF("converted min_length to bounded repeat\n"); - // recalc - minWidth = findMinWidth(g); + transformMinLengthToRepeat(g, rm); + if (can_never_match(g)) { + return; } // If the pattern is unanchored, has a max_offset and has not asked for // SOM, we can use that knowledge to anchor it which will limit its // lifespan. Note that we can't use this transformation if there's a // min_length, as it's currently handled using "sly SOM". - - // Note that it is possible to handle graphs that have a combination of - // anchored and unanchored paths, but it's too tricky for the moment. - - if (g.max_offset != MAX_OFFSET && !g.som && !g.min_length && - !has_offset_adj && isUnanchored(g)) { - if (anchorPatternWithBoundedRepeat(g, minWidth, maxWidth)) { - DEBUG_PRINTF("minWidth=%s, maxWidth=%s\n", minWidth.str().c_str(), - maxWidth.str().c_str()); - if (minWidth == maxWidth) { - // For a fixed width pattern, we can retire the offsets as they - // are implicit in the graph now. - g.min_offset = 0; - g.max_offset = MAX_OFFSET; - } + if (som == SOM_NONE) { + anchorPatternWithBoundedRepeat(g, rm); + if (can_never_match(g)) { + return; } } - //dumpGraph("final.dot", g); - - if (!hasExtParams(g)) { - return; - } - set done; - updateReportBounds(rm, g, g.accept, done); - updateReportBounds(rm, g, g.acceptEod, done); + removeUnneededOffsetBounds(g, rm); } } // namespace ue2 diff --git a/src/nfagraph/ng_extparam.h b/src/nfagraph/ng_extparam.h index d5df1cf6d..ae818075c 100644 --- a/src/nfagraph/ng_extparam.h +++ b/src/nfagraph/ng_extparam.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,14 +34,30 @@ #ifndef NG_EXTPARAM_H #define NG_EXTPARAM_H +#include "som/som.h" + namespace ue2 { -struct CompileContext; -class NGWrapper; +class ExpressionInfo; +class NGHolder; class ReportManager; -void handleExtendedParams(ReportManager &rm, NGWrapper &g, - const CompileContext &cc); +/** + * \brief Propagate extended parameter information to vertex reports. Will + * throw CompileError if this expression's extended parameters are not + * satisfiable. + * + * This will also remove extended parameter constraints that are guaranteed to + * be satisfied from ExpressionInfo. + */ +void propagateExtendedParams(NGHolder &g, ExpressionInfo &expr, + ReportManager &rm); + +/** + * \brief Perform graph reductions (if possible) to do with extended parameter + * constraints on reports. 
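+ *
+ * For example, the min_length rewrite performed here turns
+ * /foo.*bar/{min_length=100} into /foo.{94,}bar/: the literals "foo" and
+ * "bar" already contribute 6 bytes of any match, so the repeat must supply
+ * the remaining 94 bytes (100 - 3 - 3 = 94).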
+ */ +void reduceExtendedParams(NGHolder &g, ReportManager &rm, som_type som); } // namespace ue2 diff --git a/src/nfagraph/ng_fuzzy.cpp b/src/nfagraph/ng_fuzzy.cpp new file mode 100644 index 000000000..2c3d85bd5 --- /dev/null +++ b/src/nfagraph/ng_fuzzy.cpp @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/** \file
+ * \brief Graph fuzzer for approximate matching
+ */
+
+#include "ng_fuzzy.h"
+
+#include "ng.h"
+#include "ng_depth.h"
+#include "ng_util.h"
+
+#include <map>
+#include <vector>
+using namespace std;
+
+namespace ue2 {
+
+// returns all successors up to a given depth in a vector of sets, indexed by
+// zero-based depth from source vertex
+static
+vector<flat_set<NFAVertex>> gatherSuccessorsByDepth(const NGHolder &g,
+                                                    NFAVertex src, u32 depth) {
+    vector<flat_set<NFAVertex>> result(depth);
+    flat_set<NFAVertex> cur, next;
+
+    assert(depth > 0);
+
+    // populate current set of successors
+    for (auto v : adjacent_vertices_range(src, g)) {
+        // ignore self-loops
+        if (src == v) {
+            continue;
+        }
+        DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
+        cur.insert(v);
+    }
+    result[0] = cur;
+
+    for (unsigned d = 1; d < depth; d++) {
+        // collect all successors for all current level vertices
+        for (auto v : cur) {
+            // don't go past special nodes
+            if (is_special(v, g)) {
+                continue;
+            }
+
+            for (auto succ : adjacent_vertices_range(v, g)) {
+                // ignore self-loops
+                if (v == succ) {
+                    continue;
+                }
+                DEBUG_PRINTF("Node %zu depth %u\n", g[succ].index, d + 1);
+                next.insert(succ);
+            }
+        }
+        result[d] = next;
+        next.swap(cur);
+        next.clear();
+    }
+
+    return result;
+}
+
+// returns all predecessors up to a given depth in a vector of sets, indexed by
+// zero-based depth from source vertex
+static
+vector<flat_set<NFAVertex>> gatherPredecessorsByDepth(const NGHolder &g,
+                                                      NFAVertex src,
+                                                      u32 depth) {
+    vector<flat_set<NFAVertex>> result(depth);
+    flat_set<NFAVertex> cur, next;
+
+    assert(depth > 0);
+
+    // populate current set of predecessors
+    for (auto v : inv_adjacent_vertices_range(src, g)) {
+        // ignore self-loops
+        if (src == v) {
+            continue;
+        }
+        DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
+        cur.insert(v);
+    }
+    result[0] = cur;
+
+    for (unsigned d = 1; d < depth; d++) {
+        // collect all predecessors of all current level vertices
+        for (auto v : cur) {
+            for (auto pred : inv_adjacent_vertices_range(v, g)) {
+                // ignore self-loops
+                if (v == pred) {
+                    continue;
+                }
+                DEBUG_PRINTF("Node %zu depth %u\n", g[pred].index, d + 1);
+                next.insert(pred);
+            }
+        }
+        result[d] = next;
+        next.swap(cur);
+        next.clear();
+    }
+
+    return result;
+}
+
+/*
+ * This struct produces a fuzzed graph; that is, a graph that is able to match
+ * the original pattern, as well as input data within a certain edit distance.
+ * Construct the struct, then call fuzz_graph() to transform the graph.
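+ *
+ * A minimal usage sketch (mirroring the call made in make_fuzzy() at the
+ * bottom of this file):
+ *
+ *     ShadowGraph sg(g, 1);  // g is the pattern's NGHolder; edit distance 1
+ *     sg.fuzz_graph();       // g now also matches inputs at distance <= 1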
+ *
+ * Terminology used:
+ * - Shadow vertices: vertices mirroring the original graph at various edit
+ *   distances
+ * - Shadow graph level: edit distance of a particular shadow graph
+ * - Helpers: dot vertices assigned to shadow vertices, used for insert/replace
+ */
+struct ShadowGraph {
+    NGHolder &g;
+    u32 edit_distance;
+    map<pair<NFAVertex, u32>, NFAVertex> shadow_map;
+    map<pair<NFAVertex, u32>, NFAVertex> helper_map;
+    map<NFAVertex, NFAVertex> clones;
+    // edge creation is deferred
+    vector<pair<NFAVertex, NFAVertex>> edges_to_be_added;
+    flat_set<NFAVertex> orig;
+
+    ShadowGraph(NGHolder &g_in, u32 ed_in) : g(g_in), edit_distance(ed_in) {}
+
+    void fuzz_graph() {
+        if (edit_distance == 0) {
+            return;
+        }
+
+        // step 1: prepare the vertices, helpers and shadows according to
+        // the original graph
+        prepare_graph();
+
+        // step 2: add shadow and helper nodes
+        build_shadow_graph();
+
+        // step 3: set up reports for newly created vertices (and make clones
+        // if necessary)
+        create_reports();
+
+        // step 4: wire up shadow graph and helpers for insert/replace/remove
+        connect_shadow_graph();
+
+        // step 5: commit all the edge wirings
+        DEBUG_PRINTF("Committing edge wirings\n");
+        for (const auto &p : edges_to_be_added) {
+            add_edge_if_not_present(p.first, p.second, g);
+        }
+
+        DEBUG_PRINTF("Done!\n");
+    }
+
+private:
+    const NFAVertex& get_clone(const NFAVertex &v) {
+        return contains(clones, v) ?
+               clones[v] : v;
+    }
+
+    void connect_to_clones(const NFAVertex &u, const NFAVertex &v) {
+        const NFAVertex &clone_u = get_clone(u);
+        const NFAVertex &clone_v = get_clone(v);
+
+        edges_to_be_added.emplace_back(u, v);
+        DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[u].index, g[v].index);
+
+        // do not connect clones to accepts, we do it during cloning
+        if (is_any_accept(clone_v, g)) {
+            return;
+        }
+        edges_to_be_added.emplace_back(clone_u, clone_v);
+        DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[clone_u].index,
+                     g[clone_v].index);
+    }
+
+    void prepare_graph() {
+        DEBUG_PRINTF("Building shadow graphs\n");
+
+        for (auto v : vertices_range(g)) {
+            // all level 0 vertices are their own helpers and their own shadows
+            helper_map[make_pair(v, 0)] = v;
+            shadow_map[make_pair(v, 0)] = v;
+
+            // find special nodes
+            if (is_any_accept(v, g)) {
+                DEBUG_PRINTF("Node %zu is a special node\n", g[v].index);
+                for (unsigned edit = 1; edit <= edit_distance; edit++) {
+                    // all accepts are their own shadows and helpers at all
+                    // levels
+                    shadow_map[make_pair(v, edit)] = v;
+                    helper_map[make_pair(v, edit)] = v;
+                }
+                continue;
+            }
+            DEBUG_PRINTF("Node %zu is to be shadowed\n", g[v].index);
+            orig.insert(v);
+        }
+    }
+
+    void build_shadow_graph() {
+        for (auto v : orig) {
+            DEBUG_PRINTF("Adding shadow/helper nodes for node %zu\n",
+                         g[v].index);
+            for (unsigned dist = 1; dist <= edit_distance; dist++) {
+                auto shadow_v = v;
+
+                // start and startDs cannot have shadows but do have helpers
+                if (!is_any_start(v, g)) {
+                    shadow_v = clone_vertex(g, v);
+                    DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n",
+                                 g[shadow_v].index, dist);
+                }
+                shadow_map[make_pair(v, dist)] = shadow_v;
+
+                // if there's nowhere to go from this vertex, no helper needed
+                if (proper_out_degree(v, g) < 1) {
+                    helper_map[make_pair(v, dist)] = shadow_v;
+                    continue;
+                }
+
+                auto helper_v = clone_vertex(g, v);
+                DEBUG_PRINTF("New helper node ID: %zu (level %u)\n",
+                             g[helper_v].index, dist);
+
+                // this is a helper, so make it a dot
+                g[helper_v].char_reach = CharReach::dot();
+                // do not copy virtual start's assert flags
+                if (is_virtual_start(v, g)) {
+                    g[helper_v].assert_flags = 0;
+                }
+                helper_map[make_pair(v, dist)] = helper_v;
+            }
+ } + } + + // wire up successors according to the original graph, wire helpers + // to shadow successors (insert/replace) + void connect_succs(NFAVertex v, u32 dist) { + DEBUG_PRINTF("Wiring up successors for node %zu shadow level %u\n", + g[v].index, dist); + const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; + const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; + + // multiple insert + if (dist > 1) { + const auto &prev_level_helper = helper_map[make_pair(v, dist - 1)]; + connect_to_clones(prev_level_helper, cur_shadow_helper); + } + + for (auto orig_dst : adjacent_vertices_range(v, g)) { + const auto &shadow_dst = shadow_map[make_pair(orig_dst, dist)]; + + connect_to_clones(cur_shadow_v, shadow_dst); + + // ignore startDs for insert/replace + if (orig_dst == g.startDs) { + continue; + } + + connect_to_clones(cur_shadow_helper, shadow_dst); + } + } + + // wire up predecessors according to the original graph, wire + // predecessors to helpers (replace), wire predecessor helpers to + // helpers (multiple replace) + void connect_preds(NFAVertex v, u32 dist) { + DEBUG_PRINTF("Wiring up predecessors for node %zu shadow level %u\n", + g[v].index, dist); + const auto &cur_shadow_v = shadow_map[make_pair(v, dist)]; + const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; + + auto orig_src_vertices = inv_adjacent_vertices_range(v, g); + for (auto orig_src : orig_src_vertices) { + // ignore edges from start to startDs + if (v == g.startDs && orig_src == g.start) { + continue; + } + // ignore self-loops for replace + if (orig_src != v) { + // do not wire a replace node for start vertices if we + // have a virtual start + if (is_virtual_start(v, g) && is_any_start(orig_src, g)) { + continue; + } + + if (dist) { + const auto &prev_level_src = + shadow_map[make_pair(orig_src, dist - 1)]; + const auto &prev_level_helper = + helper_map[make_pair(orig_src, dist - 1)]; + + connect_to_clones(prev_level_src, cur_shadow_helper); + connect_to_clones(prev_level_helper, cur_shadow_helper); + } + } + // wire predecessor according to original graph + const auto &shadow_src = shadow_map[make_pair(orig_src, dist)]; + + connect_to_clones(shadow_src, cur_shadow_v); + } + } + + // wire up previous level helper to current shadow (insert) + void connect_helpers(NFAVertex v, u32 dist) { + DEBUG_PRINTF("Wiring up helpers for node %zu shadow level %u\n", + g[v].index, dist); + const auto &cur_shadow_helper = helper_map[make_pair(v, dist)]; + auto prev_level_v = shadow_map[make_pair(v, dist - 1)]; + + connect_to_clones(prev_level_v, cur_shadow_helper); + } + + /* + * wiring edges for removal is a special case. + * + * when wiring edges for removal, as well as wiring up immediate + * predecessors to immediate successors, we also need to wire up more + * distant successors to their respective shadow graph levels. + * + * for example, consider graph start->a->b->c->d->accept. + * + * at edit distance 1, we need remove edges start->b, a->c, b->d, and + * c->accept, all going from original graph (level 0) to shadow graph + * level 1. + * + * at edit distance 2, we also need edges start->c, a->d and b->accept, + * all going from level 0 to shadow graph level 2. + * + * this is propagated to all shadow levels; that is, given edit + * distance 3, we will have edges from shadow levels 0->1, 0->2, + * 0->3, 1->2, 1->3, and 2->3. + * + * therefore, we wire them in steps: first wire with step 1 (0->1, 1->2, + * 2->3) at depth 1, then wire with step 2 (0->2, 1->3) at depth 2, etc. 
+     *
+     * we also have to wire helpers to their removal successors, to
+     * accommodate a replace followed by a remove, on all shadow levels.
+     *
+     * and finally, we also have to wire source shadows into removal
+     * successor helpers on a level above, to accommodate a remove
+     * followed by a replace.
+     */
+    void connect_removals(NFAVertex v) {
+        DEBUG_PRINTF("Wiring up remove edges for node %zu\n", g[v].index);
+
+        // vertices returned by this function don't include self-loops
+        auto dst_vertices_by_depth =
+            gatherSuccessorsByDepth(g, v, edit_distance);
+        auto orig_src_vertices = inv_adjacent_vertices_range(v, g);
+        for (auto orig_src : orig_src_vertices) {
+            // ignore self-loops
+            if (orig_src == v) {
+                continue;
+            }
+            for (unsigned step = 1; step <= edit_distance; step++) {
+                for (unsigned dist = step; dist <= edit_distance; dist++) {
+                    auto &dst_vertices = dst_vertices_by_depth[step - 1];
+                    for (auto &orig_dst : dst_vertices) {
+                        const auto &shadow_src =
+                            shadow_map[make_pair(orig_src, dist - step)];
+                        const auto &shadow_helper =
+                            helper_map[make_pair(orig_src, dist - step)];
+                        const auto &shadow_dst =
+                            shadow_map[make_pair(orig_dst, dist)];
+
+                        // removal
+                        connect_to_clones(shadow_src, shadow_dst);
+
+                        // removal from helper vertex
+                        connect_to_clones(shadow_helper, shadow_dst);
+
+                        // removal into helper, requires additional edit
+                        if ((dist + 1) <= edit_distance) {
+                            const auto &next_level_helper =
+                                helper_map[make_pair(orig_dst, dist + 1)];
+
+                            connect_to_clones(shadow_src, next_level_helper);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    void connect_shadow_graph() {
+        DEBUG_PRINTF("Wiring up the graph\n");
+
+        for (auto v : orig) {
+
+            DEBUG_PRINTF("Wiring up edges for node %zu\n", g[v].index);
+
+            for (unsigned dist = 0; dist <= edit_distance; dist++) {
+
+                // handle insert/replace
+                connect_succs(v, dist);
+
+                // handle replace/multiple insert
+                connect_preds(v, dist);
+
+                // handle helpers
+                if (dist > 0) {
+                    connect_helpers(v, dist);
+                }
+            }
+
+            // handle removals
+            connect_removals(v);
+        }
+    }
+
+    void connect_to_targets(NFAVertex src, const flat_set<NFAVertex> &targets) {
+        for (auto dst : targets) {
+            DEBUG_PRINTF("Adding edge: %zu -> %zu\n", g[src].index,
+                         g[dst].index);
+            edges_to_be_added.emplace_back(src, dst);
+        }
+    }
+
+    // create a clone of the vertex, but overwrite its report set
+    void create_clone(NFAVertex v, const flat_set<ReportID> &reports,
+                      unsigned max_edit_distance,
+                      const flat_set<NFAVertex> &targets) {
+        // some vertices may have the same reports, but different successors;
+        // therefore, we may need to connect them multiple times, but still only
+        // clone once
+        bool needs_cloning = !contains(clones, v);
+
+        DEBUG_PRINTF("Cloning node %zu\n", g[v].index);
+        // go through all shadows and helpers, including
+        // original vertex
+        for (unsigned d = 0; d < max_edit_distance; d++) {
+            auto shadow_v = shadow_map[make_pair(v, d)];
+            auto helper_v = helper_map[make_pair(v, d)];
+
+            NFAVertex new_shadow_v, new_helper_v;
+
+            // make sure we don't clone the same vertex twice
+            if (needs_cloning) {
+                new_shadow_v = clone_vertex(g, shadow_v);
+                DEBUG_PRINTF("New shadow node ID: %zu (level %u)\n",
+                             g[new_shadow_v].index, d);
+                clones[shadow_v] = new_shadow_v;
+            } else {
+                new_shadow_v = clones[shadow_v];
+            }
+            g[new_shadow_v].reports = reports;
+
+            connect_to_targets(new_shadow_v, targets);
+
+            if (shadow_v == helper_v) {
+                continue;
+            }
+            if (needs_cloning) {
+                new_helper_v = clone_vertex(g, helper_v);
+                DEBUG_PRINTF("New helper node ID: %zu (level %u)\n",
+                             g[new_helper_v].index, d);
+                clones[helper_v] = new_helper_v;
+            } else {
+                new_helper_v = clones[helper_v];
+            }
+            g[new_helper_v].reports = reports;
+
+            connect_to_targets(new_helper_v, targets);
+        }
+    }
+
+    void write_reports(NFAVertex v, const flat_set<ReportID> &reports,
+                       unsigned max_edit_distance,
+                       const flat_set<NFAVertex> &targets) {
+        // we're overwriting reports, but we're not losing any
+        // information as we already cached all the different report
+        // sets, so vertices having different reports will be cloned and set up
+        // with the correct report set
+
+        // go through all shadows and helpers, including original
+        // vertex
+        for (unsigned d = 0; d < max_edit_distance; d++) {
+            auto shadow_v = shadow_map[make_pair(v, d)];
+            auto helper_v = helper_map[make_pair(v, d)];
+            DEBUG_PRINTF("Setting up reports for shadow node: %zu "
+                         "(level %u)\n",
+                         g[shadow_v].index, d);
+            DEBUG_PRINTF("Setting up reports for helper node: %zu "
+                         "(level %u)\n",
+                         g[helper_v].index, d);
+            g[shadow_v].reports = reports;
+            g[helper_v].reports = reports;
+
+            connect_to_targets(shadow_v, targets);
+            connect_to_targets(helper_v, targets);
+        }
+    }
+
+    /*
+     * we may have multiple report sets per graph. that means, whenever we
+     * construct additional paths through the graph (alternations, removals), we
+     * have to account for the fact that some vertices are predecessors to
+     * vertices with different report sets.
+     *
+     * whenever that happens, we have to clone the paths for both report sets,
+     * and set up these new vertices with their respective report sets as well.
+     *
+     * in order to do that, we first have to get all the predecessors for accept
+     * and acceptEod vertices. then, go through them one by one, and take note
+     * of the report lists. the first report set we find wins; the rest we
+     * clone.
+     *
+     * we also have to do this in two passes, because there may be vertices that
+     * are predecessors to vertices with different report sets, so to avoid
+     * overwriting reports we will be caching reports info instead.
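+     *
+     * (at most two distinct report sets are expected here: one on the
+     * predecessors of accept and one on the predecessors of acceptEod,
+     * which is what the assertion in create_reports() checks.)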
+     */
+    void create_reports() {
+        map<flat_set<ReportID>, flat_set<NFAVertex>> reports_to_vertices;
+        flat_set<NFAVertex> accepts{g.accept, g.acceptEod};
+
+        // gather reports info from all vertices connected to accept
+        for (auto accept : accepts) {
+            for (auto src : inv_adjacent_vertices_range(accept, g)) {
+                // skip special vertices
+                if (is_special(src, g)) {
+                    continue;
+                }
+                reports_to_vertices[g[src].reports].insert(src);
+            }
+        }
+
+        // we expect to see at most two report sets
+        assert(reports_to_vertices.size() > 0 &&
+               reports_to_vertices.size() <= 2);
+
+        // set up all reports
+        bool clone = false;
+        for (auto &pair : reports_to_vertices) {
+            const auto &reports = pair.first;
+            const auto &vertices = pair.second;
+
+            for (auto src : vertices) {
+                // get all predecessors up to edit distance
+                auto src_vertices_by_depth =
+                    gatherPredecessorsByDepth(g, src, edit_distance);
+
+                // find which accepts source vertex connects to
+                flat_set<NFAVertex> targets;
+                for (const auto &accept : accepts) {
+                    NFAEdge e = edge(src, accept, g);
+                    if (e) {
+                        targets.insert(accept);
+                    }
+                }
+                assert(targets.size());
+
+                for (unsigned d = 0; d < src_vertices_by_depth.size(); d++) {
+                    const auto &preds = src_vertices_by_depth[d];
+                    for (auto v : preds) {
+                        // only clone a node if it already contains reports
+                        if (clone && !g[v].reports.empty()) {
+                            create_clone(v, reports, edit_distance - d,
+                                         targets);
+                        } else {
+                            write_reports(v, reports, edit_distance - d,
+                                          targets);
+                        }
+                    }
+                }
+            }
+            // clone vertices only if it's not our first report set
+            clone = true;
+        }
+    }
+};
+
+// check if we will edit our way into a vacuous pattern
+static
+bool will_turn_vacuous(const NGHolder &g, u32 edit_distance) {
+    auto depths = calcRevDepths(g);
+
+    depth min_depth = depth::infinity();
+    auto idx = g[g.start].index;
+
+    // check distance from start to accept/acceptEod
+    if (depths[idx].toAccept.min.is_finite()) {
+        min_depth = min(depths[idx].toAccept.min, min_depth);
+    }
+    if (depths[idx].toAcceptEod.min.is_finite()) {
+        min_depth = min(depths[idx].toAcceptEod.min, min_depth);
+    }
+
+    idx = g[g.startDs].index;
+
+    // check distance from startDs to accept/acceptEod
+    if (depths[idx].toAccept.min.is_finite()) {
+        min_depth = min(depths[idx].toAccept.min, min_depth);
+    }
+    if (depths[idx].toAcceptEod.min.is_finite()) {
+        min_depth = min(depths[idx].toAcceptEod.min, min_depth);
+    }
+
+    assert(min_depth.is_finite());
+
+    // now, check if we can edit our way into a vacuous pattern
+    if (min_depth <= (u64a) edit_distance + 1) {
+        DEBUG_PRINTF("Pattern will turn vacuous if approximately matched\n");
+        return true;
+    }
+    return false;
+}
+
+void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8,
+                            const Grey &grey) {
+    if (edit_distance == 0) {
+        return;
+    }
+    if (!grey.allowApproximateMatching) {
+        throw CompileError("Approximate matching is disabled.");
+    }
+    if (edit_distance > grey.maxEditDistance) {
+        throw CompileError("Edit distance is too big.");
+    }
+    if (utf8) {
+        throw CompileError("UTF-8 is disallowed for approximate matching.");
+    }
+    // graph isn't fuzzable if there are edge assertions anywhere in the graph
+    for (auto e : edges_range(g)) {
+        if (g[e].assert_flags) {
+            throw CompileError("Zero-width assertions are disallowed for "
+                               "approximate matching.");
+        }
+    }
+    if (will_turn_vacuous(g, edit_distance)) {
+        throw CompileError("Approximate matching patterns that reduce to "
+                           "vacuous patterns are disallowed.");
+    }
+}
+
+void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey) {
+    if (edit_distance == 0) {
+        return;
+    }
+
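+    // Callers are expected to have rejected unsafe inputs via
+    // validate_fuzzy_compile() above; e.g. /ab/ with edit_distance=2 is
+    // refused there, as two deletions would let it match the empty string.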
assert(grey.allowApproximateMatching); + assert(grey.maxEditDistance >= edit_distance); + + ShadowGraph sg(g, edit_distance); + sg.fuzz_graph(); + + // For safety, enforce limit on actual vertex count. + if (num_vertices(g) > grey.limitApproxMatchingVertices) { + DEBUG_PRINTF("built %zu vertices > limit of %u\n", num_vertices(g), + grey.limitApproxMatchingVertices); + throw ResourceLimitError(); + } +} + +} // namespace ue2 diff --git a/src/nfa/multiaccel_compilehelper.h b/src/nfagraph/ng_fuzzy.h similarity index 58% rename from src/nfa/multiaccel_compilehelper.h rename to src/nfagraph/ng_fuzzy.h index 27dbe634a..a2c821273 100644 --- a/src/nfa/multiaccel_compilehelper.h +++ b/src/nfagraph/ng_fuzzy.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,50 +26,24 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef MULTIACCELCOMPILE_H_ -#define MULTIACCELCOMPILE_H_ - -#include "ue2common.h" +/** \file + * \brief Graph fuzzer for approximate matching + */ -#include "nfagraph/ng_limex_accel.h" +#ifndef NG_FUZZY_H +#define NG_FUZZY_H -#include +#include "ue2common.h" namespace ue2 { +struct Grey; +class NGHolder; +class ReportManager; -/* accel scheme state machine */ -enum accel_scheme_state { - STATE_FIRST_RUN, - STATE_SECOND_RUN, - STATE_WAITING_FOR_GRAB, - STATE_FIRST_TAIL, - STATE_SECOND_TAIL, - STATE_STOPPED, - STATE_INVALID -}; - -struct accel_data { - MultibyteAccelInfo::multiaccel_type type = MultibyteAccelInfo::MAT_NONE; - accel_scheme_state state = STATE_INVALID; - unsigned len1 = 0; /* length of first run */ - unsigned len2 = 0; /* length of second run, if present */ - unsigned tlen1 = 0; /* first tail length */ - unsigned tlen2 = 0; /* second tail length */ -}; - -class MultiaccelCompileHelper { -private: - const CharReach &cr; - u32 offset; - std::vector accels; - unsigned max_len; -public: - MultiaccelCompileHelper(const CharReach &cr, u32 off, unsigned max_len); - bool canAdvance(); - MultibyteAccelInfo getBestScheme(); - void advance(const ue2::CharReach &cr); -}; +void validate_fuzzy_compile(const NGHolder &g, u32 edit_distance, bool utf8, + const Grey &grey); -}; // namespace +void make_fuzzy(NGHolder &g, u32 edit_distance, const Grey &grey); +} -#endif /* MULTIACCELCOMPILE_H_ */ +#endif // NG_FUZZY_H diff --git a/src/nfagraph/ng_is_equal.h b/src/nfagraph/ng_is_equal.h index 8eba2af59..52b29882f 100644 --- a/src/nfagraph/ng_is_equal.h +++ b/src/nfagraph/ng_is_equal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,7 +39,6 @@ #include "ue2common.h" #include -#include namespace ue2 { diff --git a/src/nfagraph/ng_lbr.cpp b/src/nfagraph/ng_lbr.cpp index d832bdaac..9bf16efea 100644 --- a/src/nfagraph/ng_lbr.cpp +++ b/src/nfagraph/ng_lbr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Large Bounded Repeat (LBR) engine build code. 
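+ *
+ * (An LBR engine implements a pattern that reduces to a single bounded
+ * repeat of a character class, e.g. /x{100,200}/.)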
*/ @@ -128,25 +129,24 @@ void fillNfa(NFA *nfa, lbr_common *c, ReportID report, const depth &repeatMin, } template static -aligned_unique_ptr makeLbrNfa(NFAEngineType nfa_type, - enum RepeatType rtype, - const depth &repeatMax) { +bytecode_ptr makeLbrNfa(NFAEngineType nfa_type, enum RepeatType rtype, + const depth &repeatMax) { size_t tableLen = 0; if (rtype == REPEAT_SPARSE_OPTIMAL_P) { tableLen = sizeof(u64a) * (repeatMax + 1); } size_t len = sizeof(NFA) + sizeof(LbrStruct) + sizeof(RepeatInfo) + tableLen + sizeof(u64a); - aligned_unique_ptr nfa = aligned_zmalloc_unique(len); + auto nfa = make_zeroed_bytecode_ptr(len); nfa->type = verify_u8(nfa_type); nfa->length = verify_u32(len); return nfa; } static -aligned_unique_ptr buildLbrDot(const CharReach &cr, const depth &repeatMin, - const depth &repeatMax, u32 minPeriod, - bool is_reset, ReportID report) { +bytecode_ptr buildLbrDot(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { if (!cr.all()) { return nullptr; } @@ -164,10 +164,9 @@ aligned_unique_ptr buildLbrDot(const CharReach &cr, const depth &repeatMin, } static -aligned_unique_ptr buildLbrVerm(const CharReach &cr, - const depth &repeatMin, - const depth &repeatMax, u32 minPeriod, - bool is_reset, ReportID report) { +bytecode_ptr buildLbrVerm(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { const CharReach escapes(~cr); if (escapes.count() != 1) { @@ -188,10 +187,9 @@ aligned_unique_ptr buildLbrVerm(const CharReach &cr, } static -aligned_unique_ptr buildLbrNVerm(const CharReach &cr, - const depth &repeatMin, - const depth &repeatMax, u32 minPeriod, - bool is_reset, ReportID report) { +bytecode_ptr buildLbrNVerm(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { const CharReach escapes(cr); if (escapes.count() != 1) { @@ -212,10 +210,9 @@ aligned_unique_ptr buildLbrNVerm(const CharReach &cr, } static -aligned_unique_ptr buildLbrShuf(const CharReach &cr, - const depth &repeatMin, - const depth &repeatMax, u32 minPeriod, - bool is_reset, ReportID report) { +bytecode_ptr buildLbrShuf(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, is_reset); auto nfa = makeLbrNfa(LBR_NFA_SHUF, rtype, repeatMax); @@ -233,10 +230,9 @@ aligned_unique_ptr buildLbrShuf(const CharReach &cr, } static -aligned_unique_ptr buildLbrTruf(const CharReach &cr, - const depth &repeatMin, - const depth &repeatMax, u32 minPeriod, - bool is_reset, ReportID report) { +bytecode_ptr buildLbrTruf(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { enum RepeatType rtype = chooseRepeatType(repeatMin, repeatMax, minPeriod, is_reset); auto nfa = makeLbrNfa(LBR_NFA_TRUF, rtype, repeatMax); @@ -252,10 +248,9 @@ aligned_unique_ptr buildLbrTruf(const CharReach &cr, } static -aligned_unique_ptr constructLBR(const CharReach &cr, - const depth &repeatMin, - const depth &repeatMax, u32 minPeriod, - bool is_reset, ReportID report) { +bytecode_ptr constructLBR(const CharReach &cr, const depth &repeatMin, + const depth &repeatMax, u32 minPeriod, + bool is_reset, ReportID report) { DEBUG_PRINTF("bounds={%s,%s}, cr=%s (count %zu), report=%u\n", repeatMin.str().c_str(), repeatMax.str().c_str(), describeClass(cr, 20, 
CC_OUT_TEXT).c_str(), cr.count(), @@ -263,8 +258,8 @@ aligned_unique_ptr constructLBR(const CharReach &cr, assert(repeatMin <= repeatMax); assert(repeatMax.is_reachable()); - aligned_unique_ptr nfa - = buildLbrDot(cr, repeatMin, repeatMax, minPeriod, is_reset, report); + auto nfa = + buildLbrDot(cr, repeatMin, repeatMax, minPeriod, is_reset, report); if (!nfa) { nfa = buildLbrVerm(cr, repeatMin, repeatMax, minPeriod, is_reset, @@ -291,10 +286,10 @@ aligned_unique_ptr constructLBR(const CharReach &cr, return nfa; } -aligned_unique_ptr constructLBR(const CastleProto &proto, - const vector> &triggers, - const CompileContext &cc, - const ReportManager &rm) { +bytecode_ptr constructLBR(const CastleProto &proto, + const vector> &triggers, + const CompileContext &cc, + const ReportManager &rm) { if (!cc.grey.allowLbr) { return nullptr; } @@ -330,10 +325,10 @@ aligned_unique_ptr constructLBR(const CastleProto &proto, } /** \brief Construct an LBR engine from the given graph \p g. */ -aligned_unique_ptr constructLBR(const NGHolder &g, - const vector> &triggers, - const CompileContext &cc, - const ReportManager &rm) { +bytecode_ptr constructLBR(const NGHolder &g, + const vector> &triggers, + const CompileContext &cc, + const ReportManager &rm) { if (!cc.grey.allowLbr) { return nullptr; } diff --git a/src/nfagraph/ng_lbr.h b/src/nfagraph/ng_lbr.h index 99cb0fcb0..1eec96535 100644 --- a/src/nfagraph/ng_lbr.h +++ b/src/nfagraph/ng_lbr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Large Bounded Repeat (LBR) engine build code. */ @@ -34,7 +35,7 @@ #define NG_LBR_H #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include #include @@ -51,14 +52,16 @@ struct CompileContext; struct Grey; /** \brief Construct an LBR engine from the given graph \p g. */ -aligned_unique_ptr +bytecode_ptr constructLBR(const NGHolder &g, const std::vector> &triggers, const CompileContext &cc, const ReportManager &rm); -/** \brief Construct an LBR engine from the given CastleProto, which should - * contain only one repeat. */ -aligned_unique_ptr +/** + * \brief Construct an LBR engine from the given CastleProto, which should + * contain only one repeat. + */ +bytecode_ptr constructLBR(const CastleProto &proto, const std::vector> &triggers, const CompileContext &cc, const ReportManager &rm); diff --git a/src/nfagraph/ng_limex.cpp b/src/nfagraph/ng_limex.cpp index e92790b98..283bba22c 100644 --- a/src/nfagraph/ng_limex.cpp +++ b/src/nfagraph/ng_limex.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,9 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Limex NFA construction code. 
*/ + #include "ng_limex.h" #include "grey.h" @@ -354,7 +356,7 @@ void attemptToUseAsStart(const NGHolder &g, NFAVertex u, auto ni_inserter = inserter(new_inter, new_inter.end()); set_intersection(top_inter.begin(), top_inter.end(), v_tops.begin(), v_tops.end(), ni_inserter); - top_inter = move(new_inter); + top_inter = std::move(new_inter); succs.insert(v); } @@ -623,7 +625,7 @@ void remapReportsToPrograms(NGHolder &h, const ReportManager &rm) { } static -aligned_unique_ptr +bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, @@ -682,7 +684,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, zombies, do_accel, compress_state, hint, cc); } -aligned_unique_ptr +bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, @@ -696,7 +698,7 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, #ifndef RELEASE_BUILD // Variant that allows a hint to be specified. -aligned_unique_ptr +bytecode_ptr constructNFA(const NGHolder &h_in, const ReportManager *rm, const map &fixed_depth_tops, const map>> &triggers, @@ -709,8 +711,8 @@ constructNFA(const NGHolder &h_in, const ReportManager *rm, #endif // RELEASE_BUILD static -aligned_unique_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, - const CompileContext &cc) { +bytecode_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, + const CompileContext &cc) { // Make a mutable copy of the graph that we can renumber etc. NGHolder h; cloneHolder(h, h_in); @@ -739,16 +741,16 @@ aligned_unique_ptr constructReversedNFA_i(const NGHolder &h_in, u32 hint, zombies, false, false, hint, cc); } -aligned_unique_ptr constructReversedNFA(const NGHolder &h_in, - const CompileContext &cc) { +bytecode_ptr constructReversedNFA(const NGHolder &h_in, + const CompileContext &cc) { u32 hint = INVALID_NFA; // no hint return constructReversedNFA_i(h_in, hint, cc); } #ifndef RELEASE_BUILD // Variant that allows a hint to be specified. -aligned_unique_ptr constructReversedNFA(const NGHolder &h_in, u32 hint, - const CompileContext &cc) { +bytecode_ptr constructReversedNFA(const NGHolder &h_in, u32 hint, + const CompileContext &cc) { return constructReversedNFA_i(h_in, hint, cc); } #endif // RELEASE_BUILD diff --git a/src/nfagraph/ng_limex.h b/src/nfagraph/ng_limex.h index 1e36e03dc..9bf46d693 100644 --- a/src/nfagraph/ng_limex.h +++ b/src/nfagraph/ng_limex.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Limex NFA construction code. */ @@ -35,7 +36,7 @@ #include "ue2common.h" #include "som/som.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include #include @@ -51,7 +52,8 @@ class NGHolder; class ReportManager; struct CompileContext; -/** \brief Determine if the given graph is implementable as an NFA. +/** + * \brief Determine if the given graph is implementable as an NFA. * * Returns zero if the NFA is not implementable (usually because it has too * many states for any of our models). Otherwise returns the number of states. @@ -62,11 +64,14 @@ struct CompileContext; u32 isImplementableNFA(const NGHolder &g, const ReportManager *rm, const CompileContext &cc); -/** \brief Late-stage graph reductions. 
+/** + * \brief Late-stage graph reductions. * * This will call \ref removeRedundancy and apply its changes to the given - * holder only if it is implementable afterwards. */ -void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm, + * holder only if it is implementable afterwards. + */ +void reduceImplementableGraph(NGHolder &g, som_type som, + const ReportManager *rm, const CompileContext &cc); /** @@ -79,7 +84,8 @@ void reduceImplementableGraph(NGHolder &g, som_type som, const ReportManager *rm u32 countAccelStates(const NGHolder &g, const ReportManager *rm, const CompileContext &cc); -/** \brief Construct an NFA from the given NFAGraph. +/** + * \brief Construct an NFA from the given graph. * * Returns zero if the NFA is not implementable (usually because it has too * many states for any of our models). Otherwise returns the number of states. @@ -90,23 +96,25 @@ u32 countAccelStates(const NGHolder &g, const ReportManager *rm, * Note: this variant of the function allows a model to be specified with the * \a hint parameter. */ -aligned_unique_ptr +bytecode_ptr constructNFA(const NGHolder &g, const ReportManager *rm, const std::map &fixed_depth_tops, const std::map>> &triggers, bool compress_state, const CompileContext &cc); -/** \brief Build a reverse NFA from the graph given, which should have already +/** + * \brief Build a reverse NFA from the graph given, which should have already * been reversed. * * Used for reverse NFAs used in SOM mode. */ -aligned_unique_ptr constructReversedNFA(const NGHolder &h, - const CompileContext &cc); +bytecode_ptr constructReversedNFA(const NGHolder &h, + const CompileContext &cc); #ifndef RELEASE_BUILD -/** \brief Construct an NFA (with model type hint) from the given NFAGraph. +/** + * \brief Construct an NFA (with model type hint) from the given graph. * * Returns zero if the NFA is not implementable (usually because it has too * many states for any of our models). Otherwise returns the number of states. @@ -117,19 +125,20 @@ aligned_unique_ptr constructReversedNFA(const NGHolder &h, * Note: this variant of the function allows a model to be specified with the * \a hint parameter. */ -aligned_unique_ptr +bytecode_ptr constructNFA(const NGHolder &g, const ReportManager *rm, const std::map &fixed_depth_tops, const std::map>> &triggers, bool compress_state, u32 hint, const CompileContext &cc); -/** \brief Build a reverse NFA (with model type hint) from the graph given, +/** + * \brief Build a reverse NFA (with model type hint) from the graph given, * which should have already been reversed. * * Used for reverse NFAs used in SOM mode. 
*/ -aligned_unique_ptr constructReversedNFA(const NGHolder &h, u32 hint, - const CompileContext &cc); +bytecode_ptr constructReversedNFA(const NGHolder &h, u32 hint, + const CompileContext &cc); #endif // RELEASE_BUILD diff --git a/src/nfagraph/ng_limex_accel.cpp b/src/nfagraph/ng_limex_accel.cpp index bfba7c71b..80e08a7f9 100644 --- a/src/nfagraph/ng_limex_accel.cpp +++ b/src/nfagraph/ng_limex_accel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,7 +37,6 @@ #include "ue2common.h" #include "nfa/accel.h" -#include "nfa/multiaccel_compilehelper.h" #include "util/bitutils.h" // for CASE_CLEAR #include "util/charreach.h" @@ -45,12 +44,16 @@ #include "util/container.h" #include "util/dump_charclass.h" #include "util/graph_range.h" +#include "util/small_vector.h" #include "util/target_info.h" #include #include +#include + using namespace std; +using boost::adaptors::map_keys; namespace ue2 { @@ -135,15 +138,15 @@ void findAccelFriends(const NGHolder &g, NFAVertex v, static void findPaths(const NGHolder &g, NFAVertex v, const vector &refined_cr, - vector > *paths, + vector> *paths, const flat_set &forbidden, u32 depth) { static const u32 MAGIC_TOO_WIDE_NUMBER = 16; if (!depth) { - paths->push_back(vector()); + paths->push_back({}); return; } if (v == g.accept || v == g.acceptEod) { - paths->push_back(vector()); + paths->push_back({}); if (!generates_callbacks(g) || v == g.acceptEod) { paths->back().push_back(CharReach()); /* red tape options */ } @@ -157,42 +160,37 @@ void findPaths(const NGHolder &g, NFAVertex v, if (out_degree(v, g) >= MAGIC_TOO_WIDE_NUMBER || hasSelfLoop(v, g)) { /* give up on pushing past this point */ - paths->push_back(vector()); - vector &p = paths->back(); - p.push_back(cr); + paths->push_back({cr}); return; } + vector> curr; for (auto w : adjacent_vertices_range(v, g)) { if (contains(forbidden, w)) { /* path has looped back to one of the active+boring acceleration * states. We can ignore this path if we have sufficient back- * off. */ - paths->push_back(vector()); - paths->back().push_back(CharReach()); + paths->push_back({CharReach()}); continue; } u32 new_depth = depth - 1; - vector > curr; do { curr.clear(); findPaths(g, w, refined_cr, &curr, forbidden, new_depth); } while (new_depth-- && curr.size() >= MAGIC_TOO_WIDE_NUMBER); - for (vector >::iterator it = curr.begin(); - it != curr.end(); ++it) { - paths->push_back(vector()); - vector &p = paths->back(); - p.swap(*it); - p.push_back(cr); + for (auto &c : curr) { + c.push_back(cr); + paths->push_back(std::move(c)); } } } +namespace { struct SAccelScheme { - SAccelScheme(const CharReach &cr_in, u32 offset_in) - : cr(cr_in), offset(offset_in) { + SAccelScheme(CharReach cr_in, u32 offset_in) + : cr(std::move(cr_in)), offset(offset_in) { assert(offset <= MAX_ACCEL_DEPTH); } @@ -215,30 +213,43 @@ struct SAccelScheme { CharReach cr = CharReach::dot(); u32 offset = MAX_ACCEL_DEPTH + 1; }; +} + +/** + * \brief Limit on the number of (recursive) calls to findBestInternal(). 
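+ *
+ * The scheme search explores combinations of character classes along each
+ * path and is combinatorial in the worst case, so this cap bounds compile
+ * time on pathological path sets; hitting it only means a possibly less
+ * optimal acceleration scheme is kept, not a correctness problem.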
+ */ +static constexpr size_t MAX_FINDBEST_CALLS = 1000000; static -void findBest(vector >::const_iterator pb, - vector >::const_iterator pe, - const SAccelScheme &curr, SAccelScheme *best) { +void findBestInternal(vector>::const_iterator pb, + vector>::const_iterator pe, + size_t *num_calls, const SAccelScheme &curr, + SAccelScheme *best) { assert(curr.offset <= MAX_ACCEL_DEPTH); + + if (++(*num_calls) > MAX_FINDBEST_CALLS) { + DEBUG_PRINTF("hit num_calls limit %zu\n", *num_calls); + return; + } + DEBUG_PRINTF("paths left %zu\n", pe - pb); if (pb == pe) { if (curr < *best) { - DEBUG_PRINTF("new best\n"); *best = curr; + DEBUG_PRINTF("new best: count=%zu, class=%s, offset=%u\n", + best->cr.count(), describeClass(best->cr).c_str(), + best->offset); } - *best = curr; return; } DEBUG_PRINTF("p len %zu\n", pb->end() - pb->begin()); - vector priority_path; + small_vector priority_path; priority_path.reserve(pb->size()); u32 i = 0; - for (vector::const_iterator p = pb->begin(); p != pb->end(); - ++p, i++) { - SAccelScheme as(*p | curr.cr, MAX(i, curr.offset)); + for (auto p = pb->begin(); p != pb->end(); ++p, i++) { + SAccelScheme as(*p | curr.cr, max(i, curr.offset)); if (*best < as) { DEBUG_PRINTF("worse\n"); continue; @@ -259,18 +270,13 @@ void findBest(vector >::const_iterator pb, } DEBUG_PRINTF("---\n"); - for (vector::const_iterator it = priority_path.begin(); - it != priority_path.end(); ++it) { - DEBUG_PRINTF("%u:|| = %zu; p remaining len %zu\n", i, it->cr.count(), - priority_path.end() - it); - - SAccelScheme in = move(*it); - + for (const SAccelScheme &in : priority_path) { + DEBUG_PRINTF("in: count %zu\n", in.cr.count()); if (*best < in) { DEBUG_PRINTF("worse\n"); continue; } - findBest(pb + 1, pe, in, best); + findBestInternal(pb + 1, pe, num_calls, in, best); if (curr.cr == best->cr) { return; /* could only get better by offset */ @@ -278,9 +284,23 @@ void findBest(vector >::const_iterator pb, } } +static +SAccelScheme findBest(const vector> &paths, + const CharReach &terminating) { + SAccelScheme curr(terminating, 0U); + SAccelScheme best; + size_t num_calls = 0; + findBestInternal(paths.begin(), paths.end(), &num_calls, curr, &best); + DEBUG_PRINTF("findBest completed, num_calls=%zu\n", num_calls); + DEBUG_PRINTF("selected scheme: count=%zu, class=%s, offset=%u\n", + best.cr.count(), describeClass(best.cr).c_str(), best.offset); + return best; +} + +namespace { struct DAccelScheme { - DAccelScheme(const CharReach &cr_in, u32 offset_in) - : double_cr(cr_in), double_offset(offset_in) { + DAccelScheme(CharReach cr_in, u32 offset_in) + : double_cr(std::move(cr_in)), double_offset(offset_in) { assert(double_offset <= MAX_ACCEL_DEPTH); } @@ -319,6 +339,7 @@ struct DAccelScheme { CharReach double_cr; u32 double_offset = 0; }; +} static DAccelScheme make_double_accel(DAccelScheme as, CharReach cr_1, @@ -391,11 +412,10 @@ void findDoubleBest(vector >::const_iterator pb, DEBUG_PRINTF("p len %zu\n", pb->end() - pb->begin()); - vector priority_path; + small_vector priority_path; priority_path.reserve(pb->size()); u32 i = 0; - for (vector::const_iterator p = pb->begin(); - p != pb->end() && next(p) != pb->end(); + for (auto p = pb->begin(); p != pb->end() && next(p) != pb->end(); ++p, i++) { DAccelScheme as = make_double_accel(curr, *p, *next(p), i); if (*best < as) { @@ -411,9 +431,7 @@ void findDoubleBest(vector >::const_iterator pb, best->double_byte.size(), best->double_cr.count(), best->double_offset); - for (vector::const_iterator it = priority_path.begin(); - it != priority_path.end(); 
++it) { - DAccelScheme in = move(*it); + for (const DAccelScheme &in : priority_path) { DEBUG_PRINTF("in: %zu pairs, %zu singles, offset %u\n", in.double_byte.size(), in.double_cr.count(), in.double_offset); @@ -427,14 +445,12 @@ void findDoubleBest(vector >::const_iterator pb, #ifdef DEBUG static -void dumpPaths(const vector > &paths) { - for (vector >::const_iterator p = paths.begin(); - p != paths.end(); ++p) { +void dumpPaths(const vector> &paths) { + for (const auto &path : paths) { DEBUG_PRINTF("path: ["); - for (vector::const_iterator it = p->begin(); it != p->end(); - ++it) { + for (const auto &cr : path) { printf(" ["); - describeClass(stdout, *it, 20, CC_OUT_TEXT); + describeClass(stdout, cr, 20, CC_OUT_TEXT); printf("]"); } printf(" ]\n"); @@ -545,14 +561,14 @@ DAccelScheme findBestDoubleAccelScheme(vector > paths, #define MAX_EXPLORE_PATHS 40 -AccelScheme findBestAccelScheme(vector > paths, +AccelScheme findBestAccelScheme(vector> paths, const CharReach &terminating, bool look_for_double_byte) { AccelScheme rv; if (look_for_double_byte) { DAccelScheme da = findBestDoubleAccelScheme(paths, terminating); if (da.double_byte.size() <= DOUBLE_SHUFTI_LIMIT) { - rv.double_byte = move(da.double_byte); + rv.double_byte = std::move(da.double_byte); rv.double_cr = move(da.double_cr); rv.double_offset = da.double_offset; } @@ -568,21 +584,18 @@ AccelScheme findBestAccelScheme(vector > paths, /* if we were smart we would do something netflowy on the paths to find the * best cut. But we aren't, so we will just brute force it. */ - SAccelScheme curr(terminating, 0U); - SAccelScheme best; - findBest(paths.begin(), paths.end(), curr, &best); + SAccelScheme best = findBest(paths, terminating); /* find best is a bit lazy in terms of minimising the offset, see if we can * make it better. need to find the min max offset that we need.*/ u32 offset = 0; - for (vector >::iterator p = paths.begin(); - p != paths.end(); ++p) { + for (const auto &path : paths) { u32 i = 0; - for (vector::iterator it = p->begin(); it != p->end(); - ++it, i++) { - if (it->isSubsetOf(best.cr)) { + for (const auto &cr : path) { + if (cr.isSubsetOf(best.cr)) { break; } + i++; } offset = MAX(offset, i); } @@ -620,17 +633,15 @@ AccelScheme nfaFindAccel(const NGHolder &g, const vector &verts, return AccelScheme(); /* invalid scheme */ } - vector > paths; + vector> paths; flat_set ignore_vert_set(verts.begin(), verts.end()); /* Note: we can not in general (TODO: ignore when possible) ignore entries * into the bounded repeat cyclic states as that is when the magic happens */ - for (map::const_iterator it - = br_cyclic.begin(); - it != br_cyclic.end(); ++it) { + for (auto v : br_cyclic | map_keys) { /* TODO: can allow if repeatMin <= 1 ? 
*/ - ignore_vert_set.erase(it->first); + ignore_vert_set.erase(v); } for (auto v : verts) { @@ -643,9 +654,8 @@ AccelScheme nfaFindAccel(const NGHolder &g, const vector &verts, } /* paths built wrong: reverse them */ - for (vector >::iterator it = paths.begin(); - it != paths.end(); ++it) { - reverse(it->begin(), it->end()); + for (auto &path : paths) { + reverse(path.begin(), path.end()); } return findBestAccelScheme(std::move(paths), terminating, @@ -691,134 +701,6 @@ NFAVertex get_sds_or_proxy(const NGHolder &g) { return g.startDs; } -static -NFAVertex find_next(const NFAVertex v, const NGHolder &g) { - NFAVertex res = NGHolder::null_vertex(); - for (NFAVertex u : adjacent_vertices_range(v, g)) { - if (u != v) { - res = u; - break; - } - } - return res; -} - -/** \brief Check if vertex \a v is a multi accelerable state (for a limex NFA). */ -MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g, - const vector &states, - const CompileContext &cc) { - // For a set of states to be accelerable, we basically have to have only - // one state to accelerate. - if (states.size() != 1) { - DEBUG_PRINTF("can't accelerate multiple states\n"); - return MultibyteAccelInfo(); - } - - // Get our base vertex - NFAVertex v = states[0]; - - // We need the base vertex to be a self-looping dotall leading to exactly - // one vertex. - if (!hasSelfLoop(v, g)) { - DEBUG_PRINTF("base vertex has self-loop\n"); - return MultibyteAccelInfo(); - } - - if (!g[v].char_reach.all()) { - DEBUG_PRINTF("can't accelerate anything but dot\n"); - return MultibyteAccelInfo(); - } - - if (proper_out_degree(v, g) != 1) { - DEBUG_PRINTF("can't accelerate states with multiple successors\n"); - return MultibyteAccelInfo(); - } - - // find our start vertex - NFAVertex cur = find_next(v, g); - if (cur == NGHolder::null_vertex()) { - DEBUG_PRINTF("invalid start vertex\n"); - return MultibyteAccelInfo(); - } - - bool has_offset = false; - u32 offset = 0; - CharReach cr = g[cur].char_reach; - - // if we start with a dot, we have an offset, so defer figuring out the - // real CharReach for this accel scheme - if (cr == CharReach::dot()) { - has_offset = true; - offset = 1; - } - - // figure out our offset - while (has_offset) { - // vertices have to have no self loops - if (hasSelfLoop(cur, g)) { - DEBUG_PRINTF("can't have self-loops\n"); - return MultibyteAccelInfo(); - } - - // we have to have exactly 1 successor to have this acceleration scheme - if (out_degree(cur, g) != 1) { - DEBUG_PRINTF("can't have multiple successors\n"); - return MultibyteAccelInfo(); - } - - cur = *adjacent_vertices(cur, g).first; - - // if we met a special vertex, bail out - if (is_special(cur, g)) { - DEBUG_PRINTF("can't have special vertices\n"); - return MultibyteAccelInfo(); - } - - // now, get the real char reach - if (g[cur].char_reach != CharReach::dot()) { - cr = g[cur].char_reach; - has_offset = false; - } else { - offset++; - } - } - - // now, fire up the compilation machinery - target_t ti = cc.target_info; - unsigned max_len = ti.has_avx2() ? 
MULTIACCEL_MAX_LEN_AVX2 : MULTIACCEL_MAX_LEN_SSE; - MultiaccelCompileHelper mac(cr, offset, max_len); - - while (mac.canAdvance()) { - // vertices have to have no self loops - if (hasSelfLoop(cur, g)) { - break; - } - - // we have to have exactly 1 successor to have this acceleration scheme - if (out_degree(cur, g) != 1) { - break; - } - - cur = *adjacent_vertices(cur, g).first; - - // if we met a special vertex, bail out - if (is_special(cur, g)) { - break; - } - - mac.advance(g[cur].char_reach); - } - MultibyteAccelInfo mai = mac.getBestScheme(); -#ifdef DEBUG - DEBUG_PRINTF("Multibyte acceleration scheme: type: %u offset: %u lengths: %u,%u\n", - mai.type, mai.offset, mai.len1, mai.len2); - for (size_t c = mai.cr.find_first(); c != CharReach::npos; c = mai.cr.find_next(c)) { - DEBUG_PRINTF("multibyte accel char: %zu\n", c); - } -#endif - return mai; -} - /** \brief Check if vertex \a v is an accelerable state (for a limex NFA). */ bool nfaCheckAccel(const NGHolder &g, NFAVertex v, const vector &refined_cr, diff --git a/src/nfagraph/ng_limex_accel.h b/src/nfagraph/ng_limex_accel.h index cb3d12104..f0c98db2c 100644 --- a/src/nfagraph/ng_limex_accel.h +++ b/src/nfagraph/ng_limex_accel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,9 +51,6 @@ namespace ue2 { #define MAX_MERGED_ACCEL_STOPS 200 #define ACCEL_MAX_STOP_CHAR 24 #define ACCEL_MAX_FLOATING_STOP_CHAR 192 /* accelerating sds is important */ -#define MULTIACCEL_MIN_LEN 3 -#define MULTIACCEL_MAX_LEN_SSE 15 -#define MULTIACCEL_MAX_LEN_AVX2 31 // forward-declaration of CompileContext struct CompileContext; @@ -84,11 +81,6 @@ bool nfaCheckAccel(const NGHolder &g, NFAVertex v, const std::map &br_cyclic, AccelScheme *as, bool allow_wide); -/** \brief Check if vertex \a v is a multi accelerable state (for a limex NFA). 
- */ -MultibyteAccelInfo nfaCheckMultiAccel(const NGHolder &g, - const std::vector &verts, - const CompileContext &cc); } // namespace ue2 diff --git a/src/nfagraph/ng_literal_analysis.cpp b/src/nfagraph/ng_literal_analysis.cpp index a5f3468b8..a6664b07e 100644 --- a/src/nfagraph/ng_literal_analysis.cpp +++ b/src/nfagraph/ng_literal_analysis.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -734,55 +734,30 @@ vector scoreEdges(const NGHolder &g, const flat_set &known_bad) { return scores; } -static -bool splitOffLeadingLiteral_i(const NGHolder &g, bool anch, - ue2_literal *lit_out, - NGHolder *rhs) { - NFAVertex u; - NFAVertex v; - - if (!anch) { - DEBUG_PRINTF("looking for leading floating literal\n"); - set s_succ; - insert(&s_succ, adjacent_vertices(g.start, g)); - - set sds_succ; - insert(&sds_succ, adjacent_vertices(g.startDs, g)); - - bool floating = is_subset_of(s_succ, sds_succ); - if (!floating) { - DEBUG_PRINTF("not floating\n"); - return false; - } - - sds_succ.erase(g.startDs); - if (sds_succ.size() != 1) { - DEBUG_PRINTF("branchy root\n"); - return false; - } - - u = g.startDs; - v = *sds_succ.begin(); - } else { - DEBUG_PRINTF("looking for leading anchored literal\n"); +bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, + NGHolder *rhs) { + DEBUG_PRINTF("looking for leading floating literal\n"); + set s_succ; + insert(&s_succ, adjacent_vertices(g.start, g)); - if (proper_out_degree(g.startDs, g)) { - DEBUG_PRINTF("not anchored\n"); - return false; - } + set sds_succ; + insert(&sds_succ, adjacent_vertices(g.startDs, g)); - set s_succ; - insert(&s_succ, adjacent_vertices(g.start, g)); - s_succ.erase(g.startDs); - if (s_succ.size() != 1) { - DEBUG_PRINTF("branchy root\n"); - return false; - } + bool floating = is_subset_of(s_succ, sds_succ); + if (!floating) { + DEBUG_PRINTF("not floating\n"); + return false; + } - u = g.start; - v = *s_succ.begin(); + sds_succ.erase(g.startDs); + if (sds_succ.size() != 1) { + DEBUG_PRINTF("branchy root\n"); + return false; } + NFAVertex u = g.startDs; + NFAVertex v = *sds_succ.begin(); + while (true) { DEBUG_PRINTF("validating vertex %zu\n", g[v].index); @@ -838,8 +813,7 @@ bool splitOffLeadingLiteral_i(const NGHolder &g, bool anch, assert(u != g.startDs); ue2::unordered_map rhs_map; - vector pivots; - insert(&pivots, pivots.end(), adjacent_vertices(u, g)); + vector pivots = make_vector_from(adjacent_vertices(u, g)); splitRHS(g, pivots, rhs, &rhs_map); DEBUG_PRINTF("literal is '%s' (len %zu)\n", dumpString(*lit_out).c_str(), @@ -848,17 +822,6 @@ bool splitOffLeadingLiteral_i(const NGHolder &g, bool anch, return true; } -bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, - NGHolder *rhs) { - return splitOffLeadingLiteral_i(g, false, lit_out, rhs); -} - -bool splitOffAnchoredLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, - NGHolder *rhs) { - return splitOffLeadingLiteral_i(g, true, lit_out, rhs); -} - - bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out) { if (in_degree(g.acceptEod, g) != 1) { return false; diff --git a/src/nfagraph/ng_literal_analysis.h b/src/nfagraph/ng_literal_analysis.h index 6fd9c5251..6bb875561 100644 --- a/src/nfagraph/ng_literal_analysis.h +++ b/src/nfagraph/ng_literal_analysis.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * 
Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -87,9 +87,6 @@ u64a sanitizeAndCompressAndScore(std::set &s); bool splitOffLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, NGHolder *rhs); -bool splitOffAnchoredLeadingLiteral(const NGHolder &g, ue2_literal *lit_out, - NGHolder *rhs); - bool getTrailingLiteral(const NGHolder &g, ue2_literal *lit_out); /** \brief Returns true if the given literal is the only thing in the graph, diff --git a/src/nfagraph/ng_literal_component.cpp b/src/nfagraph/ng_literal_component.cpp index e3cfe8678..de05e4909 100644 --- a/src/nfagraph/ng_literal_component.cpp +++ b/src/nfagraph/ng_literal_component.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,12 +30,15 @@ * \brief Literal Component Splitting. Identifies literals that span the * graph and moves them into Rose. */ + +#include "ng_literal_component.h" + #include "grey.h" #include "ng.h" -#include "ng_literal_component.h" #include "ng_prune.h" #include "ng_util.h" #include "ue2common.h" +#include "compiler/compiler.h" #include "rose/rose_build.h" #include "util/container.h" #include "util/graph.h" @@ -47,8 +50,8 @@ using namespace std; namespace ue2 { static -bool isLiteralChar(const NGWrapper &g, NFAVertex v, - bool &nocase, bool &casefixed) { +bool isLiteralChar(const NGHolder &g, NFAVertex v, bool &nocase, + bool &casefixed) { const CharReach &cr = g[v].char_reach; const size_t num = cr.count(); if (num > 2) { @@ -93,7 +96,7 @@ void addToString(string &s, const NGHolder &g, NFAVertex v) { } static -bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored, +bool splitOffLiteral(NG &ng, NGHolder &g, NFAVertex v, const bool anchored, set &dead) { DEBUG_PRINTF("examine vertex %zu\n", g[v].index); bool nocase = false, casefixed = false; @@ -185,7 +188,7 @@ bool splitOffLiteral(NG &ng, NGWrapper &g, NFAVertex v, const bool anchored, } /** \brief Split off literals. True if any changes were made to the graph. */ -bool splitOffLiterals(NG &ng, NGWrapper &g) { +bool splitOffLiterals(NG &ng, NGHolder &g) { if (!ng.cc.grey.allowLiteral) { return false; } diff --git a/src/nfagraph/ng_literal_component.h b/src/nfagraph/ng_literal_component.h index dc177c404..1f284ce36 100644 --- a/src/nfagraph/ng_literal_component.h +++ b/src/nfagraph/ng_literal_component.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,10 +37,10 @@ namespace ue2 { class NG; -class NGWrapper; +class NGHolder; /** \brief Split off literals. True if any changes were made to the graph. 
*/ -bool splitOffLiterals(NG &ng, NGWrapper &graph); +bool splitOffLiterals(NG &ng, NGHolder &g); } // namespace ue2 diff --git a/src/nfagraph/ng_mcclellan.cpp b/src/nfagraph/ng_mcclellan.cpp index 375086a46..9448a0bf3 100644 --- a/src/nfagraph/ng_mcclellan.cpp +++ b/src/nfagraph/ng_mcclellan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,8 @@ #include "util/bitfield.h" #include "util/determinise.h" #include "util/graph_range.h" +#include "util/hash.h" +#include "util/hash_dynamic_bitset.h" #include "util/make_unique.h" #include "util/report_manager.h" #include "util/ue2_containers.h" @@ -377,7 +379,9 @@ class Automaton_Base { NFAVertex v = sq.first; u32 vert_id = graph[v].index; squash.set(vert_id); - squash_mask[vert_id] = shrinkStateSet(sq.second); + squash_mask[vert_id] + = Automaton_Traits::copy_states(std::move(sq.second), + numStates); } cr_by_index = populateCR(graph, v_by_index, alpha); @@ -385,21 +389,11 @@ class Automaton_Base { dynamic_bitset<> temp(numStates); markToppableStarts(graph, unused, single_trigger, triggers, &temp); - toppable = Automaton_Traits::copy_states(temp, numStates); + toppable = Automaton_Traits::copy_states(std::move(temp), + numStates); } } -private: - // Convert an NFAStateSet (as used by the squash code) into a StateSet - StateSet shrinkStateSet(const NFAStateSet &in) const { - StateSet out = Automaton_Traits::init_states(numStates); - for (size_t i = in.find_first(); i != in.npos && i < out.size(); - i = in.find_next(i)) { - out.set(i); - } - return out; - } - public: void transition(const StateSet &in, StateSet *next) { transition_graph(*this, v_by_index, in, next); @@ -467,13 +461,13 @@ class Automaton_Base { struct Big_Traits { using StateSet = dynamic_bitset<>; - using StateMap = map; + using StateMap = unordered_map; static StateSet init_states(u32 num) { return StateSet(num); } - static StateSet copy_states(const dynamic_bitset<> &in, UNUSED u32 num) { + static StateSet copy_states(dynamic_bitset<> in, UNUSED u32 num) { assert(in.size() == num); return in; } diff --git a/src/nfagraph/ng_prefilter.cpp b/src/nfagraph/ng_prefilter.cpp index 012b4e8d8..3cd9d06d8 100644 --- a/src/nfagraph/ng_prefilter.cpp +++ b/src/nfagraph/ng_prefilter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Prefilter Reductions. * * This file contains routines for reducing the size of an NFA graph that we @@ -92,13 +93,13 @@ struct RegionInfo { u32 id; //!< region id deque vertices; //!< vertices in the region CharReach reach; //!< union of region reach - depth minWidth = 0; //!< min width of region subgraph - depth maxWidth = depth::infinity(); //!< max width of region subgraph + depth minWidth{0}; //!< min width of region subgraph + depth maxWidth{depth::infinity()}; //!< max width of region subgraph bool atBoundary = false; //!< region is next to an accept // Bigger score is better. size_t score() const { - // FIXME: charreach should be a signal? + // TODO: charreach should be a signal? 
size_t numVertices = vertices.size(); if (atBoundary) { return numVertices - min(PENALTY_BOUNDARY, numVertices); diff --git a/src/nfagraph/ng_puff.cpp b/src/nfagraph/ng_puff.cpp index 7281471fc..984518b0f 100644 --- a/src/nfagraph/ng_puff.cpp +++ b/src/nfagraph/ng_puff.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -94,8 +94,7 @@ void wireNewAccepts(NGHolder &g, NFAVertex head, static bool isFixedDepth(const NGHolder &g, NFAVertex v) { // If the vertex is reachable from startDs, it can't be fixed depth. - vector depthFromStartDs; - calcDepthsFrom(g, g.startDs, depthFromStartDs); + auto depthFromStartDs = calcDepthsFrom(g, g.startDs); u32 idx = g[v].index; const DepthMinMax &ds = depthFromStartDs.at(idx); @@ -104,8 +103,7 @@ bool isFixedDepth(const NGHolder &g, NFAVertex v) { return false; } - vector depthFromStart; - calcDepthsFrom(g, g.start, depthFromStart); + auto depthFromStart = calcDepthsFrom(g, g.start); /* we can still consider the head of a puff chain as at fixed depth if * it has a self-loop: so we look at all the preds of v (other than v diff --git a/src/nfagraph/ng_region.cpp b/src/nfagraph/ng_region.cpp index 0ecd7bd63..91904b466 100644 --- a/src/nfagraph/ng_region.cpp +++ b/src/nfagraph/ng_region.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -70,65 +70,61 @@ using namespace std; namespace ue2 { -typedef ue2::unordered_set BackEdgeSet; -typedef boost::filtered_graph> - AcyclicGraph; +using BackEdgeSet = unordered_set; +using AcyclicGraph = + boost::filtered_graph>; namespace { struct exit_info { explicit exit_info(NFAVertex v) : exit(v) {} NFAVertex exit; - ue2::unordered_set open; + flat_set open; }; } static void checkAndAddExitCandidate(const AcyclicGraph &g, - const ue2::unordered_set &r, - NFAVertex v, vector *exits) { - // set when we find our first candidate. 
-    ue2::unordered_set<NFAVertex> *open = nullptr;
+                              const unordered_set<NFAVertex> &r, NFAVertex v,
+                              vector<exit_info> &exits) {
+    exit_info v_exit(v);
+    auto &open = v_exit.open;
     /* find the set of vertices reachable from v which are not in r */
     for (auto w : adjacent_vertices_range(v, g)) {
-        if (!contains(r, NFAVertex(w))) {
-            if (!open) {
-                exits->push_back(exit_info(NFAVertex(v)));
-                open = &exits->back().open;
-            }
-            open->insert(NFAVertex(w));
+        if (!contains(r, w)) {
+            open.insert(w);
         }
     }

-    if (open) {
+    if (!open.empty()) {
         DEBUG_PRINTF("exit %zu\n", g[v].index);
+        exits.push_back(move(v_exit));
     }
 }

 static
-void findExits(const AcyclicGraph &g, const ue2::unordered_set<NFAVertex> &r,
-               vector<exit_info> *exits) {
-    exits->clear();
-
+void findExits(const AcyclicGraph &g, const unordered_set<NFAVertex> &r,
+               vector<exit_info> &exits) {
+    exits.clear();
     for (auto v : r) {
         checkAndAddExitCandidate(g, r, v, exits);
     }
 }

 static
-void refineExits(const AcyclicGraph &g, const ue2::unordered_set<NFAVertex> &r,
-                 NFAVertex new_v, vector<exit_info> *exits) {
-    for (u32 i = 0; i < exits->size(); i++) {
-        (*exits)[i].open.erase(new_v); /* new_v is no long an open edge */
-        if ((*exits)[i].open.empty()) { /* no open edges: no longer an exit */
-            /* shuffle to back and kill */
-            (*exits)[i] = exits->back();
-            exits->pop_back();
-            i--;
-        }
+void refineExits(const AcyclicGraph &g, const unordered_set<NFAVertex> &r,
+                 NFAVertex new_v, vector<exit_info> &exits) {
+    /* new_v is no longer an open edge */
+    for (auto &exit : exits) {
+        exit.open.erase(new_v);
     }
+    /* no open edges: no longer an exit */
+    exits.erase(remove_if(exits.begin(), exits.end(),
+                          [&](const exit_info &exit) { return exit.open.empty(); }),
+                exits.end());
+
     checkAndAddExitCandidate(g, r, new_v, exits);
 }

@@ -136,7 +132,7 @@ void refineExits(const AcyclicGraph &g, const ue2::unordered_set<NFAVertex> &r,
  */
 static
 bool exitValid(UNUSED const AcyclicGraph &g, const vector<exit_info> &exits,
-               const ue2::unordered_set<NFAVertex> &open_jumps) {
+               const flat_set<NFAVertex> &open_jumps) {
     if (exits.empty() || (exits.size() < 2 && open_jumps.empty())) {
         return true;
     }
@@ -165,8 +161,8 @@ bool exitValid(UNUSED const AcyclicGraph &g, const vector<exit_info> &exits,
 }

 static
-void setRegion(const ue2::unordered_set<NFAVertex> &r, u32 rid,
-               ue2::unordered_map<NFAVertex, u32> &regions) {
+void setRegion(const unordered_set<NFAVertex> &r, u32 rid,
+               unordered_map<NFAVertex, u32> &regions) {
     for (auto v : r) {
         regions[v] = rid;
     }
@@ -176,34 +172,36 @@ static
 void buildInitialCandidate(const AcyclicGraph &g,
                            vector<NFAVertex>::const_reverse_iterator &it,
                            const vector<NFAVertex>::const_reverse_iterator &ite,
-                           ue2::unordered_set<NFAVertex> *candidate,
+                           unordered_set<NFAVertex> &candidate,
                            /* in exits of prev region;
                             * out exits from candidate */
-                           vector<exit_info> *exits,
-                           ue2::unordered_set<NFAVertex> *open_jumps) {
+                           vector<exit_info> &exits,
+                           flat_set<NFAVertex> &open_jumps) {
     if (it == ite) {
-        candidate->clear();
-        exits->clear();
+        candidate.clear();
+        exits.clear();
         return;
     }

-    if (exits->empty()) {
+    if (exits.empty()) {
         DEBUG_PRINTF("odd\n");
-        candidate->clear();
+        candidate.clear();
         DEBUG_PRINTF("adding %zu to initial\n", g[*it].index);
-        candidate->insert(*it);
-        open_jumps->erase(*it);
-        checkAndAddExitCandidate(g, *candidate, *it, exits);
+        candidate.insert(*it);
+        open_jumps.erase(*it);
+        checkAndAddExitCandidate(g, candidate, *it, exits);
         ++it;
         return;
     }

-    ue2::unordered_set<NFAVertex> enters = (*exits)[0].open;
-    candidate->clear();
+    // Note: findExits() will clear exits, so it's safe to mutate/move its
+    // elements here.
+    auto &enters = exits.front().open;
+    candidate.clear();

     for (; it != ite; ++it) {
         DEBUG_PRINTF("adding %zu to initial\n", g[*it].index);
-        candidate->insert(*it);
+        candidate.insert(*it);
         if (contains(enters, *it)) {
             break;
         }
@@ -211,33 +209,34 @@ void buildInitialCandidate(const AcyclicGraph &g,

     if (it != ite) {
         enters.erase(*it);
-        open_jumps->swap(enters);
-        DEBUG_PRINTF("oj size = %zu\n", open_jumps->size());
+        open_jumps = move(enters);
+        DEBUG_PRINTF("oj size = %zu\n", open_jumps.size());
         ++it;
     } else {
-        open_jumps->clear();
+        open_jumps.clear();
     }

-    findExits(g, *candidate, exits);
+    findExits(g, candidate, exits);
 }

 static
 void findDagLeaders(const NGHolder &h, const AcyclicGraph &g,
                     const vector<NFAVertex> &topo,
-                    ue2::unordered_map<NFAVertex, u32> &regions) {
+                    unordered_map<NFAVertex, u32> &regions) {
     assert(!topo.empty());
     u32 curr_id = 0;
-    vector<NFAVertex>::const_reverse_iterator t_it = topo.rbegin();
-    vector<exit_info> exits;
-    ue2::unordered_set<NFAVertex> candidate;
-    ue2::unordered_set<NFAVertex> open_jumps;
+    auto t_it = topo.rbegin();
+    unordered_set<NFAVertex> candidate;
+    flat_set<NFAVertex> open_jumps;
     DEBUG_PRINTF("adding %zu to current\n", g[*t_it].index);
     assert(t_it != topo.rend());
     candidate.insert(*t_it++);
     DEBUG_PRINTF("adding %zu to current\n", g[*t_it].index);
     assert(t_it != topo.rend());
     candidate.insert(*t_it++);
-    findExits(g, candidate, &exits);
+
+    vector<exit_info> exits;
+    findExits(g, candidate, exits);

     while (t_it != topo.rend()) {
         assert(!candidate.empty());
@@ -253,14 +252,14 @@ void findDagLeaders(const NGHolder &h, const AcyclicGraph &g,
                 DEBUG_PRINTF("setting region %u\n", curr_id);
             }
             setRegion(candidate, curr_id++, regions);
-            buildInitialCandidate(g, t_it, topo.rend(), &candidate, &exits,
-                                  &open_jumps);
+            buildInitialCandidate(g, t_it, topo.rend(), candidate, exits,
+                                  open_jumps);
         } else {
             NFAVertex curr = *t_it;
             DEBUG_PRINTF("adding %zu to current\n", g[curr].index);
             candidate.insert(curr);
             open_jumps.erase(curr);
-            refineExits(g, candidate, *t_it, &exits);
+            refineExits(g, candidate, *t_it, exits);
             DEBUG_PRINTF("  open jumps %zu exits %zu\n", open_jumps.size(),
                          exits.size());
             ++t_it;
@@ -273,7 +272,7 @@ static
 void mergeUnderBackEdges(const NGHolder &g, const vector<NFAVertex> &topo,
                          const BackEdgeSet &backEdges,
-                         ue2::unordered_map<NFAVertex, u32> &regions) {
+                         unordered_map<NFAVertex, u32> &regions) {
     for (const auto &e : backEdges) {
         NFAVertex u = source(e, g);
         NFAVertex v = target(e, g);
@@ -343,7 +342,7 @@ void reorderSpecials(const NGHolder &w, const AcyclicGraph &acyclic_g,
 static
 void liftSinks(const AcyclicGraph &acyclic_g, vector<NFAVertex> &topoOrder) {
-    ue2::unordered_set<NFAVertex> sinks;
+    unordered_set<NFAVertex> sinks;
     for (auto v : vertices_range(acyclic_g)) {
         if (is_special(v, acyclic_g)) {
             continue;
         }
@@ -388,7 +387,7 @@ void liftSinks(const AcyclicGraph &acyclic_g, vector<NFAVertex> &topoOrder) {
         }
         NFAVertex s = *ri;
         DEBUG_PRINTF("handling sink %zu\n", acyclic_g[s].index);
-        ue2::unordered_set<NFAVertex> parents;
+        unordered_set<NFAVertex> parents;
         for (const auto &e : in_edges_range(s, acyclic_g)) {
             parents.insert(NFAVertex(source(e, acyclic_g)));
         }
@@ -416,6 +415,7 @@ vector<NFAVertex> buildTopoOrder(const NGHolder &w,
                                  const AcyclicGraph &acyclic_g,
                                  vector<boost::default_color_type> &colours) {
     vector<NFAVertex> topoOrder;
+    topoOrder.reserve(num_vertices(w));

     topological_sort(acyclic_g, back_inserter(topoOrder),
                      color_map(make_iterator_property_map(colours.begin(),
@@ -438,7 +438,7 @@ vector<NFAVertex> buildTopoOrder(const NGHolder &w,
     return topoOrder;
 }

-ue2::unordered_map<NFAVertex, u32> assignRegions(const NGHolder &g) {
+unordered_map<NFAVertex, u32> assignRegions(const NGHolder &g) {
     assert(hasCorrectlyNumberedVertices(g));
     const u32 numVertices = num_vertices(g);
DEBUG_PRINTF("assigning regions for %u vertices in holder\n", numVertices); @@ -460,7 +460,7 @@ ue2::unordered_map assignRegions(const NGHolder &g) { vector topoOrder = buildTopoOrder(g, acyclic_g, colours); // Everybody starts in region 0. - ue2::unordered_map regions; + unordered_map regions; regions.reserve(numVertices); for (auto v : vertices_range(g)) { regions.emplace(v, 0); diff --git a/src/nfagraph/ng_repeat.cpp b/src/nfagraph/ng_repeat.cpp index a16e2715b..60ad22009 100644 --- a/src/nfagraph/ng_repeat.cpp +++ b/src/nfagraph/ng_repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -105,8 +105,8 @@ typedef boost::filtered_graph> RepeatGraph; struct ReachSubgraph { vector vertices; - depth repeatMin = 0; - depth repeatMax = 0; + depth repeatMin{0}; + depth repeatMax{0}; u32 minPeriod = 1; bool is_reset = false; enum RepeatType historyType = REPEAT_RING; @@ -118,13 +118,12 @@ struct ReachSubgraph { static void findInitDepths(const NGHolder &g, ue2::unordered_map &depths) { - vector d; - calcDepths(g, d); + auto d = calcDepths(g); for (auto v : vertices_range(g)) { - u32 idx = g[v].index; + size_t idx = g[v].index; assert(idx < d.size()); - depths.insert(make_pair(v, d[idx])); + depths.emplace(v, d[idx]); } } @@ -296,9 +295,8 @@ void splitSubgraph(const NGHolder &g, const deque &verts, ue2::unordered_map verts_map; // in g -> in verts_g fillHolder(&verts_g, g, verts, &verts_map); - NFAUndirectedGraph ug; ue2::unordered_map old2new; - createUnGraph(verts_g, true, true, ug, old2new); + auto ug = createUnGraph(verts_g, true, true, old2new); ue2::unordered_map repeatMap; @@ -587,8 +585,8 @@ bool processSubgraph(const NGHolder &g, ReachSubgraph &rsi, range.first, range.second); return false; } - rsi.repeatMin = range.first; - rsi.repeatMax = range.second; + rsi.repeatMin = depth(range.first); + rsi.repeatMax = depth(range.second); // If we've got a self-loop anywhere, we've got inf max. if (anySelfLoop(g, rsi.vertices.begin(), rsi.vertices.end())) { @@ -1020,9 +1018,8 @@ void buildReachSubgraphs(const NGHolder &g, vector &rs, return; } - NFAUndirectedGraph ug; unordered_map old2new; - createUnGraph(rg, true, true, ug, old2new); + auto ug = createUnGraph(rg, true, true, old2new); unordered_map repeatMap; @@ -2108,7 +2105,7 @@ void populateFixedTopInfo(const map &fixed_depth_tops, td = depth::infinity(); break; } - depth td_t = fixed_depth_tops.at(top); + depth td_t(fixed_depth_tops.at(top)); if (td == td_t) { continue; } else if (td == depth::infinity()) { @@ -2481,7 +2478,7 @@ bool isPureRepeat(const NGHolder &g, PureRepeat &repeat) { // have the same report set as the vertices in the repeat. 
if (repeat.bounds.min == depth(1) && g[g.start].reports == g[v].reports) { - repeat.bounds.min = 0; + repeat.bounds.min = depth(0); DEBUG_PRINTF("graph is %s repeat\n", repeat.bounds.str().c_str()); } else { DEBUG_PRINTF("not a supported repeat\n"); diff --git a/src/nfagraph/ng_reports.cpp b/src/nfagraph/ng_reports.cpp index 3d18a6209..4e9b498df 100644 --- a/src/nfagraph/ng_reports.cpp +++ b/src/nfagraph/ng_reports.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -65,6 +65,26 @@ bool can_exhaust(const NGHolder &g, const ReportManager &rm) { return true; } +void set_report(NGHolder &g, ReportID internal_report) { + // First, wipe the report IDs on all vertices. + for (auto v : vertices_range(g)) { + g[v].reports.clear(); + } + + // Any predecessors of accept get our id. + for (auto v : inv_adjacent_vertices_range(g.accept, g)) { + g[v].reports.insert(internal_report); + } + + // Same for preds of acceptEod, except accept itself. + for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { + if (v == g.accept) { + continue; + } + g[v].reports.insert(internal_report); + } +} + /** Derive a maximum offset for the graph from the max_offset values of its * reports. Returns MAX_OFFSET for inf. */ u64a findMaxOffset(const NGHolder &g, const ReportManager &rm) { diff --git a/src/nfagraph/ng_reports.h b/src/nfagraph/ng_reports.h index 3047ff0b4..31c953088 100644 --- a/src/nfagraph/ng_reports.h +++ b/src/nfagraph/ng_reports.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,6 +48,10 @@ std::set all_reports(const NGHolder &g); /** True if *all* reports in the graph are exhaustible. */ bool can_exhaust(const NGHolder &g, const ReportManager &rm); +/** Replaces all existing reports on the holder with the provided internal + * report id. */ +void set_report(NGHolder &g, ReportID internal_report); + /** Derive a maximum offset for the graph from the max_offset values of its * reports. Returns MAX_OFFSET for inf. */ u64a findMaxOffset(const NGHolder &g, const ReportManager &rm); diff --git a/src/nfagraph/ng_rose.cpp b/src/nfagraph/ng_rose.cpp deleted file mode 100644 index 7066ab27d..000000000 --- a/src/nfagraph/ng_rose.cpp +++ /dev/null @@ -1,2977 +0,0 @@ -/* - * Copyright (c) 2015-2017, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Rose construction from NGHolder. - */ - -// #define DEBUG -// #define DEBUG_ROSE -#include "ng_rose.h" - -#include "grey.h" -#include "ng_depth.h" -#include "ng_dominators.h" -#include "ng_equivalence.h" -#include "ng_holder.h" -#include "ng_is_equal.h" -#include "ng_literal_analysis.h" -#include "ng_netflow.h" -#include "ng_prune.h" -#include "ng_redundancy.h" -#include "ng_region.h" -#include "ng_reports.h" -#include "ng_split.h" -#include "ng_util.h" -#include "ng_width.h" -#include "rose/rose_build.h" -#include "rose/rose_build_util.h" -#include "rose/rose_in_dump.h" -#include "rose/rose_in_graph.h" -#include "rose/rose_in_util.h" -#include "util/compare.h" -#include "util/compile_context.h" -#include "util/container.h" -#include "util/graph.h" -#include "util/graph_range.h" -#include "util/make_unique.h" -#include "util/order_check.h" -#include "util/ue2string.h" -#include "util/ue2_containers.h" - -#include -#include -#include -#include - -#define NDEBUG_PRINTF(x, ...) \ - do { if (0) { DEBUG_PRINTF(x, ## __VA_ARGS__); } } while (0) - -using namespace std; - -namespace ue2 { - -/** - * Maps vertices in the original graph to vertices on edge graphs. Each edge - * graph should contain at most one copy of the vertex. Multiple images for a - * vertex arise after we split on multiple literals - in this cases all edges - * should share a common graph. - * - * If, when an edge is split, a vertex ends up in both the LHS and RHS then only - * the LHS is tracked. This is because in general we want to simplify the LHS - * and allow complexity to be pushed further back. - */ -typedef ue2::unordered_map > > - vdest_map_t; - -typedef ue2::unordered_map > vsrc_map_t; - -/** - * \brief Maximum width of the character class usable as an escape class. 
- */ -static const u32 MAX_ESCAPE_CHARS = 20; - -static -u32 maxDelay(const CompileContext &cc) { - if (!cc.streaming) { - return MO_INVALID_IDX; - } - return cc.grey.maxHistoryAvailable; -} - -static -bool createsAnchoredLHS(const NGHolder &g, const vector &vv, - const vector &depths, - const Grey &grey, depth max_depth = depth::infinity()) { - max_depth = min(max_depth, depth(grey.maxAnchoredRegion)); - - for (auto v : vv) { - /* avoid issues of self loops blowing out depths: - * look at preds, add 1 */ - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (u == v) { - continue; - } - - u32 idx = g[u].index; - assert(idx < depths.size()); - if (maxDistFromStartOfData(depths.at(idx)) >= max_depth) { - return false; - } - } - } - return true; -} - -static -bool createsTransientLHS(const NGHolder &g, const vector &vv, - const vector &depths, - const Grey &grey) { - const depth max_depth(grey.maxHistoryAvailable); - - for (auto v : vv) { - /* avoid issues of self loops blowing out depths: - * look at preds, add 1 */ - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (u == v) { - continue; - } - - u32 idx = g[u].index; - assert(idx < depths.size()); - if (maxDistFromInit(depths.at(idx)) >= max_depth) { - return false; - } - } - } - return true; -} - -static -bool isLHSUsablyAnchored(const NGHolder &g, - const vector &depths, - const Grey &grey) { - assert(in_degree(g.acceptEod, g) == 1); - - vector accepts; - insert(&accepts, accepts.end(), inv_adjacent_vertices(g.accept, g)); - - bool rv = createsAnchoredLHS(g, accepts, depths, grey); - DEBUG_PRINTF("lhs is %susably anchored\n", rv ? "" : "not "); - return rv; -} - -static -bool isLHSTransient(const NGHolder &g, - const vector &depths, - const Grey &grey) { - assert(in_degree(g.acceptEod, g) == 1); - - vector accepts; - insert(&accepts, accepts.end(), inv_adjacent_vertices(g.accept, g)); - - bool rv = createsTransientLHS(g, accepts, depths, grey); - DEBUG_PRINTF("lhs is %stransient\n", rv ? "" : "not "); - return rv; -} - -namespace { - -/** - * Information on a cut: vertices and literals. - */ -struct VertLitInfo { - VertLitInfo(NFAVertex v, const set &litlit) - : vv(vector(1, v)), lit(litlit) {} - VertLitInfo(const vector &vvvv, const set &litlit) - : vv(vvvv), lit(litlit) {} - vector vv; - set lit; -}; - -/** - * A factory for candidate simple cuts (literals/vertices). - */ -class LitCollection : boost::noncopyable { - vector> lits; /**< sorted list of potential cuts */ - const NGHolder &g; /**< graph on which cuts are found */ - const vector &depths; /**< depth information for g */ - const ue2::unordered_map ®ion_map; /**< region map for g */ - - /** Set of vertices to avoid selecting as end vertices for cuts as previous - * cuts overlap them. This is solely to prevent us picking literal sets - * which do not add significant value. */ - ue2::unordered_set poisoned; - - /** Back-edges in g. */ - ue2::unordered_map > back_edges; - - const Grey &grey; - bool seeking_transient; - bool seeking_anchored; - - void poisonLHS(const VertLitInfo &picked); - void poisonLitVerts(const VertLitInfo &picked); - void poisonCandidates(const VertLitInfo &picked); - - friend class LitComparator; - -public: - LitCollection(const NGHolder &g_in, const vector &depths_in, - const ue2::unordered_map ®ion_map_in, - const set &ap, const set &ap_raw, - u32 min_len, bool desperation, const CompileContext &cc, - bool override_literal_quality_check = false); - - /**< Returns the next candidate cut. 
Cut still needs to be inspected for - * complete envelopment. */ - unique_ptr pickNext(void); -}; - -/** - * \brief Comparator class for sorting LitCollection::lits. - * - * This is separated out from LitCollection itself as passing LitCollection to - * std::sort() would incur a (potentially expensive) copy. - */ -class LitComparator { -public: - explicit LitComparator(const LitCollection &lc_in) : lc(lc_in) {} - bool operator()(const unique_ptr &a, - const unique_ptr &b) const { - assert(a && b); - - if (lc.seeking_anchored) { - bool a_anchored = - createsAnchoredLHS(lc.g, a->vv, lc.depths, lc.grey); - bool b_anchored = - createsAnchoredLHS(lc.g, b->vv, lc.depths, lc.grey); - - if (a_anchored != b_anchored) { - return a_anchored < b_anchored; - } - } - - if (lc.seeking_transient) { - bool a_transient = - createsTransientLHS(lc.g, a->vv, lc.depths, lc.grey); - bool b_transient = - createsTransientLHS(lc.g, b->vv, lc.depths, lc.grey); - - if (a_transient != b_transient) { - return a_transient < b_transient; - } - } - - u64a score_a = scoreSet(a->lit); - u64a score_b = scoreSet(b->lit); - - if (score_a != score_b) { - return score_a > score_b; - } - - /* vertices should only be in one candidate cut */ - assert(a->vv == b->vv || a->vv.front() != b->vv.front()); - return lc.g[a->vv.front()].index > - lc.g[b->vv.front()].index; - } - -private: - const LitCollection &lc; -}; - -static -size_t shorter_than(const set &s, size_t limit) { - size_t count = 0; - - for (const auto &lit : s) { - if (lit.length() < limit) { - count++; - } - } - - return count; -} - -static -u32 min_len(const set &s) { - u32 rv = ~0U; - - for (const auto &lit : s) { - rv = min(rv, (u32)lit.length()); - } - - return rv; -} - -static -u32 max_len(const set &s) { - u32 rv = 0; - - for (const auto &lit : s) { - rv = max(rv, (u32)lit.length()); - } - - return rv; -} - -static -u32 min_period(const set &s) { - u32 rv = ~0U; - - for (const auto &lit : s) { - rv = min(rv, (u32)minStringPeriod(lit)); - } - DEBUG_PRINTF("min period %u\n", rv); - return rv; -} - -static -bool validateRoseLiteralSetQuality(const set &s, u64a score, - u32 min_allowed_len, bool desperation, - bool override_literal_quality_check) { - if (!override_literal_quality_check && score >= NO_LITERAL_AT_EDGE_SCORE) { - DEBUG_PRINTF("candidate is too bad %llu/%zu\n", score, s.size()); - return false; - } - - assert(!s.empty()); - if (s.empty()) { - DEBUG_PRINTF("candidate is too bad/something went wrong\n"); - return false; - } - - u32 s_min_len = min_len(s); - u32 s_min_period = min_period(s); - size_t short_count = shorter_than(s, 5); - - DEBUG_PRINTF("cand '%s': score %llu count=%zu min_len=%u min_period=%u" - " short_count=%zu desp=%d\n", - dumpString(*s.begin()).c_str(), score, s.size(), s_min_len, - s_min_period, short_count, (int)desperation); - - bool ok = true; - - if (s.size() > 10 /* magic number is magic */ - || s_min_len < min_allowed_len - || (s_min_period <= 1 && !override_literal_quality_check - && min_allowed_len != 1)) { - ok = false; - } - - if (!ok && desperation - && s.size() <= 20 /* more magic numbers are magical */ - && (s_min_len > 5 || (s_min_len > 2 && short_count <= 10)) - && s_min_period > 1) { - DEBUG_PRINTF("candidate is ok\n"); - ok = true; - } - - if (!ok && desperation - && s.size() <= 50 /* more magic numbers are magical */ - && s_min_len > 10 - && s_min_period > 1) { - DEBUG_PRINTF("candidate is ok\n"); - ok = true; - } - - if (!ok) { - DEBUG_PRINTF("candidate is too bad\n"); - return false; - } - - return true; -} - -static 
UNUSED -void dumpRoseLiteralSet(const set &s) { - for (UNUSED const auto &lit : s) { - DEBUG_PRINTF(" lit: %s\n", dumpString(lit).c_str()); - } -} - -static -void getSimpleRoseLiterals(const NGHolder &g, const set &a_dom, - vector> *lits, - u32 min_allowed_len, bool desperation, - bool override_literal_quality_check) { - map scores; - map> lit_info; - set s; - - for (auto v : a_dom) { - s = getLiteralSet(g, v, true); /* RHS will take responsibility for any - revisits to the target vertex */ - - if (s.empty()) { - DEBUG_PRINTF("candidate is too bad\n"); - continue; - } - - DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size()); - dumpRoseLiteralSet(s); - u64a score = compressAndScore(s); - - if (!validateRoseLiteralSetQuality(s, score, min_allowed_len, - desperation, - override_literal_quality_check)) { - continue; - } - - DEBUG_PRINTF("candidate is a candidate\n"); - scores[v] = score; - lit_info.insert(make_pair(v, ue2::make_unique(v, s))); - } - - /* try to filter out cases where appending some characters produces worse - * literals. Only bother to look back one byte, TODO make better */ - for (auto u : a_dom) { - if (out_degree(u, g) != 1 || !scores[u]) { - continue; - } - NFAVertex v = *adjacent_vertices(u, g).first; - if (contains(scores, v) && scores[v] >= scores[u]) { - DEBUG_PRINTF("killing off v as score %llu >= %llu\n", - scores[v], scores[u]); - lit_info.erase(v); - } - } - - lits->reserve(lit_info.size()); - for (auto &m : lit_info) { - lits->push_back(move(m.second)); - } - DEBUG_PRINTF("%zu candidate literal sets\n", lits->size()); -} - -static -void getRegionRoseLiterals(const NGHolder &g, - const ue2::unordered_map ®ion_map, - const set &a_dom_raw, - vector> *lits, - u32 min_allowed_len, bool desperation, - bool override_literal_quality_check) { - /* This allows us to get more places to chop the graph as we are not limited - to points where there is a single vertex to split. */ - - /* TODO: operate over 'proto-regions' which ignore back edges */ - - set mand, optional; - map > exits; - - for (auto v : vertices_range(g)) { - assert(contains(region_map, v)); - const u32 region = region_map.at(v); - - if (is_any_start(v, g) || region == 0) { - continue; - } - - if (is_any_accept(v, g)) { - continue; - } - - if (isRegionExit(g, v, region_map)) { - exits[region].push_back(v); - } - - if (isRegionEntry(g, v, region_map)) { - // Determine whether this region is mandatory or optional. We only - // need to do this check for the first entry vertex we encounter - // for this region. - if (!contains(mand, region) && !contains(optional, region)) { - if (isOptionalRegion(g, v, region_map)) { - optional.insert(region); - } else { - mand.insert(region); - } - } - } - } - - for (const auto &m : exits) { - if (0) { - next_cand: - continue; - } - - const u32 region = m.first; - const vector &vv = m.second; - assert(!vv.empty()); - - if (!contains(mand, region)) { - continue; - } - - for (auto v : vv) { - /* if an exit is in a_dom_raw, the region is already handled well - * by getSimpleRoseLiterals */ - if (contains(a_dom_raw, v)) { - goto next_cand; - } - } - - /* the final region may not have a neat exit. 
validate that all exits - * have an edge to each accept or none do */ - bool edge_to_a = edge(vv[0], g.accept, g).second; - bool edge_to_aeod = edge(vv[0], g.acceptEod, g).second; - const auto &reports = g[vv[0]].reports; - for (auto v : vv) { - if (edge_to_a != edge(v, g.accept, g).second) { - goto next_cand; - } - - if (edge_to_aeod != edge(v, g.acceptEod, g).second) { - goto next_cand; - } - - if (g[v].reports != reports) { - goto next_cand; - } - } - - DEBUG_PRINTF("inspecting region %u\n", region); - set s; - for (auto v : vv) { - DEBUG_PRINTF(" exit vertex: %zu\n", g[v].index); - /* Note: RHS can not be depended on to take all subsequent revisits - * to this vertex */ - set ss = getLiteralSet(g, v, false); - if (ss.empty()) { - DEBUG_PRINTF("candidate is too bad\n"); - goto next_cand; - } - insert(&s, ss); - } - - assert(!s.empty()); - - DEBUG_PRINTF("|candidate raw literal set| = %zu\n", s.size()); - dumpRoseLiteralSet(s); - u64a score = compressAndScore(s); - DEBUG_PRINTF("|candidate literal set| = %zu\n", s.size()); - dumpRoseLiteralSet(s); - - if (!validateRoseLiteralSetQuality(s, score, min_allowed_len, - desperation, - override_literal_quality_check)) { - continue; - } - - DEBUG_PRINTF("candidate is a candidate\n"); - lits->push_back(ue2::make_unique(vv, s)); - } -} - -static -void gatherBackEdges(const NGHolder &g, - ue2::unordered_map> *out) { - set backEdges; - BackEdges> be(backEdges); - depth_first_search(g, visitor(be).root_vertex(g.start)); - - for (const auto &e : backEdges) { - (*out)[source(e, g)].push_back(target(e, g)); - } -} - -LitCollection::LitCollection(const NGHolder &g_in, - const vector &depths_in, - const ue2::unordered_map ®ion_map_in, - const set &a_dom, - const set &a_dom_raw, u32 min_len, - bool desperation, const CompileContext &cc, - bool override_literal_quality_check) - : g(g_in), depths(depths_in), region_map(region_map_in), grey(cc.grey), - seeking_transient(cc.streaming), seeking_anchored(true) { - getSimpleRoseLiterals(g, a_dom, &lits, min_len, desperation, - override_literal_quality_check); - getRegionRoseLiterals(g, region_map, a_dom_raw, &lits, min_len, desperation, - override_literal_quality_check); - DEBUG_PRINTF("lit coll is looking for a%d t%d\n", (int)seeking_anchored, - (int)seeking_transient); - DEBUG_PRINTF("we have %zu candidate literal splits\n", lits.size()); - sort(lits.begin(), lits.end(), LitComparator(*this)); - gatherBackEdges(g, &back_edges); -} - -void LitCollection::poisonLHS(const VertLitInfo &picked) { - DEBUG_PRINTF("found anchored %d transient %d\n", - (int)createsAnchoredLHS(g, picked.vv, depths, grey), - (int)createsTransientLHS(g, picked.vv, depths, grey)); - set curr; - set next; - - insert(&curr, picked.vv); - - while (!curr.empty()) { - insert(&poisoned, curr); - next.clear(); - for (auto v : curr) { - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (!is_special(u, g) && !contains(poisoned, u)) { - next.insert(u); - } - } - } - - curr.swap(next); - } - - seeking_transient = false; - seeking_anchored = false; - - /* reprioritise cuts now that the LHS is taken care off */ - sort(lits.begin(), lits.end(), LitComparator(*this)); -} - -static -void flood_back(const NGHolder &g, u32 len, const set &initial, - set *visited) { - vector curr; - vector next; - - insert(&curr, curr.end(), initial); - - insert(visited, initial); - - /* bfs: flood back len vertices */ - for (u32 i = 1; i < len; i++) { - next.clear(); - DEBUG_PRINTF("poison %u/%u: curr %zu\n", i, len, curr.size()); - - for (auto v : curr) { - for (auto 
u : inv_adjacent_vertices_range(v, g)) { - if (!contains(*visited, u)) { - next.push_back(u); - visited->insert(u); - } - } - } - - next.swap(curr); - } -} - -/** - * Add vertices near a picked literal to the poison set unless it looks - * like they may still add value (ie they are on they other side of cycle). - */ -void LitCollection::poisonLitVerts(const VertLitInfo &picked) { - DEBUG_PRINTF("poisoning vertices associated with picked literals\n"); - - u32 len = max_len(picked.lit); - - /* poison vertices behind */ - - set starters; - insert(&starters, picked.vv); - - set visited; - - flood_back(g, len, starters, &visited); - - DEBUG_PRINTF("flood %zu vertices\n", visited.size()); - - /* inspect any back edges which are in the flooded subgraph; look for any - * destination vertices which are not starters */ - set anti; - for (auto u : visited) { - if (!contains(back_edges, u) || contains(starters, u)) { - continue; - } - - for (auto v : back_edges[u]) { - if (contains(visited, v) && !contains(starters, v)) { - anti.insert(v); - } - } - } - DEBUG_PRINTF("%zu cycle ends\n", visited.size()); - - /* remove any vertices which lie on the other side of a cycle from the - * visited set */ - set anti_pred; - flood_back(g, len - 1, anti, &anti_pred); - - DEBUG_PRINTF("flood visited %zu vertices; anti %zu\n", visited.size(), - anti_pred.size()); - - erase_all(&visited, anti_pred); - - DEBUG_PRINTF("filtered flood visited %zu vertices\n", visited.size()); - - insert(&poisoned, visited); - - insert(&poisoned, starters); /* complicated back loops can result in start - vertices being removed from the visited - set */ - - for (UNUSED auto v : picked.vv) { - assert(contains(poisoned, v)); - } - - /* TODO: poison vertices in front of us? */ -} - -void LitCollection::poisonCandidates(const VertLitInfo &picked) { - assert(!picked.lit.empty()); - if (picked.lit.empty()) { - return; - } - - if ((seeking_anchored && createsAnchoredLHS(g, picked.vv, depths, grey)) - || (seeking_transient && createsTransientLHS(g, picked.vv, depths, grey))) { - /* We don't want to pick anything to the LHS of picked.v any more as we - * have something good. We also don't want to provide any bonus for - * remaining literals based on anchoredness/transientness of the lhs. 
- */ - poisonLHS(picked); - } else { - poisonLitVerts(picked); - } -} - -unique_ptr LitCollection::pickNext() { - while (!lits.empty()) { - if (0) { - next_lit: - continue; - } - - for (auto v : lits.back()->vv) { - if (contains(poisoned, v)) { - DEBUG_PRINTF("skipping '%s' as overlapped\n", - dumpString(*(lits.back()->lit.begin())).c_str()); - lits.pop_back(); - goto next_lit; - } - } - - unique_ptr rv = move(lits.back()); - lits.pop_back(); - poisonCandidates(*rv); - DEBUG_PRINTF("best is '%s' %zu a%d t%d\n", - dumpString(*(rv->lit.begin())).c_str(), - g[rv->vv.front()].index, - (int)createsAnchoredLHS(g, rv->vv, depths, grey), - (int)createsTransientLHS(g, rv->vv, depths, grey)); - - return rv; - } - - return nullptr; -} - -} - -static -bool can_match(const NGHolder &g, const ue2_literal &lit, bool overhang_ok) { - set curr, next; - curr.insert(g.accept); - - for (auto it = lit.rbegin(); it != lit.rend(); ++it) { - next.clear(); - - for (auto v : curr) { - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (u == g.start) { - if (overhang_ok) { - DEBUG_PRINTF("bail\n"); - return true; - } else { - continue; /* it is not possible for a lhs literal to - * overhang the start */ - } - } - - const CharReach &cr = g[u].char_reach; - if (!overlaps(*it, cr)) { - DEBUG_PRINTF("skip\n"); - continue; - } - - next.insert(u); - } - } - - curr.swap(next); - } - - return !curr.empty(); -} - -u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 max_delay, bool overhang_ok) { - assert(isCorrectlyTopped(g)); - if (max_delay == MO_INVALID_IDX) { - max_delay--; - } - - DEBUG_PRINTF("killing off '%s'\n", dumpString(lit).c_str()); - set curr, next; - curr.insert(g.accept); - - auto it = lit.rbegin(); - for (u32 delay = max_delay; delay > 0 && it != lit.rend(); delay--, ++it) { - next.clear(); - for (auto v : curr) { - for (auto u : inv_adjacent_vertices_range(v, g)) { - if (u == g.start) { - if (overhang_ok) { - DEBUG_PRINTF("bail\n"); - goto bail; /* things got complicated */ - } else { - continue; /* it is not possible for a lhs literal to - * overhang the start */ - } - } - - const CharReach &cr = g[u].char_reach; - if (!overlaps(*it, cr)) { - DEBUG_PRINTF("skip\n"); - continue; - } - if (isSubsetOf(*it, cr)) { - next.insert(u); - } else { - DEBUG_PRINTF("bail\n"); - goto bail; /* things got complicated */ - } - } - } - - curr.swap(next); - } - bail: - if (curr.empty()) { - /* This can happen when we have an edge representing a cross from two - * sides of an alternation. 
This whole edge needs to be marked as - * dead */ - assert(0); /* should have been picked up by can match */ - return MO_INVALID_IDX; - } - - u32 delay = distance(lit.rbegin(), it); - assert(delay <= max_delay); - assert(delay <= lit.length()); - DEBUG_PRINTF("managed delay %u (of max %u)\n", delay, max_delay); - - set pred; - for (auto v : curr) { - insert(&pred, inv_adjacent_vertices_range(v, g)); - } - - clear_in_edges(g.accept, g); - clearReports(g); - - for (auto v : pred) { - NFAEdge e = add_edge(v, g.accept, g); - g[v].reports.insert(0); - if (is_triggered(g) && v == g.start) { - g[e].tops.insert(DEFAULT_TOP); - } - } - - pruneUseless(g); - assert(allMatchStatesHaveReports(g)); - assert(isCorrectlyTopped(g)); - - DEBUG_PRINTF("graph has %zu vertices left\n", num_vertices(g)); - return delay; -} - -void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 delay, const vector &preds) { - assert(delay <= lit.length()); - assert(isCorrectlyTopped(g)); - DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay); - - NFAVertex prev = g.accept; - auto it = lit.rbegin(); - while (delay--) { - NFAVertex curr = add_vertex(g); - assert(it != lit.rend()); - g[curr].char_reach = *it; - add_edge(curr, prev, g); - ++it; - prev = curr; - } - - for (auto v : preds) { - NFAEdge e = add_edge(v, prev, g); - if (v == g.start && is_triggered(g)) { - g[e].tops.insert(DEFAULT_TOP); - } - } - - // Every predecessor of accept must have a report. - for (auto u : inv_adjacent_vertices_range(g.accept, g)) { - g[u].reports.insert(0); - } - - renumber_vertices(g); - renumber_edges(g); - assert(allMatchStatesHaveReports(g)); - assert(isCorrectlyTopped(g)); -} - -void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 delay) { - vector preds; - insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); - clear_in_edges(g.accept, g); - - for (auto v : preds) { - g[v].reports.clear(); /* clear report from old accepts */ - } - - restoreTrailingLiteralStates(g, lit, delay, preds); -} - -/* return false if we should get rid of the edge altogether */ -static -bool removeLiteralFromLHS(RoseInGraph &ig, const RoseInEdge &lhs, - const CompileContext &cc) { - unique_ptr h = cloneHolder(*ig[lhs].graph); - NGHolder &g = *h; - assert(ig[target(lhs, ig)].type == RIV_LITERAL); - const ue2_literal &lit = ig[target(lhs, ig)].s; - - /* lhs should be connected to a start */ - assert(ig[source(lhs, ig)].type == RIV_START - || ig[source(lhs, ig)].type == RIV_ANCHORED_START); - - if (in_degree(g.acceptEod, g) != 1 /* edge from accept */) { - assert(0); - return true; - } - if (lit.empty()) { - assert(0); - return true; - } - - const u32 max_delay = maxDelay(cc); - - // In streaming mode, we must limit the depth to the available history - // UNLESS the given literal follows start or startDs and has nothing - // before it that we will need to account for. In that case, we can - // lean on FDR's support for long literals. - if (literalIsWholeGraph(g, lit)) { - assert(!ig[lhs].haig); - assert(ig[lhs].minBound == 0); - assert(ig[lhs].maxBound == ROSE_BOUND_INF); - DEBUG_PRINTF("literal is the whole graph\n"); - - u32 delay = removeTrailingLiteralStates(g, lit, MO_INVALID_IDX, false); - assert(delay == lit.length()); - ig[lhs].graph = move(h); - ig[lhs].graph_lag = delay; - return true; - } - - if (!can_match(g, lit, false)) { - /* This is can happen if the literal arises from a large cyclic - to/beyond the pivot. 
As the LHS graph only cares about the first - reach of the pivot, this literal is junk */ - DEBUG_PRINTF("bogus edge\n"); - return false; - } - - u32 delay = removeTrailingLiteralStates(g, lit, max_delay, - false /* can't overhang start */); - - if (delay == MO_INVALID_IDX) { - /* This is can happen if the literal arises from a large cyclic - to/beyond the pivot. As the LHS graph only cares about the first - reach of the pivot, this literal is junk */ - DEBUG_PRINTF("bogus edge\n"); - return false; - } - - if (!delay) { - return true; - } - - DEBUG_PRINTF("setting delay %u on lhs %p\n", delay, h.get()); - - ig[lhs].graph = move(h); - ig[lhs].graph_lag = delay; - return true; -} - -static -void handleLhsCliche(RoseInGraph &ig, const RoseInEdge &lhs) { - const NGHolder &h = *ig[lhs].graph; - - size_t s_od = out_degree(h.start, h); - size_t sds_od = out_degree(h.startDs, h); - - assert(in_degree(h.acceptEod, h) == 1 /* edge from accept */); - /* need to check if simple floating start */ - if (edge(h.startDs, h.accept, h).second && sds_od == 2 - && ((s_od == 2 && edge(h.start, h.accept, h).second) || s_od == 1)) { - /* no need for graph */ - ig[lhs].graph.reset(); - ig[lhs].graph_lag = 0; - DEBUG_PRINTF("lhs is floating start\n"); - return; - } - - /* need to check if a simple anchor */ - /* start would have edges to sds and accept in this case */ - if (edge(h.start, h.accept, h).second && s_od == 2 && sds_od == 1) { - if (ig[source(lhs, ig)].type == RIV_ANCHORED_START) { - // assert(ig[lhs].graph_lag == ig[target(lhs, ig)].s.length()); - if (ig[lhs].graph_lag != ig[target(lhs, ig)].s.length()) { - DEBUG_PRINTF("oddness\n"); - return; - } - ig[lhs].graph.reset(); - ig[lhs].graph_lag = 0; - ig[lhs].maxBound = 0; - DEBUG_PRINTF("lhs is anchored start\n"); - } else { - DEBUG_PRINTF("lhs rewiring start\n"); - assert(ig[source(lhs, ig)].type == RIV_START); - RoseInVertex t = target(lhs, ig); - remove_edge(lhs, ig); - RoseInVertex s2 - = add_vertex(RoseInVertexProps::makeStart(true), ig); - add_edge(s2, t, RoseInEdgeProps(0U, 0U), ig); - } - return; - } -} - -static -void filterCandPivots(const NGHolder &g, const set &cand_raw, - set *out) { - for (auto u : cand_raw) { - const CharReach &u_cr = g[u].char_reach; - if (u_cr.count() > 40) { - continue; /* too wide to be plausible */ - } - - if (u_cr.count() > 2) { - /* include u as a candidate as successor may have backed away from - * expanding through it */ - out->insert(u); - continue; - } - - NFAVertex v = getSoleDestVertex(g, u); - if (v && in_degree(v, g) == 1 && out_degree(u, g) == 1) { - const CharReach &v_cr = g[v].char_reach; - if (v_cr.count() == 1 || v_cr.isCaselessChar()) { - continue; /* v will always generate better literals */ - } - } - - out->insert(u); - } -} - -/* cand_raw is the candidate set before filtering points which are clearly - * a bad idea. 
*/ -static -void getCandidatePivots(const NGHolder &g, set *cand, - set *cand_raw) { - ue2::unordered_map dominators = - findDominators(g); - - set accepts; - - for (auto v : inv_adjacent_vertices_range(g.accept, g)) { - if (is_special(v, g)) { - continue; - } - accepts.insert(v); - } - for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { - if (is_special(v, g)) { - continue; - } - accepts.insert(v); - } - - assert(!accepts.empty()); - - vector dom_trace; - auto ait = accepts.begin(); - assert(ait != accepts.end()); - NFAVertex curr = *ait; - while (curr && !is_special(curr, g)) { - dom_trace.push_back(curr); - curr = dominators[curr]; - } - reverse(dom_trace.begin(), dom_trace.end()); - for (++ait; ait != accepts.end(); ++ait) { - curr = *ait; - vector dom_trace2; - while (curr && !is_special(curr, g)) { - dom_trace2.push_back(curr); - curr = dominators[curr]; - } - reverse(dom_trace2.begin(), dom_trace2.end()); - auto dti = dom_trace.begin(), dtie = dom_trace.end(); - auto dtj = dom_trace2.begin(), dtje = dom_trace2.end(); - while (dti != dtie && dtj != dtje && *dti == *dtj) { - ++dti; - ++dtj; - } - dom_trace.erase(dti, dtie); - } - - cand_raw->insert(dom_trace.begin(), dom_trace.end()); - - filterCandPivots(g, *cand_raw, cand); -} - -static -void deanchorIfNeeded(NGHolder &g, bool *orig_anch) { - DEBUG_PRINTF("hi\n"); - if (proper_out_degree(g.startDs, g)) { - return; - } - - /* look for a non-special dot with a loop following start */ - set succ_g; - insert(&succ_g, adjacent_vertices(g.start, g)); - succ_g.erase(g.startDs); - - for (auto v : adjacent_vertices_range(g.start, g)) { - DEBUG_PRINTF("inspecting cand %zu || =%zu\n", g[v].index, - g[v].char_reach.size()); - - if (v == g.startDs || !g[v].char_reach.all()) { - continue; - } - - set succ_v; - insert(&succ_v, adjacent_vertices(v, g)); - - if (succ_v == succ_g) { - DEBUG_PRINTF("found ^.*\n"); - *orig_anch = true; - for (auto succ : succ_g) { - add_edge(g.startDs, succ, g); - } - clear_vertex(v, g); - remove_vertex(v, g); - renumber_vertices(g); - return; - } - - if (succ_g.size() == 1 && hasSelfLoop(v, g)) { - DEBUG_PRINTF("found ^.+\n"); - *orig_anch = true; - add_edge(g.startDs, v, g); - remove_edge(v, v, g); - return; - } - } -} - -static -unique_ptr makeTrivialGraph(const NGHolder &h, - vdest_map_t &v_dest_map, - vsrc_map_t &v_src_map) { - shared_ptr root_g = cloneHolder(h); - bool orig_anch = isAnchored(*root_g); - deanchorIfNeeded(*root_g, &orig_anch); - - DEBUG_PRINTF("orig_anch %d\n", (int)orig_anch); - - unique_ptr igp = ue2::make_unique(); - RoseInVertex start = - add_vertex(RoseInVertexProps::makeStart(orig_anch), *igp); - RoseInVertex accept = - add_vertex(RoseInVertexProps::makeAccept(set()), *igp); - - RoseInEdge e = - add_edge(start, accept, RoseInEdgeProps(root_g, 0), *igp).first; - - for (auto v : vertices_range(*root_g)) { - v_dest_map[v].emplace_back(e, v); - v_src_map[e].push_back(v); - } - - return igp; -} - -static never_inline -void updateVDestMap(const vector > &images, - const ue2::unordered_map &lhs_map, - const vector &l_e, - const ue2::unordered_map &rhs_map, - const vector &r_e, - vdest_map_t &v_dest_map, vsrc_map_t &v_src_map) { - RoseInEdge e = images.front().first; - set edge_set; - for (const auto &image : images) { - edge_set.insert(image.first); - } - const vector &domain = v_src_map[e]; - vector > temp; - - for (auto v : domain) { - vdest_map_t::iterator it = v_dest_map.find(v); - assert(it != v_dest_map.end()); - - temp.clear(); - - for (const auto &dest : it->second) { - const 
RoseInEdge &old_e = dest.first; - const NFAVertex old_dest = dest.second; - if (old_e != e) { - if (!contains(edge_set, old_e)) { - temp.emplace_back(old_e, old_dest); - } - } else if (contains(lhs_map, old_dest)) { - for (const auto &e2 : l_e) { - temp.emplace_back(e2, lhs_map.at(old_dest)); - } - /* only allow v to be tracked on one side of the split */ - } else if (contains(rhs_map, old_dest)) { - for (const auto &e2 : r_e) { - temp.emplace_back(e2, rhs_map.at(old_dest)); - } - } - } - NDEBUG_PRINTF("%zu images for vertex; prev %zu\n", temp.size(), - it->second.size()); - it->second.swap(temp); - } -} - -/** Returns the collection of vertices from the original graph which end up - * having an image in the [lr]hs side of the graph split. */ -static never_inline -void fillDomain(const vdest_map_t &v_dest_map, const vsrc_map_t &v_src_map, - RoseInEdge e, - const ue2::unordered_map &split_map, - vector *out) { - const vector &presplit_domain = v_src_map.at(e); - for (auto v : presplit_domain) { - /* v is in the original graph, need to find its image on e's graph */ - typedef vector > dests_t; - const dests_t &dests = v_dest_map.at(v); - for (const auto &dest : dests) { - if (dest.first == e) { - NFAVertex vv = dest.second; - /* vv is v image on e's graph */ - if (contains(split_map, vv)) { - out->push_back(v); - } - } - } - } -} - -static -void getSourceVerts(RoseInGraph &ig, - const vector > &images, - vector *out) { - set seen; - for (const auto &image : images) { - RoseInVertex s = source(image.first, ig); - if (contains(seen, s)) { - continue; - } - seen.insert(s); - out->push_back(s); - } -} - -static -void getDestVerts(RoseInGraph &ig, - const vector > &images, - vector *out) { - set seen; - for (const auto &image : images) { - RoseInVertex t = target(image.first, ig); - if (contains(seen, t)) { - continue; - } - seen.insert(t); - out->push_back(t); - } -} - -static -void getSourceVerts(RoseInGraph &ig, const vector &edges, - vector *out) { - set seen; - for (const auto &e : edges) { - RoseInVertex s = source(e, ig); - if (contains(seen, s)) { - continue; - } - seen.insert(s); - out->push_back(s); - } -} - -static -void getDestVerts(RoseInGraph &ig, const vector &edges, - vector *out) { - set seen; - for (const auto &e : edges) { - RoseInVertex t = target(e, ig); - if (contains(seen, t)) { - continue; - } - seen.insert(t); - out->push_back(t); - } -} - -static -bool splitRoseEdge(RoseInGraph &ig, const VertLitInfo &split, - vdest_map_t &v_dest_map, vsrc_map_t &v_src_map) { - const vector &root_splitters = split.vv; /* vertices in the - 'root' graph */ - assert(!root_splitters.empty()); - - /* need copy as split rose edge will update orig map */ - vector > images - = v_dest_map[root_splitters[0]]; - DEBUG_PRINTF("splitting %zu rose edge with %zu literals\n", - images.size(), split.lit.size()); - - /* note: as we haven't removed literals yet the graphs on all edges that we - * are going to split should be identical */ - const auto &base_graph = ig[images.front().first].graph; - - vector splitters; /* vertices in the graph being split */ - for (auto v : root_splitters) { - if (!contains(v_dest_map, v)) { - DEBUG_PRINTF("vertex to split on is no longer in the graph\n"); - return false; - } - - /* sanity check: verify all edges have the same underlying graph */ - for (UNUSED const auto &m : v_dest_map[v]) { - assert(base_graph == ig[m.first].graph); - } - assert(v_dest_map[v].size() == images.size()); - - splitters.push_back(v_dest_map[v].front().second); - } - - /* note: the set of split 
edges should form a complete bipartite graph */ - vector src_verts; - vector dest_verts; - getSourceVerts(ig, images, &src_verts); - getDestVerts(ig, images, &dest_verts); - assert(images.size() == src_verts.size() * dest_verts.size()); - - shared_ptr lhs = make_shared(); - shared_ptr rhs = make_shared(); - - ue2::unordered_map lhs_map; - ue2::unordered_map rhs_map; - - assert(base_graph); - splitGraph(*base_graph, splitters, lhs.get(), &lhs_map, - rhs.get(), &rhs_map); - - RoseInEdge first_e = images.front().first; - - /* all will be suffix or none */ - bool suffix = ig[target(first_e, ig)].type == RIV_ACCEPT; - - set splitter_reports; - for (auto v : splitters) { - insert(&splitter_reports, (*base_graph)[v].reports); - } - - bool do_accept = false; - bool do_accept_eod = false; - assert(rhs); - if (isVacuous(*rhs) && suffix) { - if (edge(rhs->start, rhs->accept, *rhs).second) { - DEBUG_PRINTF("rhs has a cliche\n"); - do_accept = true; - remove_edge(rhs->start, rhs->accept, *rhs); - } - - if (edge(rhs->start, rhs->acceptEod, *rhs).second) { - DEBUG_PRINTF("rhs has an eod cliche\n"); - do_accept_eod = true; - remove_edge(rhs->start, rhs->acceptEod, *rhs); - } - } - - bool do_norm = out_degree(rhs->start, *rhs) != 1; /* check if we still have - a graph left over */ - vector lhs_domain; - vector rhs_domain; - fillDomain(v_dest_map, v_src_map, first_e, lhs_map, &lhs_domain); - fillDomain(v_dest_map, v_src_map, first_e, rhs_map, &rhs_domain); - - vector l_e; - vector r_e; - for (const auto &lit : split.lit) { - DEBUG_PRINTF("best is '%s'\n", escapeString(lit).c_str()); - RoseInVertex v - = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); - - /* work out delay later */ - if (do_accept) { - DEBUG_PRINTF("rhs has a cliche\n"); - RoseInVertex tt = add_vertex(RoseInVertexProps::makeAccept( - splitter_reports), ig); - add_edge(v, tt, RoseInEdgeProps(0U, 0U), ig); - } - - if (do_accept_eod) { - DEBUG_PRINTF("rhs has an eod cliche\n"); - RoseInVertex tt = add_vertex(RoseInVertexProps::makeAcceptEod( - splitter_reports), ig); - add_edge(v, tt, RoseInEdgeProps(0U, 0U), ig); - } - - for (auto src_v : src_verts) { - l_e.push_back(add_edge(src_v, v, - RoseInEdgeProps(lhs, 0U), ig).first); - v_src_map[l_e.back()] = lhs_domain; - } - - if (do_norm) { - for (auto dst_v : dest_verts) { - /* work out delay later */ - assert(out_degree(rhs->start, *rhs) > 1); - r_e.push_back( - add_edge(v, dst_v, RoseInEdgeProps(rhs, 0U), ig).first); - v_src_map[r_e.back()] = rhs_domain; - } - } - } - - updateVDestMap(images, lhs_map, l_e, rhs_map, r_e, v_dest_map, v_src_map); - - for (const auto &image : images) { - /* remove old edge */ - remove_edge(image.first, ig); - v_src_map.erase(image.first); - } - - return true; -} - -static -bool isStarCliche(const NGHolder &g) { - DEBUG_PRINTF("checking graph with %zu vertices\n", num_vertices(g)); - - bool nonspecials_seen = false; - - for (auto v : vertices_range(g)) { - if (is_special(v, g)) { - continue; - } - - if (nonspecials_seen) { - return false; - } - nonspecials_seen = true; - - if (!g[v].char_reach.all()) { - return false; - } - - if (!hasSelfLoop(v, g)) { - return false; - } - if (!edge(v, g.accept, g).second) { - return false; - } - } - - if (!nonspecials_seen) { - return false; - } - - if (!edge(g.start, g.accept, g).second) { - return false; - } - - return true; -} - -static -void processInfixes(RoseInGraph &ig, const CompileContext &cc) { - /* we want to ensure that every prefix/infix graph is unique at this stage - * as we have not done any analysis to check if 
they are safe to share */ - - vector dead; - - for (const auto &e : edges_range(ig)) { - if (!ig[e].graph) { - continue; - } - - RoseInVertex u = source(e, ig), v = target(e, ig); - - // Infixes are edges between two literals. - if (ig[u].type != RIV_LITERAL || ig[v].type != RIV_LITERAL) { - continue; - } - - if (ig[e].graph_lag) { - continue; /* already looked at */ - } - - DEBUG_PRINTF("looking at infix %p\n", ig[e].graph.get()); - - const ue2_literal &lit1 = ig[u].s; - const ue2_literal &lit2 = ig[v].s; - size_t overlap = maxOverlap(lit1, lit2, 0); - - const NGHolder &h = *ig[e].graph; - - DEBUG_PRINTF("infix rose between literals '%s' and '%s', overlap %zu," - "size %zu\n", - dumpString(lit1).c_str(), dumpString(lit2).c_str(), - overlap, num_vertices(h)); - - if (!can_match(h, lit2, true)) { - DEBUG_PRINTF("found bogus edge\n"); - dead.push_back(e); - continue; - } - - unique_ptr h_new = cloneHolder(h); - - u32 delay = removeTrailingLiteralStates(*h_new, lit2, MO_INVALID_IDX); - if (delay == MO_INVALID_IDX) { - DEBUG_PRINTF("found bogus edge\n"); - dead.push_back(e); - continue; - } - - // Delay can be set to at most lit2.length() - overlap, but we must - // truncate to history available in streaming mode. - u32 max_allowed_delay = lit2.length() - overlap; - LIMIT_TO_AT_MOST(&max_allowed_delay, delay); - - if (cc.streaming) { - LIMIT_TO_AT_MOST(&max_allowed_delay, cc.grey.maxHistoryAvailable); - } - - if (delay != max_allowed_delay) { - restoreTrailingLiteralStates(*h_new, lit2, delay); - delay = removeTrailingLiteralStates(*h_new, lit2, max_allowed_delay); - } - - if (isStarCliche(*h_new)) { - DEBUG_PRINTF("is a X star!\n"); - ig[e].graph.reset(); - ig[e].graph_lag = 0; - } else { - ig[e].graph = move(h_new); - ig[e].graph_lag = delay; - DEBUG_PRINTF("delay increased to %u\n", delay); - } - } - - for (const auto &e : dead) { - remove_edge(e, ig); - } -} - -static -void poisonNetflowScores(RoseInGraph &ig, RoseInEdge lhs, - vector *scores) { - assert(ig[lhs].graph); - NGHolder &h = *ig[lhs].graph; - - if (ig[target(lhs, ig)].type != RIV_LITERAL) { - /* nothing to poison in outfixes */ - assert(ig[target(lhs, ig)].type == RIV_ACCEPT); - return; - } - - set curr, next; - insert(&curr, inv_adjacent_vertices(h.accept, h)); - set poisoned; - u32 len = ig[target(lhs, ig)].s.length(); - assert(len); - while (len) { - next.clear(); - for (auto v : curr) { - insert(&poisoned, in_edges(v, h)); - insert(&next, inv_adjacent_vertices(v, h)); - } - - curr.swap(next); - len--; - } - - for (const auto &e : poisoned) { - (*scores)[h[e].index] = NO_LITERAL_AT_EDGE_SCORE; - } -} - -#define MAX_NETFLOW_CUT_WIDTH 40 /* magic number is magic */ -#define MAX_LEN_2_LITERALS_PER_CUT 3 - -static -bool checkValidNetflowLits(NGHolder &h, const vector &scores, - const map> &cut_lits, - const Grey &grey) { - DEBUG_PRINTF("cut width %zu\n", cut_lits.size()); - if (cut_lits.size() > MAX_NETFLOW_CUT_WIDTH) { - return false; - } - - u32 len_2_count = 0; - - for (const auto &cut : cut_lits) { - if (scores[h[cut.first].index] >= NO_LITERAL_AT_EDGE_SCORE) { - DEBUG_PRINTF("cut uses a forbidden edge\n"); - return false; - } - - if (min_len(cut.second) < grey.minRoseNetflowLiteralLength) { - DEBUG_PRINTF("cut uses a bad literal\n"); - return false; - } - - for (const auto &lit : cut.second) { - if (lit.length() == 2) { - len_2_count++; - } - } - } - - if (len_2_count > MAX_LEN_2_LITERALS_PER_CUT) { - return false; - } - - return true; -} - -static -void splitEdgesByCut(RoseInGraph &ig, const vector &to_cut, - const vector 
&cut, - const map > &cut_lits) { - assert(!to_cut.empty()); - assert(ig[to_cut.front()].graph); - NGHolder &h = *ig[to_cut.front()].graph; - - /* note: the set of split edges should form a complete bipartite graph */ - vector src_verts; - vector dest_verts; - getSourceVerts(ig, to_cut, &src_verts); - getDestVerts(ig, to_cut, &dest_verts); - assert(to_cut.size() == src_verts.size() * dest_verts.size()); - - map, shared_ptr > done_rhs; - - /* iterate over cut for determinism */ - for (const auto &e : cut) { - NFAVertex prev_v = source(e, h); - NFAVertex pivot = target(e, h); - - vector adj; - insert(&adj, adj.end(), adjacent_vertices(pivot, h)); - /* we can ignore presence of accept, accepteod in adj as it is best - effort */ - - if (!contains(done_rhs, adj)) { - ue2::unordered_map temp_map; - shared_ptr new_rhs = make_shared(); - splitRHS(h, adj, new_rhs.get(), &temp_map); - remove_edge(new_rhs->start, new_rhs->accept, *new_rhs); - remove_edge(new_rhs->start, new_rhs->acceptEod, *new_rhs); - done_rhs.insert(make_pair(adj, new_rhs)); - /* TODO need to update v_mapping (if we were doing more cuts) */ - } - - DEBUG_PRINTF("splitting on pivot %zu\n", h[pivot].index); - ue2::unordered_map temp_map; - shared_ptr new_lhs = make_shared(); - splitLHS(h, pivot, new_lhs.get(), &temp_map); - - /* want to cut of paths to pivot from things other than the pivot - - * makes a more svelte graphy */ - clear_in_edges(temp_map[pivot], *new_lhs); - add_edge(temp_map[prev_v], temp_map[pivot], *new_lhs); - - pruneUseless(*new_lhs); - - const set &lits = cut_lits.at(e); - for (const auto &lit : lits) { - RoseInVertex v - = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); - - if (edge(pivot, h.accept, h).second) { - /* literal has a direct connection to accept */ - assert(ig[dest_verts.front()].type == RIV_ACCEPT); - const auto &reports = h[pivot].reports; - RoseInVertex tt = - add_vertex(RoseInVertexProps::makeAccept(reports), ig); - add_edge(v, tt, RoseInEdgeProps(0U, 0U), ig); - } - - if (edge(pivot, h.acceptEod, h).second) { - /* literal has a direct connection to accept */ - assert(ig[dest_verts.front()].type == RIV_ACCEPT); - const auto &reports = h[pivot].reports; - RoseInVertex tt = add_vertex( - RoseInVertexProps::makeAcceptEod(reports), ig); - add_edge(v, tt, RoseInEdgeProps(0U, 0U), ig); - } - - assert(done_rhs[adj].get()); - shared_ptr new_rhs = done_rhs[adj]; - if (out_degree(new_rhs->start, *new_rhs) != 1) { - for (auto dst_v : dest_verts) { - add_edge(v, dst_v, RoseInEdgeProps(done_rhs[adj], 0), ig); - } - } - - for (auto src_v : src_verts) { - add_edge(src_v, v, RoseInEdgeProps(new_lhs, 0), ig); - } - } - } - - /* TODO need to update v_mapping (if we were doing more cuts) */ - - for (const auto &e : to_cut) { - assert(ig[e].graph.get() == &h); - remove_edge(e, ig); - } -} - -static -bool doNetflowCut(RoseInGraph &ig, const vector &to_cut, - const Grey &grey) { - DEBUG_PRINTF("doing netflow cut\n"); - /* TODO: we should really get literals/scores from the full graph as this - * allows us to overlap the graph. Doesn't matter at the moment as we - * are working on the LHS. */ - - NGHolder &h = *ig[to_cut.front()].graph; - if (num_edges(h) > grey.maxRoseNetflowEdges) { - /* We have a limit on this because scoring edges and running netflow - * gets very slow for big graphs. 
*/ - DEBUG_PRINTF("too many edges, skipping netflow cut\n"); - return false; - } - - renumber_vertices(h); - renumber_edges(h); - /* Step 1: Get scores for all edges */ - vector scores = scoreEdges(h); /* scores by edge_index */ - /* Step 2: poison scores for edges covered by successor literal */ - for (const auto &e : to_cut) { - assert(&h == ig[e].graph.get()); - poisonNetflowScores(ig, e, &scores); - } - /* Step 3: Find cutset based on scores */ - vector cut = findMinCut(h, scores); - - /* Step 4: Get literals corresponding to cut edges */ - map> cut_lits; - for (const auto &e : cut) { - set lits = getLiteralSet(h, e); - compressAndScore(lits); - cut_lits[e] = lits; - } - - /* if literals are underlength bail or if it involves a forbidden edge*/ - if (!checkValidNetflowLits(h, scores, cut_lits, grey)) { - return false; - } - DEBUG_PRINTF("splitting\n"); - - /* Step 5: Split graph based on cuts */ - splitEdgesByCut(ig, to_cut, cut, cut_lits); - return true; -} - -/** \brief Returns the number of intermediate vertices in the shortest path - * between (from, to). */ -static -u32 min_dist_between(NFAVertex from, NFAVertex to, const NGHolder &g) { - // Check for the trivial case: that way we don't have to set up the - // containers below. - if (edge(from, to, g).second) { - return 0; - } - - ue2::unordered_set visited; - visited.insert(from); - - flat_set curr, next; - curr.insert(from); - - assert(from != to); - - u32 d = 0; - - while (!curr.empty()) { - next.clear(); - for (auto v : curr) { - for (auto w : adjacent_vertices_range(v, g)) { - if (w == to) { - return d; - } - if (visited.insert(w).second) { // first visit to *ai - next.insert(w); - } - } - } - - d++; - curr.swap(next); - } - assert(0); - return ROSE_BOUND_INF; -} - -/** Literals which are completely enveloped by a successor are trouble because - * hamsterwheel acceleration can skip past the start of the literal. 
*/ -static -bool enveloped(const vector &cand_split_v, - const set &cand_lit, const NGHolder &g, - const RoseInVertexProps &succ) { - if (succ.type != RIV_LITERAL) { - return false; - } - - /* TODO: handle multiple v more precisely: not all candidate v can start all - * candidate literals */ - - for (auto v : cand_split_v) { - u32 rhs_min_len = min_dist_between(v, g.accept, g); - if (rhs_min_len + min_len(cand_lit) >= succ.s.length()) { - return false; - } - } - - return true; /* we are in trouble */ -} - -static -bool enveloped(const VertLitInfo &cand_split, const RoseInGraph &ig, - const vdest_map_t &v_dest_map) { - for (auto v : cand_split.vv) { - const auto &images = v_dest_map.at(v); - for (const auto &image : images) { - /* check that we aren't enveloped by the successor */ - if (enveloped(vector(1, image.second), cand_split.lit, - *ig[image.first].graph, - ig[target(image.first, ig)])) { - return true; - } - - const RoseInVertexProps &pred = ig[source(image.first, ig)]; - if (pred.type != RIV_LITERAL) { - continue; - } - - /* check we don't envelop the pred */ - const NGHolder &g = *ig[image.first].graph; - u32 lhs_min_len = min_dist_between(g.start, image.second, g); - if (lhs_min_len + pred.s.length() < max_len(cand_split.lit)) { - return true; - } - } - } - - return false; -} - -static -bool attemptSplit(RoseInGraph &ig, vdest_map_t &v_dest_map, - vsrc_map_t &v_src_map, const vector &v_e, - LitCollection &lits) { - NGHolder &h = *ig[v_e.front()].graph; - unique_ptr split = lits.pickNext(); - - while (split) { - for (const auto &e : v_e) { - RoseInVertex t = target(e, ig); - if (enveloped(split->vv, split->lit, h, ig[t])) { - DEBUG_PRINTF("enveloped\n"); - split = lits.pickNext(); - goto next_split; - } - } - break; - next_split:; - } - - if (!split) { - return false; - } - - for (auto v : split->vv) { - if (edge(v, h.accept, h).second) { - return false; - } - } - - DEBUG_PRINTF("saved by a bad literal\n"); - splitRoseEdge(ig, *split, v_dest_map, v_src_map); - return true; -} - -static -void appendLiteral(const ue2_literal &s, const CharReach &cr, - vector *out) { - for (size_t c = cr.find_first(); c != CharReach::npos; - c = cr.find_next(c)) { - bool nocase = ourisalpha(c) && cr.test(mytoupper(c)) - && cr.test(mytolower(c)); - - if (nocase && (char)c == mytolower(c)) { - continue; /* uppercase already handled us */ - } - - out->push_back(s); - out->back().push_back(c, nocase); - } -} - -static -bool findAnchoredLiterals(const NGHolder &g, vector *out, - vector *pivots_out) { - - DEBUG_PRINTF("trying for anchored\n"); -#define MAX_ANCHORED_LITERALS 30 -#define MAX_ANCHORED_LITERAL_LEN 30 - - /* TODO: this could be beefed up by going region-by-region but currently - * that brings back bad memories of ng_rose. 
OR any AA region we can build - * a dfa out of */ - assert(!proper_out_degree(g.startDs, g)); - - vector lits; - lits.push_back(ue2_literal()); - - set curr; - insert(&curr, adjacent_vertices(g.start, g)); - curr.erase(g.startDs); - - set old; - - if (contains(curr, g.accept) || curr.empty()) { - DEBUG_PRINTF("surprise accept/voidness\n"); - return false; - } - - while (!curr.empty()) { - set next_verts; - insert(&next_verts, adjacent_vertices(*curr.begin(), g)); - bool can_extend - = !next_verts.empty() && !contains(next_verts, g.accept); - CharReach cr; - - for (auto v : curr) { - assert(!is_special(v, g)); - - if (can_extend) { - /* next verts must agree */ - set next_verts_local; - insert(&next_verts_local, adjacent_vertices(v, g)); - can_extend = next_verts_local == next_verts; - } - - cr |= g[v].char_reach; - } - - if (!can_extend) { - goto bail; - } - - /* extend literals */ - assert(cr.any()); - vector next_lits; - for (const auto &lit : lits) { - appendLiteral(lit, cr, &next_lits); - if (next_lits.size() > MAX_ANCHORED_LITERALS) { - goto bail; - } - } - - assert(!next_lits.empty()); - old.swap(curr); - - if (next_lits[0].length() <= MAX_ANCHORED_LITERAL_LEN) { - curr.swap(next_verts); - } else { - curr.clear(); - } - - lits.swap(next_lits); - } - bail: - assert(!lits.empty()); - for (UNUSED const auto &lit : lits) { - DEBUG_PRINTF("found anchored string: %s\n", dumpString(lit).c_str()); - } - - insert(pivots_out, pivots_out->end(), old); - out->swap(lits); - return !out->empty() && !out->begin()->empty(); -} - -static -bool tryForAnchoredImprovement(RoseInGraph &ig, RoseInEdge e) { - vector lits; - vector pivots; - - if (!findAnchoredLiterals(*ig[e].graph, &lits, &pivots)) { - DEBUG_PRINTF("unable to find literals\n"); - return false; - } - DEBUG_PRINTF("found %zu literals to act as anchors\n", lits.size()); - - RoseInVertex s = source(e, ig); - RoseInVertex t = target(e, ig); - - assert(!ig[e].graph_lag); - - shared_ptr lhs = make_shared(); - shared_ptr rhs = make_shared(); - ue2::unordered_map temp1; - ue2::unordered_map temp2; - - splitGraph(*ig[e].graph, pivots, lhs.get(), &temp1, rhs.get(), &temp2); - - for (const auto &lit : lits) { - RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), - ig); - add_edge(s, v, RoseInEdgeProps(lhs, 0U), ig); - add_edge(v, t, RoseInEdgeProps(rhs, 0U), ig); - } - remove_edge(e, ig); - - return true; -} - -#define MAX_SINGLE_BYTE_ANCHORED_DIST 30 - -/* returns true if we should make another pass */ -static -bool lastChanceImproveLHS(RoseInGraph &ig, RoseInEdge lhs, - const CompileContext &cc) { - DEBUG_PRINTF("argh lhs is nasty\n"); - assert(ig[lhs].graph); - - /* customise the lhs for this literal */ - /* TODO better, don't recalc */ - if (ig[target(lhs, ig)].type == RIV_LITERAL) { - const NGHolder &h = *ig[lhs].graph; - - /* sanitise literal on lhs */ - const ue2_literal &s = ig[target(lhs, ig)].s; - - if (!can_match(h, s, false)) { - DEBUG_PRINTF("found bogus edge\n"); - return false; - } - - /* see if we can build some anchored literals out of this */ - if (isAnchored(h) && tryForAnchoredImprovement(ig, lhs)) { - return true; - } - - unique_ptr cust = cloneHolder(h); - u32 d = removeTrailingLiteralStates(*cust, s, MO_INVALID_IDX); - if (d == MO_INVALID_IDX) { - DEBUG_PRINTF("found bogus edge\n"); - return false; - } - restoreTrailingLiteralStates(*cust, s, d); - ig[lhs].graph = move(cust); - } - - NGHolder &lhs_graph = *ig[lhs].graph; - set cand; - set cand_raw; - getCandidatePivots(lhs_graph, &cand, &cand_raw); - vdest_map_t 
v_dest_map; - vsrc_map_t v_src_map; - for (auto v : vertices_range(lhs_graph)) { - v_dest_map[v].emplace_back(lhs, v); - v_src_map[lhs].push_back(v); - } - - vector depths; - calcDepths(lhs_graph, depths); - - /* need to ensure regions are valid before we do lit discovery */ - auto region_map = assignRegions(lhs_graph); - - vector to_cut(1, lhs); - DEBUG_PRINTF("see if we can get a better lhs by another cut\n"); - LitCollection lit1(lhs_graph, depths, region_map, cand, cand_raw, - cc.grey.minRoseLiteralLength, true, cc); - if (attemptSplit(ig, v_dest_map, v_src_map, to_cut, lit1)) { - return true; - } - - if (doNetflowCut(ig, to_cut, cc.grey)) { - return true; - } - - DEBUG_PRINTF("eek last chance try len 1 if it creates an anchored lhs\n"); - { - LitCollection lits(lhs_graph, depths, region_map, cand, cand_raw, 1, - true, cc, true); - unique_ptr split = lits.pickNext(); - - /* TODO fix edge to accept check */ - while (split - && (enveloped(split->vv, split->lit, lhs_graph, - ig[target(lhs, ig)]) - || edge(split->vv.front(), lhs_graph.accept, lhs_graph).second - || !createsAnchoredLHS(lhs_graph, split->vv, depths, cc.grey, - MAX_SINGLE_BYTE_ANCHORED_DIST))) { - split = lits.pickNext(); - } - - if (split) { - DEBUG_PRINTF("saved by a really bad literal\n"); - splitRoseEdge(ig, *split, v_dest_map, v_src_map); - return true; - } - } - - return false; -} - -/* returns false if nothing happened */ -static -bool lastChanceImproveLHS(RoseInGraph &ig, const vector &to_cut, - const CompileContext &cc) { - DEBUG_PRINTF("argh lhses are nasty\n"); - - NGHolder &lhs_graph = *ig[to_cut.front()].graph; - set cand; - set cand_raw; - getCandidatePivots(lhs_graph, &cand, &cand_raw); - vdest_map_t v_dest_map; - vsrc_map_t v_src_map; - for (auto v : vertices_range(lhs_graph)) { - for (const auto &e : to_cut) { - v_dest_map[v].emplace_back(e, v); - v_src_map[e].push_back(v); - } - } - - vector depths; - calcDepths(lhs_graph, depths); - - auto region_map = assignRegions(lhs_graph); - - DEBUG_PRINTF("see if we can get a better lhs by allowing another cut\n"); - LitCollection lit1(lhs_graph, depths, region_map, cand, cand_raw, - cc.grey.minRoseLiteralLength, true, cc); - if (attemptSplit(ig, v_dest_map, v_src_map, to_cut, lit1)) { - return true; - } - - return doNetflowCut(ig, to_cut, cc.grey); -} - -static -bool improveLHS(RoseInGraph &ig, const vector &edges, - const CompileContext &cc) { - bool rv = false; - - vector src_verts; - getSourceVerts(ig, edges, &src_verts); - - map> by_src; - for (const auto &e : edges) { - by_src[source(e, ig)].push_back(e); - } - - for (auto v : src_verts) { - const vector &local = by_src[v]; - - vector graphs; - map > by_graph; - for (const auto &e : local) { - NGHolder *gp = ig[e].graph.get(); - if (!contains(by_graph, gp)) { - graphs.push_back(gp); - } - by_graph[gp].push_back(e); - } - - for (auto h : graphs) { - const vector &local2 = by_graph[h]; - if (local2.size() == 1) { - rv |= lastChanceImproveLHS(ig, local2.front(), cc); - continue; - } - - bool lrv = lastChanceImproveLHS(ig, local2, cc); - if (lrv) { - rv = true; - } else { - for (const auto &e2 : local2) { - rv |= lastChanceImproveLHS(ig, e2, cc); - } - } - } - } - - return rv; -} - -static -void processLHS(RoseInGraph &ig, const CompileContext &cc) { - bool redo; - do { - redo = false; - vector to_improve; - for (const auto &lhs : edges_range(ig)) { - if (ig[source(lhs, ig)].type != RIV_START - && ig[source(lhs, ig)].type != RIV_ANCHORED_START) { - continue; - } - - if (ig[target(lhs, ig)].type == RIV_LITERAL) { - 
DEBUG_PRINTF("checking lhs->'%s'\n", - ig[target(lhs, ig)].s.c_str()); - } else { - DEBUG_PRINTF("checking lhs->?\n"); - } - - - /* if check if lhs is nasty */ - if (ig[target(lhs, ig)].type == RIV_ACCEPT) { - to_improve.push_back(lhs); - continue; - } - - assert(ig[lhs].graph); - const NGHolder *h = ig[lhs].graph.get(); - - vector depths; - calcDepths(*h, depths); - - if (!isLHSTransient(*h, depths, cc.grey) - && !literalIsWholeGraph(*h, ig[target(lhs, ig)].s) - && !isLHSUsablyAnchored(*h, depths, cc.grey)) { - to_improve.push_back(lhs); - } - } - - DEBUG_PRINTF("inspecting %zu lhs\n", to_improve.size()); - if (to_improve.size() > 50) { - DEBUG_PRINTF("too big\n"); - break; - } - - redo = improveLHS(ig, to_improve, cc); - DEBUG_PRINTF("redo = %d\n", (int)redo); - } while (redo); - - vector to_inspect; /* to prevent surprises caused by us - * altering the graph while iterating */ - for (const auto &e : edges_range(ig)) { - if (ig[source(e, ig)].type == RIV_START - || ig[source(e, ig)].type == RIV_ANCHORED_START) { - to_inspect.push_back(e); - } - } - - for (const auto &lhs : to_inspect) { - if (ig[target(lhs, ig)].type == RIV_LITERAL) { - if (removeLiteralFromLHS(ig, lhs, cc)) { - handleLhsCliche(ig, lhs); - } else { - /* telling us to delete the edge */ - remove_edge(lhs, ig); - } - } - } -} - -static -void tryNetflowCutForRHS(RoseInGraph &ig, const Grey &grey) { - vector to_improve; - for (const auto &rhs : edges_range(ig)) { - if (ig[target(rhs, ig)].type != RIV_ACCEPT) { - continue; - } - - if (ig[source(rhs, ig)].type == RIV_LITERAL) { - DEBUG_PRINTF("checking '%s'->rhs\n", ig[source(rhs, ig)].s.c_str()); - } else { - DEBUG_PRINTF("checking ?->rhs\n"); - } - - if (!ig[rhs].graph) { - continue; - } - - DEBUG_PRINTF("%zu vertices\n", num_vertices(*ig[rhs].graph)); - if (num_vertices(*ig[rhs].graph) < 512) { - DEBUG_PRINTF("small\n"); - continue; - } - - /* if check if rhs is nasty */ - to_improve.push_back(rhs); - } - - DEBUG_PRINTF("inspecting %zu lhs\n", to_improve.size()); - if (to_improve.size() > 50) { - DEBUG_PRINTF("too big\n"); - return; - } - - for (const auto &e : to_improve) { - vector to_cut(1, e); - doNetflowCut(ig, to_cut, grey); - } -} - -/* just make the string nocase and get the graph to handle case mask, TODO. - * This could be more nuanced but the effort would probably be better spent - * just making rose less bad. 
*/ -static -void makeNocaseWithPrefixMask(RoseInGraph &g, RoseInVertex v) { - for (const auto &e : in_edges_range(v, g)) { - const RoseInVertex u = source(e, g); - - if (!g[e].graph) { - g[e].graph = make_shared(whatRoseIsThis(g, e)); - g[e].graph_lag = g[v].s.length(); - NGHolder &h = *g[e].graph; - - assert(!g[e].maxBound || g[e].maxBound == ROSE_BOUND_INF); - - if (g[u].type == RIV_START) { - add_edge(h.startDs, h.accept, h); - h[h.startDs].reports.insert(0); - } else if (g[e].maxBound == ROSE_BOUND_INF) { - add_edge(h.start, h.accept, h); - NFAVertex ds = add_vertex(h); - - h[ds].char_reach = CharReach::dot(); - - NFAEdge e_start_to_ds = add_edge(h.start, ds, h); - add_edge(ds, ds, h); - add_edge(ds, h.accept, h); - h[h.start].reports.insert(0); - h[ds].reports.insert(0); - - if (g[u].type == RIV_LITERAL) { - h[e_start_to_ds].tops.insert(DEFAULT_TOP); - } - } else { - assert(g[u].type == RIV_ANCHORED_START); - add_edge(h.start, h.accept, h); - h[h.start].reports.insert(0); - } - } - - if (!g[e].graph_lag) { - continue; - } - unique_ptr newg = cloneHolder(*g[e].graph); - restoreTrailingLiteralStates(*newg, g[v].s, g[e].graph_lag); - g[e].graph_lag = 0; - g[e].graph = move(newg); - } - - make_nocase(&g[v].s); -} - -static -unique_ptr makeGraphCopy(const NGHolder *g) { - if (g) { - return cloneHolder(*g); - } else { - return nullptr; - } -} - -static -void explodeLiteral(RoseInGraph &g, RoseInVertex v, - vector &exploded) { - for (const auto &lit : exploded) { - RoseInVertex v_new = add_vertex(g[v], g); - g[v_new].s = lit; - - for (const auto &e : in_edges_range(v, g)) { - RoseInEdge e2 = add_edge(source(e, g), v_new, g[e], g); - // FIXME: are we safe to share graphs here? For now, make our very - // own copy. - g[e2].graph = makeGraphCopy(g[e].graph.get()); - } - - for (const auto &e : out_edges_range(v, g)) { - RoseInEdge e2 = add_edge(v_new, target(e, g), g[e], g); - // FIXME: are we safe to share graphs here? For now, make our very - // own copy. - g[e2].graph = makeGraphCopy(g[e].graph.get()); - } - } - - clear_vertex(v, g); - remove_vertex(v, g); -} - -/* Sadly rose is hacky in terms of mixed case literals. TODO: remove when rose - * becomes less bad */ -static -void handleLongMixedSensitivityLiterals(RoseInGraph &g) { - const size_t maxExploded = 8; // only case-explode this far - - vector verts; - - for (auto v : vertices_range(g)) { - if (g[v].type != RIV_LITERAL) { - continue; - } - - ue2_literal &s = g[v].s; - - if (!mixed_sensitivity(s)) { - continue; - } - - if (s.length() < MAX_MASK2_WIDTH) { - DEBUG_PRINTF("mixed lit will be handled by benefits mask\n"); - continue; - } - - DEBUG_PRINTF("found mixed lit of len %zu\n", s.length()); - verts.push_back(v); - } - - for (auto v : verts) { - vector exploded; - case_iter cit = caseIterateBegin(g[v].s), cite = caseIterateEnd(); - for (; cit != cite; ++cit) { - exploded.emplace_back(*cit, false); - if (exploded.size() > maxExploded) { - goto dont_explode; - } - } - - DEBUG_PRINTF("exploding literal into %zu pieces\n", exploded.size()); - explodeLiteral(g, v, exploded); - continue; - - dont_explode: - DEBUG_PRINTF("converting to nocase with prefix mask\n"); - makeNocaseWithPrefixMask(g, v); - } - - DEBUG_PRINTF("done!\n"); -} - -static -void dedupe(RoseInGraph &g) { - /* We know that every prefix/infix is unique after the rose construction. - * - * If a vertex has out-going graphs with the same rewind and they are equal - * we can dedupe the graph. - * - * After this, we may share graphs on out-edges of a vertex. 
*/ - map, vector>> buckets; - - for (auto v : vertices_range(g)) { - buckets.clear(); - - for (const auto &e : out_edges_range(v, g)) { - if (!g[e].graph || g[target(e, g)].type != RIV_LITERAL) { - continue; - } - auto k = make_pair(g[e].graph_lag, hash_holder(*g[e].graph)); - auto &bucket = buckets[k]; - for (const auto &h : bucket) { - if (is_equal(*g[e].graph, 0U, *h, 0U)) { - g[e].graph = h; - goto next_edge; - } - } - - bucket.push_back(g[e].graph); - next_edge:; - } - } -} - -static -bool pureReport(NFAVertex v, const NGHolder &g) { - for (auto w : adjacent_vertices_range(v, g)) { - if (w != g.accept && w != g.acceptEod) { - return false; - } - } - return true; -} - -static -bool pureReport(const vector &vv, const NGHolder &g) { - for (auto v : vv) { - if (!pureReport(v, g)) { - return false; - } - } - - return true; -} - -/* ensures that a vertex is followed by a start construct AND the cyclic states - * has a reasonably wide reach */ -static -bool followedByStar(NFAVertex v, const NGHolder &g) { - set succ; - insert(&succ, adjacent_vertices(v, g)); - - set asucc; - - for (auto w : adjacent_vertices_range(v, g)) { - if (g[w].char_reach.count() < N_CHARS - MAX_ESCAPE_CHARS) { - continue; /* state is too narrow to be considered as a sane star - cyclic */ - } - - asucc.clear(); - insert(&asucc, adjacent_vertices(w, g)); - - if (asucc == succ) { - return true; - } - } - return false; -} - -static -bool followedByStar(const vector &vv, const NGHolder &g) { - for (auto v : vv) { - if (!followedByStar(v, g)) { - return false; - } - } - - return true; -} - -static -bool isEodPrefixCandidate(const NGHolder &g) { - if (in_degree(g.accept, g)) { - DEBUG_PRINTF("graph isn't eod anchored\n"); - return false; - } - - // TODO: handle more than one report. - if (all_reports(g).size() != 1) { - return false; - } - - return true; -} - - -static -bool isEodWithPrefix(const RoseInGraph &g) { - if (num_vertices(g) != 2) { - return false; - } - - for (const auto &e : edges_range(g)) { - RoseInVertex u = source(e, g), v = target(e, g); - DEBUG_PRINTF("edge from %d -> %d\n", g[u].type, g[v].type); - - if (g[u].type != RIV_START && g[u].type != RIV_ANCHORED_START) { - DEBUG_PRINTF("source not start, type=%d\n", g[u].type); - return false; - } - - if (g[v].type != RIV_ACCEPT && g[v].type != RIV_ACCEPT_EOD) { - DEBUG_PRINTF("target not accept, type=%d\n", g[v].type); - return false; - } - - // Haigs not handled. - if (g[e].haig) { - DEBUG_PRINTF("edge has haig\n"); - return false; - } - - if (!g[e].graph) { - DEBUG_PRINTF("no graph on edge\n"); - return false; - } - - if (!isEodPrefixCandidate(*g[e].graph)) { - DEBUG_PRINTF("graph is not eod prefix candidate\n"); - return false; - } - } - - return true; -} - -static -void processEodPrefixes(RoseInGraph &g) { - // Find edges to accept with EOD-anchored graphs that we can move over to - // acceptEod. - vector acc_edges; - for (const auto &e : edges_range(g)) { - if (g[target(e, g)].type != RIV_ACCEPT) { - continue; - } - if (g[e].haig || !g[e].graph) { - continue; - } - if (!isEodPrefixCandidate(*g[e].graph)) { - continue; - } - - // TODO: handle cases with multiple out-edges. 
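/* A minimal standalone sketch of the bucket-then-verify pattern used by
 * dedupe() above: bucket candidates by a cheap key (here the rewind distance
 * plus a structural hash), and only run the expensive deep-equality check
 * within a bucket. Plain std::string payloads stand in for NGHolder graphs;
 * the names below are illustrative, not Hyperscan API. */
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using Payload = std::string; // stands in for a shared NGHolder

static void dedupeByHash(std::vector<std::shared_ptr<Payload>> &items) {
    // Key: (rewind, hash). A hash collision only costs an extra deep compare.
    std::map<std::pair<size_t, size_t>,
             std::vector<std::shared_ptr<Payload>>> buckets;
    for (auto &p : items) {
        auto key = std::make_pair(size_t{0}, std::hash<Payload>{}(*p));
        auto &bucket = buckets[key];
        bool shared = false;
        for (const auto &q : bucket) {
            if (*q == *p) { // deep equality confirms the cheap key
                p = q;      // share the previously-seen object
                shared = true;
                break;
            }
        }
        if (!shared) {
            bucket.push_back(p);
        }
    }
}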
- if (out_degree(source(e, g), g) > 1) { - continue; - } - - acc_edges.push_back(e); - } - - set accepts; - - for (const RoseInEdge &e : acc_edges) { - RoseInVertex u = source(e, g), v = target(e, g); - assert(g[e].graph); - assert(g[v].type == RIV_ACCEPT); - assert(all_reports(*g[e].graph).size() == 1); - - // Move this edge from accept to acceptEod and give it the right reports - // from the graph on the edge. - const set reports = all_reports(*g[e].graph); - RoseInVertex w = add_vertex( - RoseInVertexProps::makeAcceptEod(reports), g); - add_edge(u, w, g[e], g); - - remove_edge(e, g); - accepts.insert(v); - } - - for (auto v : accepts) { - if (!in_degree(v, g)) { - remove_vertex(v, g); - } - } -} - -/** Run some reduction passes on the graphs on our edges. */ -static -void reduceGraphs(RoseInGraph &g, const CompileContext &cc) { - for (const auto &e : edges_range(g)) { - if (!g[e].graph) { - continue; - } - NGHolder &h = *g[e].graph; - assert(h.kind == whatRoseIsThis(g, e)); - DEBUG_PRINTF("before, graph %p has %zu vertices, %zu edges\n", &h, - num_vertices(h), num_edges(h)); - - pruneUseless(h); - - reduceGraphEquivalences(h, cc); - - removeRedundancy(h, SOM_NONE); /* rose doesn't track som */ - - DEBUG_PRINTF("after, graph %p has %zu vertices, %zu edges\n", &h, - num_vertices(h), num_edges(h)); - - // It's possible that one of our graphs may have reduced to a dot-star - // cliche, i.e. it contains a startDs->accept edge. If so, we can - // remove it from the edge and just use edge bounds to represent it. - if (edge(h.startDs, h.accept, h).second) { - DEBUG_PRINTF("graph reduces to dot-star, deleting\n"); - g[e].graph.reset(); - g[e].graph_lag = 0; - g[e].minBound = 0; - g[e].maxBound = ROSE_BOUND_INF; - } - } -} - -static -unique_ptr buildRose(const NGHolder &h, bool desperation, - const CompileContext &cc) { - /* Need to pick a pivot point which splits the graph in two with starts on - * one side and accepts on the other. 
Thus the pivot needs to dominate all - * the accept vertices */ - - /* maps a vertex in h to one of its images in the rose graph */ - vdest_map_t v_dest_map; - vsrc_map_t v_src_map; - - /* create trivial rose graph */ - unique_ptr igp = makeTrivialGraph(h, v_dest_map, v_src_map); - RoseInGraph &ig = *igp; - - /* root graph is the graph on the only edge in our new RoseInGraph */ - assert(num_edges(ig) == 1); - shared_ptr root_g = ig[*edges(ig).first].graph; - assert(root_g); - - /* find the literals */ - set cand; - set cand_raw; - getCandidatePivots(*root_g, &cand, &cand_raw); - - DEBUG_PRINTF("|cand| = %zu\n", cand.size()); - - vector depths; - calcDepths(*root_g, depths); - - auto region_map = assignRegions(*root_g); - - LitCollection lits(*root_g, depths, region_map, cand, cand_raw, - cc.grey.minRoseLiteralLength, desperation, cc); - - for (u32 i = 0; i < cc.grey.roseDesiredSplit; ++i) { - DEBUG_PRINTF("attempting split %u (desired %u)\n", i, - cc.grey.roseDesiredSplit); - unique_ptr split = lits.pickNext(); - - /* need to check we aren't creating any enveloping literals */ - while (split && enveloped(*split, ig, v_dest_map)) { - DEBUG_PRINTF("bad cand; getting next split\n"); - split = lits.pickNext(); - } - - if (!split) { - DEBUG_PRINTF("no more lits :(\n"); - break; - } - splitRoseEdge(ig, *split, v_dest_map, v_src_map); - } - - /* try for more split literals if they are followed by .* or accept */ - for (;;) { - DEBUG_PRINTF("attempting bonus split\n"); - unique_ptr split = lits.pickNext(); - - /* need to check we aren't creating any enveloping literals */ - while (split - && (enveloped(*split, ig, v_dest_map) - || (!pureReport(split->vv, *root_g) - && !followedByStar(split->vv, *root_g)))) { - DEBUG_PRINTF("bad cand; getting next split\n"); - split = lits.pickNext(); - } - - if (!split) { - DEBUG_PRINTF("no more lits :(\n"); - break; - } - DEBUG_PRINTF("got bonus split\n"); - splitRoseEdge(ig, *split, v_dest_map, v_src_map); - } - - processLHS(ig, cc); - - if (num_vertices(ig) <= 2) { - // At present, we don't accept all outfixes. - // However, we do handle the specific case of a rose that precedes an - // acceptEod, which we will support as a prefix to a special EOD event - // "literal". - if (!isEodWithPrefix(ig)) { - igp.reset(); - return igp; - } - } - - processEodPrefixes(ig); - - processInfixes(ig, cc); - - handleLongMixedSensitivityLiterals(ig); - - dedupe(ig); - - pruneUseless(ig); - - reduceGraphs(ig, cc); - - dumpPreRoseGraph(ig, cc.grey); - - renumber_vertices(ig); - calcVertexOffsets(ig); - return igp; -} - -static -void desperationImprove(RoseInGraph &ig, const CompileContext &cc) { - DEBUG_PRINTF("rose said no; can we do better?\n"); - - /* infixes are tricky as we have to worry about delays, enveloping - * literals, etc */ - tryNetflowCutForRHS(ig, cc.grey); - processInfixes(ig, cc); - - handleLongMixedSensitivityLiterals(ig); - dedupe(ig); - pruneUseless(ig); - renumber_vertices(ig); - calcVertexOffsets(ig); -} - -bool splitOffRose(RoseBuild &rose, const NGHolder &h, bool prefilter, - const CompileContext &cc) { - if (!cc.grey.allowRose) { - return false; - } - - // We should have at least one edge into accept or acceptEod! 
- assert(in_degree(h.accept, h) || in_degree(h.acceptEod, h) > 1); - - unique_ptr igp = buildRose(h, false, cc); - if (igp && rose.addRose(*igp, prefilter)) { - goto ok; - } - - igp = buildRose(h, true, cc); - - if (igp) { - if (rose.addRose(*igp, prefilter)) { - goto ok; - } - - desperationImprove(*igp, cc); - - if (rose.addRose(*igp, prefilter)) { - goto ok; - } - } - - DEBUG_PRINTF("rose build failed\n"); - return false; - -ok: - DEBUG_PRINTF("rose build ok\n"); - return true; -} - -bool finalChanceRose(RoseBuild &rose, const NGHolder &h, bool prefilter, - const CompileContext &cc) { - DEBUG_PRINTF("final chance rose\n"); - if (!cc.grey.allowRose) { - return false; - } - assert(h.kind == NFA_OUTFIX); - - ue2_literal lit; - bool anch = false; - shared_ptr rhs = make_shared(); - if (!splitOffLeadingLiteral(h, &lit, &*rhs)) { - DEBUG_PRINTF("no floating literal\n"); - anch = true; - if (!splitOffAnchoredLeadingLiteral(h, &lit, &*rhs)) { - DEBUG_PRINTF("no anchored literal\n"); - return false; - } - } - - if (lit.length() < cc.grey.minRoseLiteralLength - || minStringPeriod(lit) < 2 ) { - DEBUG_PRINTF("lit too weak\n"); - return false; - } - - assert(lit.length() <= MAX_MASK2_WIDTH || !mixed_sensitivity(lit)); - - RoseInGraph ig; - RoseInVertex s - = add_vertex(RoseInVertexProps::makeStart(anch), ig); - RoseInVertex v = add_vertex(RoseInVertexProps::makeLiteral(lit), ig); - add_edge(s, v, RoseInEdgeProps(0, anch ? 0 : ROSE_BOUND_INF), ig); - - ue2_literal lit2; - if (getTrailingLiteral(h, &lit2) - && lit2.length() >= cc.grey.minRoseLiteralLength - && minStringPeriod(lit2) >= 2) { - - /* TODO: handle delay */ - size_t overlap = maxOverlap(lit, lit2, 0); - u32 delay2 = lit2.length() - overlap; - delay2 = min(delay2, maxDelay(cc)); - delay2 = removeTrailingLiteralStates(*rhs, lit2, delay2); - rhs->kind = NFA_INFIX; - assert(delay2 <= lit2.length()); - - RoseInVertex w - = add_vertex(RoseInVertexProps::makeLiteral(lit2), ig); - add_edge(v, w, RoseInEdgeProps(rhs, delay2), ig); - - NFAVertex reporter = getSoleSourceVertex(h, h.accept); - assert(reporter); - const auto &reports = h[reporter].reports; - RoseInVertex a = - add_vertex(RoseInVertexProps::makeAccept(reports), ig); - add_edge(w, a, RoseInEdgeProps(0U, 0U), ig); - } else { - RoseInVertex a = - add_vertex(RoseInVertexProps::makeAccept(set()), ig); - add_edge(v, a, RoseInEdgeProps(rhs, 0U), ig); - } - - renumber_vertices(ig); - calcVertexOffsets(ig); - - return rose.addRose(ig, prefilter, true /* final chance */); -} - -bool checkRose(const ReportManager &rm, const NGHolder &h, bool prefilter, - const CompileContext &cc) { - if (!cc.grey.allowRose) { - return false; - } - - // We should have at least one edge into accept or acceptEod! - assert(in_degree(h.accept, h) || in_degree(h.acceptEod, h) > 1); - - unique_ptr igp; - - // First pass. - - igp = buildRose(h, false, cc); - if (igp && roseCheckRose(*igp, prefilter, rm, cc)) { - return true; - } - - // Second ("desperation") pass. 
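/* finalChanceRose() above derives the delay of a trailing literal as
 * lit2.length() - maxOverlap(lit, lit2): only the bytes of lit2 that extend
 * past lit need to be rewound. A plain std::string sketch of that
 * arithmetic, assuming maxOverlap means the longest suffix of the first
 * literal that is also a prefix of the second. */
#include <algorithm>
#include <string>

static size_t maxOverlapLen(const std::string &a, const std::string &b) {
    size_t limit = std::min(a.size(), b.size());
    for (size_t len = limit; len > 0; len--) {
        if (a.compare(a.size() - len, len, b, 0, len) == 0) {
            return len; // longest suffix of a matching a prefix of b
        }
    }
    return 0;
}

// e.g. lit = "abcdef", lit2 = "defgh": overlap is 3, so delay = 5 - 3 = 2.
static size_t trailingDelay(const std::string &lit, const std::string &lit2) {
    return lit2.size() - maxOverlapLen(lit, lit2);
}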
- - igp = buildRose(h, true, cc); - if (igp) { - if (roseCheckRose(*igp, prefilter, rm, cc)) { - return true; - } - - desperationImprove(*igp, cc); - - if (roseCheckRose(*igp, prefilter, rm, cc)) { - return true; - } - } - - return false; -} - -} // namespace ue2 diff --git a/src/nfagraph/ng_rose.h b/src/nfagraph/ng_rose.h deleted file mode 100644 index d180e8a5f..000000000 --- a/src/nfagraph/ng_rose.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -/** \file - * \brief Rose construction from NGHolder. - */ - -#ifndef NG_ROSE_H -#define NG_ROSE_H - -#include "ng_holder.h" -#include "ue2common.h" - -#include - -namespace ue2 { - -class NGHolder; -class ReportManager; -class RoseBuild; - -struct CompileContext; -struct ue2_literal; - -/** \brief Attempt to consume the entire pattern in graph \a h with Rose. - * Returns true if successful. */ -bool splitOffRose(RoseBuild &rose, const NGHolder &h, bool prefilter, - const CompileContext &cc); - -/** \brief Attempt to consume the entire pattern in graph \a h with Rose. - * This is the last attempt to handle a pattern before we resort to an outfix. - * Returns true if successful. */ -bool finalChanceRose(RoseBuild &rose, const NGHolder &h, bool prefilter, - const CompileContext &cc); - -/** \brief True if the pattern in \a h is consumable by Rose. This function - * may be conservative (return false even if supported) for efficiency. */ -bool checkRose(const ReportManager &rm, const NGHolder &h, bool prefilter, - const CompileContext &cc); - -/** \brief Returns the delay or MO_INVALID_IDX if the graph cannot match with - * the trailing literal. 
*/ -u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 max_delay, bool overhang_ok = true); - -void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 delay); - -void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, - u32 delay, - const std::vector &preds); - -} // namespace ue2 - -#endif // NG_ROSE_H diff --git a/src/nfagraph/ng_som.cpp b/src/nfagraph/ng_som.cpp index f6ba0fa7d..674381031 100644 --- a/src/nfagraph/ng_som.cpp +++ b/src/nfagraph/ng_som.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,9 +26,13 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief SOM ("Start of Match") analysis. */ + +#include "ng_som.h" + #include "ng.h" #include "ng_dump.h" #include "ng_equivalence.h" @@ -40,15 +44,15 @@ #include "ng_redundancy.h" #include "ng_region.h" #include "ng_reports.h" -#include "ng_rose.h" -#include "ng_som.h" #include "ng_som_add_redundancy.h" #include "ng_som_util.h" #include "ng_split.h" #include "ng_util.h" +#include "ng_violet.h" #include "ng_width.h" #include "grey.h" #include "ue2common.h" +#include "compiler/compiler.h" #include "nfa/goughcompile.h" #include "nfa/nfa_internal.h" // for MO_INVALID_IDX #include "parser/position.h" @@ -1584,8 +1588,9 @@ void dumpSomPlan(UNUSED const NGHolder &g, UNUSED const som_plan &p, * implement the full pattern. */ static -void implementSomPlan(NG &ng, const NGWrapper &w, u32 comp_id, NGHolder &g, - vector &plan, const u32 first_som_slot) { +void implementSomPlan(NG &ng, const ExpressionInfo &expr, u32 comp_id, + NGHolder &g, vector &plan, + const u32 first_som_slot) { ReportManager &rm = ng.rm; SomSlotManager &ssm = ng.ssm; @@ -1598,14 +1603,14 @@ void implementSomPlan(NG &ng, const NGWrapper &w, u32 comp_id, NGHolder &g, // Root plan, which already has a SOM slot assigned (first_som_slot). 
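/* A simplified model of the plan chaining that implementSomPlan() walks
 * below: every non-root entry reads the SOM slot of its parent (which must
 * appear earlier in the vector, as the assert on it->parent enforces) and
 * records its own matches into a fresh slot. PlanEntry is a stand-in, not
 * the real som_plan struct. */
#include <cassert>
#include <cstdint>
#include <vector>

struct PlanEntry {
    uint32_t parent;   // index of parent entry; entry 0 is the root
    uint32_t som_slot; // slot this entry writes; filled in during the walk
};

static void assignSomSlots(std::vector<PlanEntry> &plan,
                           uint32_t first_som_slot,
                           uint32_t &next_free_slot) {
    assert(!plan.empty());
    plan[0].som_slot = first_som_slot; // root already has its slot
    for (uint32_t i = 1; i < plan.size(); i++) {
        assert(plan[i].parent < i);    // parents precede children
        uint32_t slot_in = plan[plan[i].parent].som_slot;
        (void)slot_in; // a midfix would be built from slot_in to the new slot
        plan[i].som_slot = next_free_slot++;
    }
}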
dumpSomPlan(g, plan.front(), 0); - dumpSomSubComponent(*plan.front().prefix, "04_som", w.expressionIndex, - comp_id, 0, ng.cc.grey); + dumpSomSubComponent(*plan.front().prefix, "04_som", expr.index, comp_id, 0, + ng.cc.grey); assert(plan.front().prefix); if (plan.front().escapes.any() && !plan.front().is_reset) { /* setup escaper for first som location */ if (!createEscaper(ng, *plan.front().prefix, plan.front().escapes, first_som_slot)) { - throw CompileError(w.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } } @@ -1617,7 +1622,7 @@ void implementSomPlan(NG &ng, const NGWrapper &w, u32 comp_id, NGHolder &g, for (++it; it != plan.end(); ++it) { const u32 plan_num = it - plan.begin(); dumpSomPlan(g, *it, plan_num); - dumpSomSubComponent(*it->prefix, "04_som", w.expressionIndex, comp_id, + dumpSomSubComponent(*it->prefix, "04_som", expr.index, comp_id, plan_num, ng.cc.grey); assert(it->parent < plan_num); @@ -1628,7 +1633,7 @@ void implementSomPlan(NG &ng, const NGWrapper &w, u32 comp_id, NGHolder &g, assert(!it->no_implement); if (!buildMidfix(ng, *it, som_slot_in, som_slot_out)) { - throw CompileError(w.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } updateReportToUseRecordedSom(rm, g, it->reporters_in, som_slot_in); updateReportToUseRecordedSom(rm, g, it->reporters, som_slot_out); @@ -1639,7 +1644,7 @@ void implementSomPlan(NG &ng, const NGWrapper &w, u32 comp_id, NGHolder &g, renumber_vertices(*plan.front().prefix); assert(plan.front().prefix->kind == NFA_OUTFIX); if (!ng.addHolder(*plan.front().prefix)) { - throw CompileError(w.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } } } @@ -1727,19 +1732,19 @@ void clearProperInEdges(NGHolder &g, const NFAVertex sink) { namespace { struct SomRevNfa { - SomRevNfa(NFAVertex s, ReportID r, aligned_unique_ptr n) + SomRevNfa(NFAVertex s, ReportID r, bytecode_ptr n) : sink(s), report(r), nfa(move(n)) {} - SomRevNfa(SomRevNfa&& s) // MSVC2013 needs this for emplace - : sink(s.sink), report(s.report), nfa(move(s.nfa)) {} + SomRevNfa(SomRevNfa &&s) // MSVC2013 needs this for emplace + : sink(s.sink), report(s.report), nfa(move(s.nfa)) {} NFAVertex sink; ReportID report; - aligned_unique_ptr nfa; + bytecode_ptr nfa; }; } static -aligned_unique_ptr makeBareSomRevNfa(const NGHolder &g, - const CompileContext &cc) { +bytecode_ptr makeBareSomRevNfa(const NGHolder &g, + const CompileContext &cc) { // Create a reversed anchored version of this NFA which fires a zero report // ID on accept. NGHolder g_rev; @@ -1755,7 +1760,7 @@ aligned_unique_ptr makeBareSomRevNfa(const NGHolder &g, DEBUG_PRINTF("building a rev NFA with %zu vertices\n", num_vertices(g_rev)); - aligned_unique_ptr nfa = constructReversedNFA(g_rev, cc); + auto nfa = constructReversedNFA(g_rev, cc); if (!nfa) { return nfa; } @@ -1790,7 +1795,7 @@ bool makeSomRevNfa(vector &som_nfas, const NGHolder &g, renumber_vertices(g2); // for findMinWidth, findMaxWidth. 
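/* makeBareSomRevNfa() above compiles a reversed copy of the prefix graph so
 * the runtime can scan backwards from the match end to recover the start of
 * match. The core graph operation is a transpose: every edge (u, v) becomes
 * (v, u). A minimal adjacency-list sketch of that step, not the NGHolder
 * machinery itself. */
#include <cstddef>
#include <vector>

using AdjList = std::vector<std::vector<size_t>>; // vertex -> successors

static AdjList reverseGraph(const AdjList &g) {
    AdjList rev(g.size());
    for (size_t u = 0; u < g.size(); u++) {
        for (size_t v : g[u]) {
            rev[v].push_back(u); // flip the direction of each edge
        }
    }
    return rev;
}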
- aligned_unique_ptr nfa = makeBareSomRevNfa(g2, cc); + auto nfa = makeBareSomRevNfa(g2, cc); if (!nfa) { DEBUG_PRINTF("couldn't build rev nfa\n"); return false; @@ -1852,7 +1857,7 @@ bool doSomRevNfa(NG &ng, NGHolder &g, const CompileContext &cc) { } static -u32 doSomRevNfaPrefix(NG &ng, const NGWrapper &w, NGHolder &g, +u32 doSomRevNfaPrefix(NG &ng, const ExpressionInfo &expr, NGHolder &g, const CompileContext &cc) { depth maxWidth = findMaxWidth(g); @@ -1861,7 +1866,7 @@ u32 doSomRevNfaPrefix(NG &ng, const NGWrapper &w, NGHolder &g, auto nfa = makeBareSomRevNfa(g, cc); if (!nfa) { - throw CompileError(w.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } if (ng.cc.streaming) { @@ -2055,8 +2060,8 @@ void roseAddHaigLiteral(RoseBuild &tb, const shared_ptr &prefix, } static -sombe_rv doHaigLitSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, - som_type som, +sombe_rv doHaigLitSom(NG &ng, NGHolder &g, const ExpressionInfo &expr, + u32 comp_id, som_type som, const ue2::unordered_map ®ions, const map &info, map::const_iterator lower_bound) { @@ -2073,11 +2078,11 @@ sombe_rv doHaigLitSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, const u32 numSomLocsBefore = ssm.numSomSlots(); /* for rollback */ u32 som_loc = ssm.getPrivateSomSlot(); - if (!checkRose(rm, g, false, cc) && !isImplementableNFA(g, &rm, cc)) { + if (!checkViolet(rm, g, false, cc) && !isImplementableNFA(g, &rm, cc)) { // This is an optimisation: if we can't build a Haig from a portion of // the graph, then we won't be able to manage it as an outfix either // when we fall back. - throw CompileError(w.expressionIndex, "Pattern is too large."); + throw CompileError(expr.index, "Pattern is too large."); } while (1) { @@ -2152,7 +2157,7 @@ sombe_rv doHaigLitSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, goto next_try; } - implementSomPlan(ng, w, comp_id, g, plan, som_loc); + implementSomPlan(ng, expr, comp_id, g, plan, som_loc); Report ir = makeCallback(0U, 0); assert(!plan.empty()); @@ -2877,7 +2882,7 @@ unique_ptr makePrefixForChain(NGHolder &g, return prefix; } -sombe_rv doSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, +sombe_rv doSom(NG &ng, NGHolder &g, const ExpressionInfo &expr, u32 comp_id, som_type som) { assert(som); DEBUG_PRINTF("som hello\n"); @@ -3001,7 +3006,7 @@ sombe_rv doSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, /* create prefix to set the som_loc */ updatePrefixReports(rm, *prefix, INTERNAL_SOM_LOC_SET_IF_UNSET); if (prefix_by_rev) { - u32 rev_comp_id = doSomRevNfaPrefix(ng, w, *prefix, cc); + u32 rev_comp_id = doSomRevNfaPrefix(ng, expr, *prefix, cc); updatePrefixReportsRevNFA(rm, *prefix, rev_comp_id); } renumber_vertices(*prefix); @@ -3084,18 +3089,18 @@ sombe_rv doSom(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, updatePrefixReports(rm, *prefix, INTERNAL_SOM_LOC_SET); } if (prefix_by_rev && !plan.front().no_implement) { - u32 rev_comp_id = doSomRevNfaPrefix(ng, w, *prefix, cc); + u32 rev_comp_id = doSomRevNfaPrefix(ng, expr, *prefix, cc); updatePrefixReportsRevNFA(rm, *prefix, rev_comp_id); } - implementSomPlan(ng, w, comp_id, g, plan, som_loc); + implementSomPlan(ng, expr, comp_id, g, plan, som_loc); DEBUG_PRINTF("success\n"); return SOMBE_HANDLED_INTERNAL; } -sombe_rv doSomWithHaig(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, - som_type som) { +sombe_rv doSomWithHaig(NG &ng, NGHolder &g, const ExpressionInfo &expr, + u32 comp_id, som_type som) { assert(som); DEBUG_PRINTF("som+haig 
hello\n"); @@ -3132,7 +3137,7 @@ sombe_rv doSomWithHaig(NG &ng, NGHolder &g, const NGWrapper &w, u32 comp_id, buildRegionMapping(g, regions, info, true); sombe_rv rv = - doHaigLitSom(ng, g, w, comp_id, som, regions, info, info.begin()); + doHaigLitSom(ng, g, expr, comp_id, som, regions, info, info.begin()); if (rv == SOMBE_FAIL) { clear_graph(g); cloneHolder(g, g_pristine); diff --git a/src/nfagraph/ng_som.h b/src/nfagraph/ng_som.h index 707109454..ecae4c67f 100644 --- a/src/nfagraph/ng_som.h +++ b/src/nfagraph/ng_som.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,12 +34,14 @@ #define NG_SOM_H #include "som/som.h" +#include "ue2common.h" namespace ue2 { +class ExpressionInfo; class NG; class NGHolder; -class NGWrapper; +class ReportManager; struct Grey; enum sombe_rv { @@ -63,14 +65,14 @@ enum sombe_rv { * May throw a "Pattern too large" exception if prefixes of the * pattern are too large to compile. */ -sombe_rv doSom(NG &ng, NGHolder &h, const NGWrapper &w, u32 comp_id, +sombe_rv doSom(NG &ng, NGHolder &h, const ExpressionInfo &expr, u32 comp_id, som_type som); /** Returns SOMBE_FAIL (and the original graph) if SOM cannot be established. * May also throw pattern too large if prefixes of the pattern are too large to * compile. */ -sombe_rv doSomWithHaig(NG &ng, NGHolder &h, const NGWrapper &w, u32 comp_id, - som_type som); +sombe_rv doSomWithHaig(NG &ng, NGHolder &h, const ExpressionInfo &expr, + u32 comp_id, som_type som); void makeReportsSomPass(ReportManager &rm, NGHolder &g); diff --git a/src/nfagraph/ng_som_util.cpp b/src/nfagraph/ng_som_util.cpp index c43373415..a3b6ee5fd 100644 --- a/src/nfagraph/ng_som_util.cpp +++ b/src/nfagraph/ng_som_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -78,8 +78,8 @@ vector getDistancesFromSOM(const NGHolder &g_orig) { //dumpGraph("som_depth.dot", g); - vector temp_depths; // numbered by vertex index in g - calcDepthsFrom(g, g.start, temp_depths); + // Find depths, indexed by vertex index in g + auto temp_depths = calcDepthsFrom(g, g.start); // Transfer depths, indexed by vertex index in g_orig. vector depths(num_vertices(g_orig)); @@ -94,7 +94,7 @@ vector getDistancesFromSOM(const NGHolder &g_orig) { if (v_orig == g_orig.startDs || is_virtual_start(v_orig, g_orig)) { // StartDs and virtual starts always have zero depth. - d = DepthMinMax(0, 0); + d = DepthMinMax(depth(0), depth(0)); } else { u32 new_idx = g[v_new].index; d = temp_depths.at(new_idx); diff --git a/src/nfagraph/ng_stop.cpp b/src/nfagraph/ng_stop.cpp index e601f5411..c335540ac 100644 --- a/src/nfagraph/ng_stop.cpp +++ b/src/nfagraph/ng_stop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -60,10 +60,9 @@ namespace { /** Depths from start, startDs for this graph. 
*/ struct InitDepths { - explicit InitDepths(const NGHolder &g) { - calcDepthsFrom(g, g.start, start); - calcDepthsFrom(g, g.startDs, startDs); - } + explicit InitDepths(const NGHolder &g) + : start(calcDepthsFrom(g, g.start)), + startDs(calcDepthsFrom(g, g.startDs)) {} depth maxDist(const NGHolder &g, NFAVertex v) const { u32 idx = g[v].index; diff --git a/src/nfagraph/ng_undirected.h b/src/nfagraph/ng_undirected.h index 7df6c7dc4..1e27ad791 100644 --- a/src/nfagraph/ng_undirected.h +++ b/src/nfagraph/ng_undirected.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,8 +30,8 @@ * \brief Create an undirected graph from an NFAGraph. */ -#ifndef NG_UNDIRECTED_H_CB42C71CF38E3D -#define NG_UNDIRECTED_H_CB42C71CF38E3D +#ifndef NG_UNDIRECTED_H +#define NG_UNDIRECTED_H #include "ng_holder.h" #include "ng_util.h" @@ -52,13 +52,13 @@ namespace ue2 { * of parallel edges. The only vertex property constructed is \a * vertex_index_t. */ -typedef boost::adjacency_list > -NFAUndirectedGraph; +using NFAUndirectedGraph = boost::adjacency_list< + boost::listS, // out edges + boost::listS, // vertices + boost::undirectedS, // graph is undirected + boost::property>; // vertex properties -typedef NFAUndirectedGraph::vertex_descriptor NFAUndirectedVertex; +using NFAUndirectedVertex = NFAUndirectedGraph::vertex_descriptor; /** * Make a copy of an NFAGraph with undirected edges, optionally without start @@ -67,15 +67,17 @@ typedef NFAUndirectedGraph::vertex_descriptor NFAUndirectedVertex; * Note that new vertex indices are assigned contiguously in \a vertices(g) * order. */ -template -void createUnGraph(const GraphT &g, +template +NFAUndirectedGraph createUnGraph(const Graph &g, bool excludeStarts, bool excludeAccepts, - NFAUndirectedGraph &ug, - ue2::unordered_map &old2new) { + unordered_map &old2new) { + NFAUndirectedGraph ug; size_t idx = 0; - typedef typename GraphT::vertex_descriptor VertexT; + + assert(old2new.empty()); + old2new.reserve(num_vertices(g)); for (auto v : ue2::vertices_range(g)) { // skip all accept nodes @@ -88,32 +90,47 @@ void createUnGraph(const GraphT &g, continue; } - NFAUndirectedVertex nuv = boost::add_vertex(ug); - old2new[v] = nuv; + auto nuv = boost::add_vertex(ug); + old2new.emplace(v, nuv); boost::put(boost::vertex_index, ug, nuv, idx++); } + // Track seen edges so that we don't insert parallel edges. + using Vertex = typename Graph::vertex_descriptor; + unordered_set> seen; + seen.reserve(num_edges(g)); + auto make_ordered_edge = [](Vertex a, Vertex b) { + return std::make_pair(std::min(a, b), std::max(a, b)); + }; + for (const auto &e : ue2::edges_range(g)) { - VertexT src = source(e, g); - VertexT targ = target(e, g); + auto u = source(e, g); + auto v = target(e, g); - if ((excludeAccepts && is_any_accept(src, g)) - || (excludeStarts && is_any_start(src, g))) { + if ((excludeAccepts && is_any_accept(u, g)) + || (excludeStarts && is_any_start(u, g))) { continue; } - if ((excludeAccepts && is_any_accept(targ, g)) - || (excludeStarts && is_any_start(targ, g))) { + if ((excludeAccepts && is_any_accept(v, g)) + || (excludeStarts && is_any_start(v, g))) { continue; } - NFAUndirectedVertex new_src = old2new[src]; - NFAUndirectedVertex new_targ = old2new[targ]; + if (!seen.emplace(make_ordered_edge(u, v)).second) { + continue; // skip parallel edge. 
+ } - boost::add_edge(new_src, new_targ, ug); + NFAUndirectedVertex new_u = old2new.at(u); + NFAUndirectedVertex new_v = old2new.at(v); + + boost::add_edge(new_u, new_v, ug); } + + assert(!has_parallel_edge(ug)); + return ug; } } // namespace ue2 -#endif /* NG_UNDIRECTED_H_CB42C71CF38E3D */ +#endif /* NG_UNDIRECTED_H */ diff --git a/src/nfagraph/ng_utf8.cpp b/src/nfagraph/ng_utf8.cpp index 383aa142d..89500fe39 100644 --- a/src/nfagraph/ng_utf8.cpp +++ b/src/nfagraph/ng_utf8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "ng.h" #include "ng_prune.h" #include "ng_util.h" +#include "compiler/compiler.h" #include "util/graph_range.h" #include "util/unicode_def.h" @@ -45,14 +46,14 @@ using namespace std; namespace ue2 { static -void allowIllegal(NGWrapper &w, NFAVertex v, u8 pred_char) { - if (in_degree(v, w) != 1) { +void allowIllegal(NGHolder &g, NFAVertex v, u8 pred_char) { + if (in_degree(v, g) != 1) { DEBUG_PRINTF("unexpected pred\n"); assert(0); /* should be true due to the early stage of this analysis */ return; } - CharReach &cr = w[v].char_reach; + CharReach &cr = g[v].char_reach; if (pred_char == 0xe0) { assert(cr.isSubsetOf(CharReach(0xa0, 0xbf))); if (cr == CharReach(0xa0, 0xbf)) { @@ -79,8 +80,8 @@ void allowIllegal(NGWrapper &w, NFAVertex v, u8 pred_char) { * above \\x{10ffff} or they represent overlong encodings. As we require valid * UTF-8 input, we have no defined behaviour in these cases, as a result we can * accept them if it simplifies the graph. */ -void relaxForbiddenUtf8(NGWrapper &w) { - if (!w.utf8) { +void relaxForbiddenUtf8(NGHolder &g, const ExpressionInfo &expr) { + if (!expr.utf8) { return; } @@ -88,12 +89,12 @@ void relaxForbiddenUtf8(NGWrapper &w) { const CharReach f0(0xf0); const CharReach f4(0xf4); - for (auto v : vertices_range(w)) { - const CharReach &cr = w[v].char_reach; + for (auto v : vertices_range(g)) { + const CharReach &cr = g[v].char_reach; if (cr == e0 || cr == f0 || cr == f4) { u8 pred_char = cr.find_first(); - for (auto t : adjacent_vertices_range(v, w)) { - allowIllegal(w, t, pred_char); + for (auto t : adjacent_vertices_range(v, g)) { + allowIllegal(g, t, pred_char); } } } diff --git a/src/nfagraph/ng_utf8.h b/src/nfagraph/ng_utf8.h index e1b08e405..7c4288336 100644 --- a/src/nfagraph/ng_utf8.h +++ b/src/nfagraph/ng_utf8.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,7 @@ namespace ue2 { -class NGWrapper; +class ExpressionInfo; class NGHolder; /** \brief Relax forbidden UTF-8 sequences. @@ -44,7 +44,7 @@ class NGHolder; * above \\x{10ffff} or they represent overlong encodings. As we require valid * UTF-8 input, we have no defined behaviour in these cases, as a result we can * accept them if it simplifies the graph. 
*/ -void relaxForbiddenUtf8(NGWrapper &w); +void relaxForbiddenUtf8(NGHolder &g, const ExpressionInfo &expr); /** \brief Contract cycles of UTF-8 code points down to a single cyclic vertex * where possible, based on the assumption that we will always be matching diff --git a/src/nfagraph/ng_util.cpp b/src/nfagraph/ng_util.cpp index 5252eb18d..0776fa044 100644 --- a/src/nfagraph/ng_util.cpp +++ b/src/nfagraph/ng_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #include "grey.h" #include "ng_depth.h" // for NFAVertexDepth #include "ng_dump.h" +#include "ng_prune.h" #include "ue2common.h" #include "nfa/limex_limits.h" // for NFA_MAX_TOP_MASKS. #include "parser/position.h" @@ -43,6 +44,7 @@ #include "util/ue2string.h" #include "util/report_manager.h" +#include #include #include #include @@ -672,6 +674,86 @@ void reverseHolder(const NGHolder &g_in, NGHolder &g) { assert(num_edges(g) == num_edges(g_in)); } +u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 max_delay, bool overhang_ok) { + assert(isCorrectlyTopped(g)); + if (max_delay == numeric_limits::max()) { + max_delay--; + } + + DEBUG_PRINTF("killing off '%s'\n", dumpString(lit).c_str()); + set curr, next; + curr.insert(g.accept); + + auto it = lit.rbegin(); + for (u32 delay = max_delay; delay > 0 && it != lit.rend(); delay--, ++it) { + next.clear(); + for (auto v : curr) { + for (auto u : inv_adjacent_vertices_range(v, g)) { + if (u == g.start) { + if (overhang_ok) { + DEBUG_PRINTF("bail\n"); + goto bail; /* things got complicated */ + } else { + continue; /* it is not possible for a lhs literal to + * overhang the start */ + } + } + + const CharReach &cr = g[u].char_reach; + if (!overlaps(*it, cr)) { + DEBUG_PRINTF("skip\n"); + continue; + } + if (isSubsetOf(*it, cr)) { + next.insert(u); + } else { + DEBUG_PRINTF("bail\n"); + goto bail; /* things got complicated */ + } + } + } + + curr.swap(next); + } + bail: + if (curr.empty()) { + /* This can happen when we have an edge representing a cross from two + * sides of an alternation. 
This whole edge needs to be marked as + * dead */ + assert(0); /* should have been picked up by can match */ + return numeric_limits::max(); + } + + u32 delay = distance(lit.rbegin(), it); + assert(delay <= max_delay); + assert(delay <= lit.length()); + DEBUG_PRINTF("managed delay %u (of max %u)\n", delay, max_delay); + + set pred; + for (auto v : curr) { + insert(&pred, inv_adjacent_vertices_range(v, g)); + } + + clear_in_edges(g.accept, g); + clearReports(g); + + for (auto v : pred) { + NFAEdge e = add_edge(v, g.accept, g); + g[v].reports.insert(0); + if (is_triggered(g) && v == g.start) { + g[e].tops.insert(DEFAULT_TOP); + } + } + + pruneUseless(g); + assert(allMatchStatesHaveReports(g)); + assert(isCorrectlyTopped(g)); + + DEBUG_PRINTF("graph has %zu vertices left\n", num_vertices(g)); + return delay; +} + #ifndef NDEBUG bool allMatchStatesHaveReports(const NGHolder &g) { diff --git a/src/nfagraph/ng_util.h b/src/nfagraph/ng_util.h index a07525339..1d3a6f325 100644 --- a/src/nfagraph/ng_util.h +++ b/src/nfagraph/ng_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -124,6 +124,22 @@ bad_edge_filter make_bad_edge_filter(const EdgeSet *e) { return bad_edge_filter(e); } +/** \brief vertex graph filter. */ +template +struct bad_vertex_filter { + bad_vertex_filter() = default; + explicit bad_vertex_filter(const VertexSet *bad_v) : bad_vertices(bad_v) {} + bool operator()(const typename VertexSet::value_type &v) const { + return !contains(*bad_vertices, v); /* keep vertices not in bad set */ + } + const VertexSet *bad_vertices = nullptr; +}; + +template +bad_vertex_filter make_bad_vertex_filter(const VertexSet *v) { + return bad_vertex_filter(v); +} + /** Visitor that records back edges */ template class BackEdges : public boost::default_dfs_visitor { @@ -275,6 +291,11 @@ void duplicateReport(NGHolder &g, ReportID r_old, ReportID r_new); * accepts. */ void reverseHolder(const NGHolder &g, NGHolder &out); +/** \brief Returns the delay or ~0U if the graph cannot match with + * the trailing literal. */ +u32 removeTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 max_delay, bool overhang_ok = true); + #ifndef NDEBUG // Assertions: only available in internal builds. diff --git a/src/nfagraph/ng_vacuous.cpp b/src/nfagraph/ng_vacuous.cpp index 53672a1bd..d1123dff4 100644 --- a/src/nfagraph/ng_vacuous.cpp +++ b/src/nfagraph/ng_vacuous.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,29 +34,31 @@ #include "grey.h" #include "ng.h" #include "ng_util.h" +#include "compiler/compiler.h" using namespace std; namespace ue2 { static -ReportID getInternalId(ReportManager &rm, const NGWrapper &graph) { - Report ir = rm.getBasicInternalReport(graph); +ReportID getInternalId(ReportManager &rm, const ExpressionInfo &expr) { + Report ir = rm.getBasicInternalReport(expr); // Apply any extended params. 
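
Stepping back to `removeTrailingLiteralStates` in the `ng_util.cpp` hunk above: it walks backwards from `accept`, peeling off up to `max_delay` trailing characters of the literal and returning how many were removed (the "delay"); the runtime later replays those characters as a delayed literal. A toy model of the delay contract, with the graph tail reduced to a plain string (a deliberate simplification — the real code handles character classes, overhang, and the bail-out cases):

```cpp
#include <cassert>
#include <string>

unsigned trailingDelay(const std::string &tail, const std::string &lit,
                       unsigned max_delay) {
    unsigned delay = 0;
    auto t = tail.rbegin();
    auto l = lit.rbegin();
    while (delay < max_delay && t != tail.rend() && l != lit.rend() &&
           *t == *l) {
        ++delay; ++t; ++l;
    }
    return delay; // characters safely moved out of the graph
}

int main() {
    assert(trailingDelay("foobar", "bar", 8) == 3);
    assert(trailingDelay("foobar", "bar", 2) == 2); // capped by max_delay
}
```
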
- if (graph.min_offset || graph.max_offset != MAX_OFFSET) { - ir.minOffset = graph.min_offset; - ir.maxOffset = graph.max_offset; + if (expr.min_offset || expr.max_offset != MAX_OFFSET) { + ir.minOffset = expr.min_offset; + ir.maxOffset = expr.max_offset; } - assert(!graph.min_length); // should be handled elsewhere. + assert(!expr.min_length); // should be handled elsewhere. return rm.getInternalId(ir); } static -void makeFirehose(BoundaryReports &boundary, ReportManager &rm, NGWrapper &g) { - const ReportID r = getInternalId(rm, g); +void makeFirehose(BoundaryReports &boundary, ReportManager &rm, NGHolder &g, + const ExpressionInfo &expr) { + const ReportID r = getInternalId(rm, expr); boundary.report_at_0_eod.insert(r); boundary.report_at_0.insert(r); @@ -81,8 +83,8 @@ void makeFirehose(BoundaryReports &boundary, ReportManager &rm, NGWrapper &g) { static void makeAnchoredAcceptor(BoundaryReports &boundary, ReportManager &rm, - NGWrapper &g) { - boundary.report_at_0.insert(getInternalId(rm, g)); + NGHolder &g, const ExpressionInfo &expr) { + boundary.report_at_0.insert(getInternalId(rm, expr)); remove_edge(g.start, g.accept, g); remove_edge(g.start, g.acceptEod, g); g[g.start].reports.clear(); @@ -90,8 +92,8 @@ void makeAnchoredAcceptor(BoundaryReports &boundary, ReportManager &rm, static void makeEndAnchoredAcceptor(BoundaryReports &boundary, ReportManager &rm, - NGWrapper &g) { - boundary.report_at_eod.insert(getInternalId(rm, g)); + NGHolder &g, const ExpressionInfo &expr) { + boundary.report_at_eod.insert(getInternalId(rm, expr)); remove_edge(g.startDs, g.acceptEod, g); remove_edge(g.start, g.acceptEod, g); g[g.start].reports.clear(); @@ -100,18 +102,18 @@ void makeEndAnchoredAcceptor(BoundaryReports &boundary, ReportManager &rm, static void makeNothingAcceptor(BoundaryReports &boundary, ReportManager &rm, - NGWrapper &g) { - boundary.report_at_0_eod.insert(getInternalId(rm, g)); + NGHolder &g, const ExpressionInfo &expr) { + boundary.report_at_0_eod.insert(getInternalId(rm, expr)); remove_edge(g.start, g.acceptEod, g); g[g.start].reports.clear(); } bool splitOffVacuous(BoundaryReports &boundary, ReportManager &rm, - NGWrapper &g) { + NGHolder &g, const ExpressionInfo &expr) { if (edge(g.startDs, g.accept, g).second) { // e.g. 
'.*'; match "between" every byte DEBUG_PRINTF("graph is firehose\n"); - makeFirehose(boundary, rm, g); + makeFirehose(boundary, rm, g, expr); return true; } @@ -119,19 +121,19 @@ bool splitOffVacuous(BoundaryReports &boundary, ReportManager &rm, if (edge(g.start, g.accept, g).second) { DEBUG_PRINTF("creating anchored acceptor\n"); - makeAnchoredAcceptor(boundary, rm, g); + makeAnchoredAcceptor(boundary, rm, g, expr); work_done = true; } if (edge(g.startDs, g.acceptEod, g).second) { DEBUG_PRINTF("creating end-anchored acceptor\n"); - makeEndAnchoredAcceptor(boundary, rm, g); + makeEndAnchoredAcceptor(boundary, rm, g, expr); work_done = true; } if (edge(g.start, g.acceptEod, g).second) { DEBUG_PRINTF("creating nothing acceptor\n"); - makeNothingAcceptor(boundary, rm, g); + makeNothingAcceptor(boundary, rm, g, expr); work_done = true; } diff --git a/src/nfagraph/ng_vacuous.h b/src/nfagraph/ng_vacuous.h index ebbc9d17b..c33cb312d 100644 --- a/src/nfagraph/ng_vacuous.h +++ b/src/nfagraph/ng_vacuous.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,12 +36,13 @@ namespace ue2 { struct BoundaryReports; -class NGWrapper; +class ExpressionInfo; +class NGHolder; class ReportManager; // Returns true if a "vacuous" reporter was created. bool splitOffVacuous(BoundaryReports &boundary, ReportManager &rm, - NGWrapper &graph); + NGHolder &g, const ExpressionInfo &expr); } // namespace ue2 diff --git a/src/nfagraph/ng_violet.cpp b/src/nfagraph/ng_violet.cpp index 985246f03..4195045c4 100644 --- a/src/nfagraph/ng_violet.cpp +++ b/src/nfagraph/ng_violet.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,15 +38,17 @@ #include "ng_holder.h" #include "ng_is_equal.h" #include "ng_literal_analysis.h" +#include "ng_limex.h" +#include "ng_mcclellan.h" #include "ng_netflow.h" #include "ng_prune.h" #include "ng_redundancy.h" #include "ng_region.h" #include "ng_reports.h" -#include "ng_rose.h" #include "ng_split.h" #include "ng_util.h" #include "ng_width.h" +#include "nfa/rdfa.h" #include "rose/rose_build.h" #include "rose/rose_build_util.h" #include "rose/rose_in_dump.h" @@ -66,7 +68,7 @@ #include #include #include -#include +#include #include #define STAGE_DEBUG_PRINTF DEBUG_PRINTF @@ -130,6 +132,44 @@ bool createsTransientLHS(const NGHolder &g, const vector &vv, return true; } +static +double calcSplitRatio(const NGHolder &g, const vector &vv) { + flat_set not_reachable; + find_unreachable(g, vv, ¬_reachable); + double rv = (double)not_reachable.size() / num_vertices(g); + rv = rv > 0.5 ? 1 - rv : rv; + + return rv; +} + +static +size_t shorter_than(const set &s, size_t limit) { + return count_if(s.begin(), s.end(), + [&](const ue2_literal &a) { return a.length() < limit; }); +} + +static +u32 min_len(const set &s) { + u32 rv = ~0U; + + for (const auto &lit : s) { + rv = min(rv, (u32)lit.length()); + } + + return rv; +} + +static +u32 min_period(const set &s) { + u32 rv = ~0U; + + for (const auto &lit : s) { + rv = min(rv, (u32)minStringPeriod(lit)); + } + DEBUG_PRINTF("min period %u\n", rv); + return rv; +} + namespace { /** * Information on a cut: vertices and literals. 
@@ -148,18 +188,19 @@ struct VertLitInfo { bool creates_anchored = false; bool creates_transient = false; + double split_ratio = 0; }; +#define LAST_CHANCE_STRONG_LEN 1 + /** - * \brief Comparator class for sorting LitCollection::lits. - * - * This is separated out from LitCollection itself as passing LitCollection to - * std::sort() would incur a (potentially expensive) copy. + * \brief Comparator class for comparing different literal cuts. */ class LitComparator { public: - LitComparator(const NGHolder &g_in, bool sa, bool st) - : g(g_in), seeking_anchored(sa), seeking_transient(st) {} + LitComparator(const NGHolder &g_in, bool sa, bool st, bool lc) + : g(g_in), seeking_anchored(sa), seeking_transient(st), + last_chance(lc) {} bool operator()(const unique_ptr &a, const unique_ptr &b) const { assert(a && b); @@ -176,6 +217,14 @@ class LitComparator { } } + if (last_chance + && min_len(a->lit) > LAST_CHANCE_STRONG_LEN + && min_len(b->lit) > LAST_CHANCE_STRONG_LEN) { + DEBUG_PRINTF("using split ratio %g , %g\n", a->split_ratio, + b->split_ratio); + return a->split_ratio < b->split_ratio; + } + u64a score_a = scoreSet(a->lit); u64a score_b = scoreSet(b->lit); @@ -193,52 +242,29 @@ class LitComparator { bool seeking_anchored; bool seeking_transient; + bool last_chance; }; } -static -size_t shorter_than(const set &s, size_t limit) { - size_t count = 0; - - for (const auto &lit : s) { - if (lit.length() < limit) { - count++; - } - } - - return count; -} - -static -u32 min_len(const set &s) { - u32 rv = ~0U; - - for (const auto &lit : s) { - rv = min(rv, (u32)lit.length()); - } - - return rv; -} - -static -u32 min_period(const set &s) { - u32 rv = ~0U; - - for (const auto &lit : s) { - rv = min(rv, (u32)minStringPeriod(lit)); - } - DEBUG_PRINTF("min period %u\n", rv); - return rv; -} - #define MIN_ANCHORED_LEN 2 +#define MIN_ANCHORED_DESPERATE_LEN 1 +/* anchored here means that the cut creates a 'usefully' anchored LHS */ static bool validateRoseLiteralSetQuality(const set &s, u64a score, bool anchored, u32 min_allowed_floating_len, - bool desperation) { + bool desperation, bool last_chance) { u32 min_allowed_len = anchored ? MIN_ANCHORED_LEN : min_allowed_floating_len; + if (anchored && last_chance) { + min_allowed_len = MIN_ANCHORED_DESPERATE_LEN; + } + if (last_chance) { + desperation = true; + } + + DEBUG_PRINTF("validating%s set, min allowed len %u\n", + anchored ? 
" anchored" : "", min_allowed_len); assert(none_of(begin(s), end(s), bad_mixed_sensitivity)); @@ -267,6 +293,7 @@ bool validateRoseLiteralSetQuality(const set &s, u64a score, if (s.size() > 10 /* magic number is magic */ || s_min_len < min_allowed_len || (s_min_period <= 1 && min_allowed_len != 1)) { + DEBUG_PRINTF("candidate may be bad\n"); ok = false; } @@ -307,7 +334,7 @@ void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored, const set &a_dom, vector> *lits, u32 min_allowed_len, bool desperation, - const CompileContext &cc) { + bool last_chance, const CompileContext &cc) { assert(depths || !seeking_anchored); map scores; @@ -333,7 +360,7 @@ void getSimpleRoseLiterals(const NGHolder &g, bool seeking_anchored, } if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, - desperation)) { + desperation, last_chance)) { continue; } @@ -370,7 +397,7 @@ void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored, const set *allowed, vector> *lits, u32 min_allowed_len, bool desperation, - const CompileContext &cc) { + bool last_chance, const CompileContext &cc) { /* This allows us to get more places to split the graph as we are not limited to points where there is a single vertex to split at. */ @@ -490,7 +517,7 @@ void getRegionRoseLiterals(const NGHolder &g, bool seeking_anchored, } if (!validateRoseLiteralSetQuality(s, score, anchored, min_allowed_len, - desperation)) { + desperation, last_chance)) { goto next_cand; } @@ -588,6 +615,7 @@ unique_ptr findBestSplit(const NGHolder &g, bool for_prefix, u32 min_len, const set *allowed_cand, const set *disallowed_cand, + bool last_chance, const CompileContext &cc) { assert(!for_prefix || depths); @@ -634,17 +662,16 @@ unique_ptr findBestSplit(const NGHolder &g, DEBUG_PRINTF("|cand| = %zu\n", cand.size()); bool seeking_anchored = for_prefix; - bool seeking_transient = for_prefix; //cc.streaming; + bool seeking_transient = for_prefix; - /* TODO: revisit when backstop goes away */ bool desperation = for_prefix && cc.streaming; vector> lits; /**< sorted list of potential cuts */ getSimpleRoseLiterals(g, seeking_anchored, depths, cand, &lits, min_len, - desperation, cc); + desperation, last_chance, cc); getRegionRoseLiterals(g, seeking_anchored, depths, cand_raw, allowed_cand, - &lits, min_len, desperation, cc); + &lits, min_len, desperation, last_chance, cc); if (lits.empty()) { DEBUG_PRINTF("no literals found\n"); @@ -658,7 +685,14 @@ unique_ptr findBestSplit(const NGHolder &g, } } - auto cmp = LitComparator(g, seeking_anchored, seeking_transient); + if (last_chance) { + for (auto &a : lits) { + a->split_ratio = calcSplitRatio(g, a->vv); + } + } + + auto cmp = LitComparator(g, seeking_anchored, seeking_transient, + last_chance); unique_ptr best = move(lits.back()); lits.pop_back(); @@ -684,27 +718,39 @@ void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ, DEBUG_PRINTF("poisoning holder of size %zu, succ len %zu\n", num_vertices(h), succ.length()); - map > curr; + using EdgeSet = boost::dynamic_bitset<>; + + const size_t edge_count = num_edges(h); + EdgeSet bad_edges(edge_count); + + unordered_map curr; for (const auto &e : in_edges_range(h.accept, h)) { - curr[source(e, h)].insert(e); + auto &path_set = curr[source(e, h)]; + if (path_set.empty()) { + path_set.resize(edge_count); + } + path_set.set(h[e].index); } - map > next; + unordered_map next; for (auto it = succ.rbegin(); it != succ.rend(); ++it) { for (const auto &path : curr) { NFAVertex u = path.first; const auto &path_set = path.second; if 
(u == h.start && overhang_ok) { DEBUG_PRINTF("poisoning early %zu [overhang]\n", - path_set.size()); - insert(&bad, path_set); + path_set.count()); + bad_edges |= path_set; continue; } if (overlaps(h[u].char_reach, *it)) { for (const auto &e : in_edges_range(u, h)) { auto &new_path_set = next[source(e, h)]; - insert(&new_path_set, path_set); - new_path_set.insert(e); + if (new_path_set.empty()) { + new_path_set.resize(edge_count); + } + new_path_set |= path_set; + new_path_set.set(h[e].index); } } } @@ -716,8 +762,14 @@ void poisonFromSuccessor(const NGHolder &h, const ue2_literal &succ, assert(overhang_ok || !curr.empty()); for (const auto &path : curr) { - insert(&bad, path.second); - DEBUG_PRINTF("poisoning %zu vertices\n", path.second.size()); + bad_edges |= path.second; + DEBUG_PRINTF("poisoning %zu vertices\n", path.second.count()); + } + + for (const auto &e : edges_range(h)) { + if (bad_edges.test(h[e].index)) { + bad.insert(e); + } } } @@ -733,6 +785,11 @@ void poisonForGoodPrefix(const NGHolder &h, } } +static UNUSED +bool is_any_accept_type(RoseInVertexType t) { + return t == RIV_ACCEPT || t == RIV_ACCEPT_EOD; +} + static flat_set poisonEdges(const NGHolder &h, const vector *depths, @@ -746,7 +803,8 @@ flat_set poisonEdges(const NGHolder &h, for (const RoseInEdge &ve : ee) { if (vg[target(ve, vg)].type != RIV_LITERAL) { /* nothing to poison in suffixes/outfixes */ - assert(vg[target(ve, vg)].type == RIV_ACCEPT); + assert(generates_callbacks(h)); + assert(is_any_accept_type(vg[target(ve, vg)].type)); continue; } succs.insert({vg[target(ve, vg)].s, @@ -793,7 +851,19 @@ unique_ptr findBestNormalSplit(const NGHolder &g, set bad_vertices = poisonVertices(g, vg, ee, cc.grey); return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength, - nullptr, &bad_vertices, cc); + nullptr, &bad_vertices, false, cc); +} + +static +unique_ptr findBestLastChanceSplit(const NGHolder &g, + const RoseInGraph &vg, + const vector &ee, + const CompileContext &cc) { + assert(g.kind == NFA_OUTFIX || g.kind == NFA_INFIX || g.kind == NFA_SUFFIX); + set bad_vertices = poisonVertices(g, vg, ee, cc.grey); + + return findBestSplit(g, nullptr, false, cc.grey.minRoseLiteralLength, + nullptr, &bad_vertices, true, cc); } static @@ -870,11 +940,12 @@ unique_ptr findBestPrefixSplit(const NGHolder &g, const vector &depths, const RoseInGraph &vg, const vector &ee, + bool last_chance, const CompileContext &cc) { - assert(g.kind == NFA_PREFIX); + assert(g.kind == NFA_PREFIX || g.kind == NFA_OUTFIX); set bad_vertices = poisonVertices(g, vg, ee, cc.grey); auto rv = findBestSplit(g, &depths, true, cc.grey.minRoseLiteralLength, - nullptr, &bad_vertices, cc); + nullptr, &bad_vertices, last_chance, cc); /* large back edges may prevent us identifying anchored or transient cases * properly - use a simple walk instead */ @@ -905,7 +976,7 @@ unique_ptr findBestCleanSplit(const NGHolder &g, return nullptr; } return findBestSplit(g, nullptr, false, cc.grey.violetEarlyCleanLiteralLen, - &cleanSplits, nullptr, cc); + &cleanSplits, nullptr, false, cc); } static @@ -961,7 +1032,7 @@ bool splitRoseEdge(const NGHolder &base_graph, RoseInGraph &vg, to_string(lhs->kind).c_str(), num_vertices(*lhs), to_string(rhs->kind).c_str(), num_vertices(*rhs)); - bool suffix = vg[target(ee.front(), vg)].type == RIV_ACCEPT; + bool suffix = generates_callbacks(base_graph); if (is_triggered(base_graph)) { /* if we are already guarded, check if the split reduces the size of @@ -1377,12 +1448,11 @@ RoseInGraph populateTrivialGraph(const NGHolder &h) { 
} static -void avoidOutfixes(RoseInGraph &vg, const CompileContext &cc) { +void avoidOutfixes(RoseInGraph &vg, bool last_chance, + const CompileContext &cc) { STAGE_DEBUG_PRINTF("AVOIDING OUTFIX\n"); - if (num_vertices(vg) > 2) { - /* must be at least one literal aside from start and accept */ - return; - } + assert(num_vertices(vg) == 2); + assert(num_edges(vg) == 1); RoseInEdge e = *edges(vg).first; @@ -1392,13 +1462,27 @@ void avoidOutfixes(RoseInGraph &vg, const CompileContext &cc) { renumber_vertices(h); renumber_edges(h); - unique_ptr split = findBestNormalSplit(h, vg, {e}, cc); + unique_ptr split = findBestNormalSplit(h, vg, {e}, cc); if (split && splitRoseEdge(h, vg, {e}, *split)) { DEBUG_PRINTF("split on simple literal\n"); - } else { - doNetflowCut(h, nullptr, vg, {e}, false, cc.grey); + return; } + + if (last_chance) { + /* look for a prefix split as it allows us to accept very weak anchored + * literals. */ + auto depths = calcDepths(h); + + split = findBestPrefixSplit(h, depths, vg, {e}, last_chance, cc); + + if (split && splitRoseEdge(h, vg, {e}, *split)) { + DEBUG_PRINTF("split on simple literal\n"); + return; + } + } + + doNetflowCut(h, nullptr, vg, {e}, false, cc.grey); } static @@ -1463,6 +1547,11 @@ void removeRedundantLiteralsFromPrefixes(RoseInGraph &g, continue; } + if (g[e].graph_lag) { + /* already removed redundant parts of literals */ + continue; + } + assert(!g[t].delay); const ue2_literal &lit = g[t].s; @@ -1564,20 +1653,22 @@ void removeRedundantLiteralsFromInfix(const NGHolder &h, RoseInGraph &ig, * taking into account overlap of successor literals. */ set preds; + set succs; for (const RoseInEdge &e : ee) { RoseInVertex u = source(e, ig); assert(ig[u].type == RIV_LITERAL); - assert(!ig[e].graph_lag); assert(!ig[u].delay); preds.insert(ig[u].s); - } - set succs; - for (const RoseInEdge &e : ee) { RoseInVertex v = target(e, ig); assert(ig[v].type == RIV_LITERAL); assert(!ig[v].delay); succs.insert(ig[v].s); + + if (ig[e].graph_lag) { + /* already removed redundant parts of literals */ + return; + } } map, u32> > graphs; /* + delay */ @@ -1818,6 +1909,59 @@ bool makeTransientFromLongLiteral(NGHolder &h, RoseInGraph &vg, return true; } +static +void restoreTrailingLiteralStates(NGHolder &g, const ue2_literal &lit, + u32 delay, const vector &preds) { + assert(delay <= lit.length()); + assert(isCorrectlyTopped(g)); + DEBUG_PRINTF("adding on '%s' %u\n", dumpString(lit).c_str(), delay); + + NFAVertex prev = g.accept; + auto it = lit.rbegin(); + while (delay--) { + NFAVertex curr = add_vertex(g); + assert(it != lit.rend()); + g[curr].char_reach = *it; + add_edge(curr, prev, g); + ++it; + prev = curr; + } + + for (auto v : preds) { + NFAEdge e = add_edge_if_not_present(v, prev, g); + if (v == g.start && is_triggered(g)) { + g[e].tops.insert(DEFAULT_TOP); + } + } + + // Every predecessor of accept must have a report. 
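
The `restoreTrailingLiteralStates` helpers being added here are the inverse of `removeTrailingLiteralStates` from `ng_util.cpp`: one strips `delay` trailing characters of a literal from the graph, the other re-appends exactly that many in front of `accept`. The round-trip invariant, modelled on strings (a sketch, not the graph code):

```cpp
#include <cassert>
#include <string>

std::string removeTrailing(const std::string &g, unsigned delay) {
    return g.substr(0, g.size() - delay);
}

std::string restoreTrailing(const std::string &g, const std::string &lit,
                            unsigned delay) {
    return g + lit.substr(lit.size() - delay); // last `delay` chars of lit
}

int main() {
    const std::string g = "xyzbar", lit = "bar";
    const unsigned delay = 3;
    assert(restoreTrailing(removeTrailing(g, delay), lit, delay) == g);
}
```
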
+ set_report(g, 0); + + renumber_vertices(g); + renumber_edges(g); + assert(allMatchStatesHaveReports(g)); + assert(isCorrectlyTopped(g)); +} + +static +void restoreTrailingLiteralStates(NGHolder &g, + const vector> &lits) { + vector preds; + insert(&preds, preds.end(), inv_adjacent_vertices(g.accept, g)); + clear_in_edges(g.accept, g); + + for (auto v : preds) { + g[v].reports.clear(); /* clear report from old accepts */ + } + + for (const auto &p : lits) { + const ue2_literal &lit = p.first; + u32 delay = p.second; + + restoreTrailingLiteralStates(g, lit, delay, preds); + } +} + static bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, const CompileContext &cc) { @@ -1828,8 +1972,7 @@ bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, renumber_vertices(h); renumber_edges(h); - vector depths; - calcDepths(h, depths); + auto depths = calcDepths(h); /* If the reason the prefix is not transient is due to a very long literal * following, we can make it transient by restricting ourselves to using @@ -1838,7 +1981,7 @@ bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, return true; } - unique_ptr split = findBestPrefixSplit(h, depths, vg, ee, cc); + auto split = findBestPrefixSplit(h, depths, vg, ee, false, cc); if (split && (split->creates_transient || split->creates_anchored) && splitRoseEdge(h, vg, ee, *split)) { @@ -1897,27 +2040,18 @@ bool improvePrefix(NGHolder &h, RoseInGraph &vg, const vector &ee, trimmed.clear(); for (auto &elem : trimmed_vec) { shared_ptr &hp = elem.first; - NGHolder &eh = *hp; - - vector base_states; - insert(&base_states, base_states.end(), - inv_adjacent_vertices(eh.accept, eh)); - clear_in_edges(eh.accept, eh); - - for (auto v : base_states) { - eh[v].reports.clear(); /* clear report from old accepts */ - } + vector> succ_lits; for (const auto &edge_delay : elem.second) { const RoseInEdge &e = edge_delay.first; u32 delay = edge_delay.second; - auto succ_lit = vg[target(e, vg)].s; + auto lit = vg[target(e, vg)].s; vg[e].graph = hp; - assert(delay <= succ_lit.length()); - restoreTrailingLiteralStates(*vg[e].graph, succ_lit, delay, - base_states); + assert(delay <= lit.length()); + succ_lits.emplace_back(lit, delay); } + restoreTrailingLiteralStates(*hp, succ_lits); } return true; } @@ -2234,7 +2368,7 @@ bool replaceSuffixWithInfix(const NGHolder &h, RoseInGraph &vg, if (vli.lit.empty() || !validateRoseLiteralSetQuality(vli.lit, score, false, min_len, - false)) { + false, false)) { return false; } } @@ -2616,24 +2750,215 @@ void rehomeEodSuffixes(RoseInGraph &vg) { /* old accept vertices will be tidied up by final pruneUseless() call */ } -bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, - const CompileContext &cc) { - assert(!can_never_match(h)); +static +bool tryForEarlyDfa(const NGHolder &h, const CompileContext &cc) { + switch (h.kind) { + case NFA_OUTFIX: /* 'prefix' of eod */ + case NFA_PREFIX: + return cc.grey.earlyMcClellanPrefix; + case NFA_INFIX: + return cc.grey.earlyMcClellanInfix; + case NFA_SUFFIX: + return cc.grey.earlyMcClellanSuffix; + default: + DEBUG_PRINTF("kind %u\n", (u32)h.kind); + assert(0); + return false; + } +} - if (!cc.grey.allowViolet) { +static +vector> getDfaTriggers(RoseInGraph &vg, + const vector &edges, + bool *single_trigger) { + vector> triggers; + u32 min_offset = ~0U; + u32 max_offset = 0; + for (const auto &e : edges) { + RoseInVertex s = source(e, vg); + if (vg[s].type == RIV_LITERAL) { + triggers.push_back(as_cr_seq(vg[s].s)); + } + ENSURE_AT_LEAST(&max_offset, 
vg[s].max_offset); + LIMIT_TO_AT_MOST(&min_offset, vg[s].min_offset); + } + + *single_trigger = min_offset == max_offset; + DEBUG_PRINTF("trigger offset (%u, %u)\n", min_offset, max_offset); + + return triggers; +} + +static +bool doEarlyDfa(RoseBuild &rose, RoseInGraph &vg, NGHolder &h, + const vector &edges, bool final_chance, + const ReportManager &rm, const CompileContext &cc) { + DEBUG_PRINTF("trying for dfa\n"); + + bool single_trigger; + for (const auto &e : edges) { + if (vg[target(e, vg)].type == RIV_ACCEPT_EOD) { + /* TODO: support eod prefixes */ + return false; + } + } + + auto triggers = getDfaTriggers(vg, edges, &single_trigger); + + /* TODO: literal delay things */ + if (!generates_callbacks(h)) { + set_report(h, rose.getNewNfaReport()); + } + + shared_ptr dfa = buildMcClellan(h, &rm, single_trigger, triggers, + cc.grey, final_chance); + + if (!dfa) { return false; } - DEBUG_PRINTF("hello world\n"); + DEBUG_PRINTF("dfa ok\n"); + for (const auto &e : edges) { + vg[e].dfa = dfa; + } + + return true; +} + +#define MAX_EDGES_FOR_IMPLEMENTABILITY 50 + +static +bool splitForImplementabilty(RoseInGraph &vg, NGHolder &h, + const vector &edges, + const CompileContext &cc) { + vector> succ_lits; + DEBUG_PRINTF("trying to split %s with %zu vertices on %zu edges\n", + to_string(h.kind).c_str(), num_vertices(h), edges.size()); + + if (edges.size() > MAX_EDGES_FOR_IMPLEMENTABILITY) { + return false; + } + + if (!generates_callbacks(h)) { + for (const auto &e : edges) { + const auto &lit = vg[target(e, vg)].s; + u32 delay = vg[e].graph_lag; + vg[e].graph_lag = 0; + + assert(delay <= lit.length()); + succ_lits.emplace_back(lit, delay); + } + restoreTrailingLiteralStates(h, succ_lits); + } + + unique_ptr split; + bool last_chance = true; + if (h.kind == NFA_PREFIX) { + auto depths = calcDepths(h); + + split = findBestPrefixSplit(h, depths, vg, edges, last_chance, cc); + } else { + split = findBestLastChanceSplit(h, vg, edges, cc); + } + + if (split && splitRoseEdge(h, vg, edges, *split)) { + DEBUG_PRINTF("split on simple literal\n"); + return true; + } + + DEBUG_PRINTF("trying to netflow\n"); + bool rv = doNetflowCut(h, nullptr, vg, edges, false, cc.grey); + DEBUG_PRINTF("done\n"); + + return rv; +} + +#define MAX_IMPLEMENTABLE_SPLITS 50 + +bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, + bool final_chance, const ReportManager &rm, + const CompileContext &cc) { + DEBUG_PRINTF("checking for impl %d\n", final_chance); + bool changed = false; + bool need_to_recalc = false; + u32 added_count = 0; + do { + changed = false; + DEBUG_PRINTF("added %u\n", added_count); + map > edges_by_graph; + vector graphs; + for (const RoseInEdge &ve : edges_range(vg)) { + if (vg[ve].graph) { + NGHolder *h = vg[ve].graph.get(); + if (!contains(edges_by_graph, h)) { + graphs.push_back(h); + } + edges_by_graph[h].push_back(ve); + } + } + for (NGHolder *h : graphs) { + if (isImplementableNFA(*h, &rm, cc)) { + continue; + } + + if (tryForEarlyDfa(*h, cc) + && doEarlyDfa(rose, vg, *h, edges_by_graph[h], final_chance, rm, + cc)) { + continue; + } + + DEBUG_PRINTF("eek\n"); + if (!allow_changes) { + return false; + } + + if (splitForImplementabilty(vg, *h, edges_by_graph[h], cc)) { + added_count++; + changed = true; + continue; + } + + return false; + } + + if (added_count > MAX_IMPLEMENTABLE_SPLITS) { + return false; + } + + if (changed) { + removeRedundantLiterals(vg, cc); + pruneUseless(vg); + need_to_recalc = true; + } + } while (changed); + + if (need_to_recalc) { + 
renumber_vertices(vg); + calcVertexOffsets(vg); + } + + DEBUG_PRINTF("ok!\n"); + return true; +} + +static +RoseInGraph doInitialVioletTransform(const NGHolder &h, bool last_chance, + const CompileContext &cc) { + assert(!can_never_match(h)); RoseInGraph vg = populateTrivialGraph(h); + if (!cc.grey.allowViolet) { + return vg; + } + + DEBUG_PRINTF("hello world\n"); + /* Step 1: avoid outfixes as we always have to run them. */ - avoidOutfixes(vg, cc); + avoidOutfixes(vg, last_chance, cc); if (num_vertices(vg) <= 2) { - /* only have an outfix; leave for ng_rose for now */ - return false; + return vg; /* unable to transform pattern */ } removeRedundantPrefixes(vg); @@ -2663,10 +2988,6 @@ bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, decomposeLiteralChains(vg, cc); } - /* Step 5: avoid unimplementable, or overly large engines if possible */ - /* TODO: later - ng_rose is currently acting as a backstop */ - - /* Step 6: send to rose */ rehomeEodSuffixes(vg); removeRedundantLiterals(vg, cc); @@ -2674,9 +2995,40 @@ bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, dumpPreRoseGraph(vg, cc.grey); renumber_vertices(vg); calcVertexOffsets(vg); + + return vg; +} + +bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, + bool last_chance, const ReportManager &rm, + const CompileContext &cc) { + auto vg = doInitialVioletTransform(h, last_chance, cc); + if (num_vertices(vg) <= 2) { + return false; + } + + /* Step 5: avoid unimplementable, or overly large engines if possible */ + if (!ensureImplementable(rose, vg, last_chance, last_chance, rm, cc)) { + return false; + } + dumpPreRoseGraph(vg, cc.grey, "post_ensure_rose.dot"); + + /* Step 6: send to rose */ bool rv = rose.addRose(vg, prefilter); DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail"); return rv; } +bool checkViolet(const ReportManager &rm, const NGHolder &h, bool prefilter, + const CompileContext &cc) { + auto vg = doInitialVioletTransform(h, true, cc); + if (num_vertices(vg) <= 2) { + return false; + } + + bool rv = roseCheckRose(vg, prefilter, rm, cc); + DEBUG_PRINTF("violet: %s\n", rv ? "success" : "fail"); + return rv; +} + } diff --git a/src/nfagraph/ng_violet.h b/src/nfagraph/ng_violet.h index fb62bfc0c..3fe57dbfa 100644 --- a/src/nfagraph/ng_violet.h +++ b/src/nfagraph/ng_violet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,12 +41,25 @@ class NGHolder; class RoseBuild; struct CompileContext; +class ReportManager; +struct RoseInGraph; /** \brief Attempt to consume the entire pattern in graph \a h with Rose. * Returns true if successful. */ bool doViolet(RoseBuild &rose, const NGHolder &h, bool prefilter, + bool last_chance, const ReportManager &rm, const CompileContext &cc); +bool ensureImplementable(RoseBuild &rose, RoseInGraph &vg, bool allow_changes, + bool final_chance, const ReportManager &rm, + const CompileContext &cc); + +/** \brief True if the pattern in \a h is consumable by Rose/Violet. This + * function may be conservative (return false even if supported) for + * efficiency. 
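
The `ensureImplementable` loop above amounts to a per-graph fallback chain: keep the graph if it is already implementable as an NFA, else try an early McClellan DFA where the grey-box options allow it, else split the graph again ("last chance"), giving up once the split budget is exhausted. In miniature (the predicates and the budget handling are simplified stand-ins for the real checks):

```cpp
#include <cassert>

enum class Fate { Keep, EarlyDfa, Split, Fail };

Fate resolve(bool implementable, bool dfa_ok, bool split_ok,
             unsigned &added, unsigned max_splits) {
    if (implementable) return Fate::Keep;
    if (dfa_ok)        return Fate::EarlyDfa;
    if (split_ok && added < max_splits) { ++added; return Fate::Split; }
    return Fate::Fail; // caller gives up on this pattern
}

int main() {
    unsigned added = 0;
    assert(resolve(true,  false, false, added, 50) == Fate::Keep);
    assert(resolve(false, true,  false, added, 50) == Fate::EarlyDfa);
    assert(resolve(false, false, true,  added, 50) == Fate::Split);
    assert(added == 1);
}
```
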
*/ +bool checkViolet(const ReportManager &rm, const NGHolder &h, bool prefilter, + const CompileContext &cc); + } // namespace ue2 #endif diff --git a/src/nfagraph/ng_width.cpp b/src/nfagraph/ng_width.cpp index d596b7b5d..c2e9eb1a6 100644 --- a/src/nfagraph/ng_width.cpp +++ b/src/nfagraph/ng_width.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -157,12 +157,12 @@ depth findMaxWidth(const NGHolder &h, const SpecialEdgeFilter &filter, if (colors.at(NODE_ACCEPT) == boost::white_color) { acceptDepth = depth::unreachable(); } else { - acceptDepth = -1 * distance.at(NODE_ACCEPT); + acceptDepth = depth(-1 * distance.at(NODE_ACCEPT)); } if (colors.at(NODE_ACCEPT_EOD) == boost::white_color) { acceptEodDepth = depth::unreachable(); } else { - acceptEodDepth = -1 * distance.at(NODE_ACCEPT_EOD); + acceptEodDepth = depth(-1 * distance.at(NODE_ACCEPT_EOD)); } depth d; diff --git a/src/parser/Parser.h b/src/parser/Parser.h index 45c3ac7af..a034a18fc 100644 --- a/src/parser/Parser.h +++ b/src/parser/Parser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -69,7 +69,7 @@ struct ParseMode { * * This call will throw a ParseError on failure. */ -std::unique_ptr parse(const char *const ptr, ParseMode &mode); +std::unique_ptr parse(const char *ptr, ParseMode &mode); } // namespace ue2 diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl index 53130ddf3..52b3340c6 100644 --- a/src/parser/Parser.rl +++ b/src/parser/Parser.rl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ /* Parser.cpp is a built source, may not be in same dir as parser files */ #include "parser/check_refs.h" +#include "parser/control_verbs.h" #include "parser/ComponentAlternation.h" #include "parser/ComponentAssertion.h" #include "parser/ComponentAtomicGroup.h" @@ -115,7 +116,7 @@ unsigned parseAsDecimal(unsigned oct) { static constexpr u32 MAX_NUMBER = INT_MAX; static -void pushDec(u32 *acc, u8 raw_digit) { +void pushDec(u32 *acc, char raw_digit) { assert(raw_digit >= '0' && raw_digit <= '9'); u32 digit_val = raw_digit - '0'; @@ -129,7 +130,7 @@ void pushDec(u32 *acc, u8 raw_digit) { } static -void pushOct(u32 *acc, u8 raw_digit) { +void pushOct(u32 *acc, char raw_digit) { assert(raw_digit >= '0' && raw_digit <= '7'); u32 digit_val = raw_digit - '0'; @@ -168,8 +169,7 @@ ComponentSequence *enterSequence(ComponentSequence *parent, } static -void addLiteral(ComponentSequence *currentSeq, unsigned char c, - const ParseMode &mode) { +void addLiteral(ComponentSequence *currentSeq, char c, const ParseMode &mode) { if (mode.utf8 && mode.caseless) { /* leverage ComponentClass to generate the vertices */ auto cc = getComponentClass(mode); @@ -196,7 +196,7 @@ void addEscaped(ComponentSequence *currentSeq, unichar accum, if (accum > 255) { throw LocatedParseError(err_msg); } - addLiteral(currentSeq, (unsigned char)accum, mode); + addLiteral(currentSeq, (char)accum, mode); } } @@ -216,7 +216,7 @@ void 
addEscapedHex(ComponentSequence *currentSeq, unichar accum, #define SLASH_C_ERROR "\\c must be followed by an ASCII character" static -u8 decodeCtrl(u8 raw) { +u8 decodeCtrl(char raw) { if (raw & 0x80) { throw LocatedParseError(SLASH_C_ERROR); } @@ -224,10 +224,10 @@ u8 decodeCtrl(u8 raw) { } static -unichar readUtf8CodePoint2c(const u8 *ts) { +unichar readUtf8CodePoint2c(const char *s) { + auto *ts = (const u8 *)s; assert(ts[0] >= 0xc0 && ts[0] < 0xe0); assert(ts[1] >= 0x80 && ts[1] < 0xc0); - unichar val = ts[0] & 0x1f; val <<= 6; val |= ts[1] & 0x3f; @@ -237,7 +237,8 @@ unichar readUtf8CodePoint2c(const u8 *ts) { } static -unichar readUtf8CodePoint3c(const u8 *ts) { +unichar readUtf8CodePoint3c(const char *s) { + auto *ts = (const u8 *)s; assert(ts[0] >= 0xe0 && ts[0] < 0xf0); assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0); @@ -252,7 +253,8 @@ unichar readUtf8CodePoint3c(const u8 *ts) { } static -unichar readUtf8CodePoint4c(const u8 *ts) { +unichar readUtf8CodePoint4c(const char *s) { + auto *ts = (const u8 *)s; assert(ts[0] >= 0xf0 && ts[0] < 0xf8); assert(ts[1] >= 0x80 && ts[1] < 0xc0); assert(ts[2] >= 0x80 && ts[2] < 0xc0); @@ -272,12 +274,10 @@ unichar readUtf8CodePoint4c(const u8 *ts) { %%{ machine regex; - alphtype unsigned char; - action throwUnsupportedEscape { ostringstream str; - str << "'\\" << (char)*(ts + 1) << "' at index " - << ts - ptr << " not supported in a character class."; + str << "'\\" << *(ts + 1) << "' at index " << ts - ptr + << " not supported in a character class."; throw ParseError(str.str()); } action unsupportedProperty { @@ -549,26 +549,25 @@ unichar readUtf8CodePoint4c(const u8 *ts) { ############################################################# readVerb := |* 'UTF8)' => { - if (ts != ptr + 2) { - throw LocatedParseError("(*UTF8) must be at start of " - "expression, encountered"); - } - mode.utf8 = true; - globalMode.utf8 = true; /* once you unicode, you can't stop */ - ucp_start_p = te; /* (*UCP) can appear after us */ - fret; + throw LocatedParseError("(*UTF8) must be at start of " + "expression, encountered"); + }; + 'UTF)' => { + throw LocatedParseError("(*UTF) must be at start of " + "expression, encountered"); }; 'UCP)' => { - if (ts != ucp_start_p + 2) { - throw LocatedParseError("(*UCP) must be at start of " - "expression, encountered"); - } - mode.ucp = true; - globalMode.ucp = true; /* once you unicode, you can't stop */ - fret; + throw LocatedParseError("(*UCP) must be at start of " + "expression, encountered"); }; - 'UTF16)' => { - throw LocatedParseError("(*UTF16) not supported"); + # Use the control verb mini-parser to report an error for this + # unsupported/unknown verb. + [^)]+ ')' => { + ParseMode temp_mode; + assert(ts - 2 >= ptr); // parser needs the '(*' at the start too. + read_control_verbs(ts - 2, te, (ts - 2 - ptr), temp_mode); + assert(0); // Should have thrown a parse error. 
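
On the `readUtf8CodePoint2c/3c/4c` signature changes above: the helpers mask off the UTF-8 tag bits of the lead byte, then accumulate six payload bits per continuation byte. The two-byte decoder in full (the 3- and 4-byte forms extend the same shift-and-or pattern with lead masks `0x0f` and `0x07`):

```cpp
#include <cassert>
#include <cstdint>

std::uint32_t readUtf8CodePoint2c(const char *s) {
    auto *ts = (const std::uint8_t *)s;
    assert(ts[0] >= 0xc0 && ts[0] < 0xe0); // 110xxxxx lead byte
    assert(ts[1] >= 0x80 && ts[1] < 0xc0); // 10xxxxxx continuation byte
    std::uint32_t val = ts[0] & 0x1f;
    val <<= 6;
    val |= ts[1] & 0x3f;
    return val;
}

int main() {
    assert(readUtf8CodePoint2c("\xc3\xa9") == 0xe9); // U+00E9, 'é'
}
```
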
+ throw LocatedParseError("Unknown control verb"); }; any => { throw LocatedParseError("Unknown control verb"); @@ -977,8 +976,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) { }; '\\o{' [0-7]+ '}' => { - string oct((const char *)ts + 3, te - ts - 4); - long int val = strtol(oct.c_str(), nullptr, 8); + string oct(ts + 3, te - ts - 4); + unsigned long val; + try { + val = stoul(oct, nullptr, 8); + } catch (const std::out_of_range &) { + val = MAX_UNICODE + 1; + } if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) { throw LocatedParseError("Value in \\o{...} sequence is too large"); } @@ -1002,8 +1006,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) { }; # Unicode Hex '\\x{' xdigit+ '}' => { - string hex((const char *)ts + 3, te - ts - 4); - long int val = strtol(hex.c_str(), nullptr, 16); + string hex(ts + 3, te - ts - 4); + unsigned long val; + try { + val = stoul(hex, nullptr, 16); + } catch (const std::out_of_range &) { + val = MAX_UNICODE + 1; + } if (val > MAX_UNICODE) { throw LocatedParseError("Value in \\x{...} sequence is too large"); } @@ -1092,7 +1101,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { # Literal character (any - ']') => { - currentCls->add(*ts); + currentCls->add((u8)*ts); }; ']' => { @@ -1446,7 +1455,7 @@ unichar readUtf8CodePoint4c(const u8 *ts) { // Otherwise, we interpret the first three digits as an // octal escape, and the remaining characters stand for // themselves as literals. - const u8 *s = ts; + const char *s = ts; unsigned int accum = 0; unsigned int oct_digits = 0; assert(*s == '\\'); // token starts at backslash @@ -1491,8 +1500,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) { throw LocatedParseError("Invalid reference after \\g"); }; '\\o{' [0-7]+ '}' => { - string oct((const char *)ts + 3, te - ts - 4); - long int val = strtol(oct.c_str(), nullptr, 8); + string oct(ts + 3, te - ts - 4); + unsigned long val; + try { + val = stoul(oct, nullptr, 8); + } catch (const std::out_of_range &) { + val = MAX_UNICODE + 1; + } if ((!mode.utf8 && val > 255) || val > MAX_UNICODE) { throw LocatedParseError("Value in \\o{...} sequence is too large"); } @@ -1508,8 +1522,13 @@ unichar readUtf8CodePoint4c(const u8 *ts) { }; # Unicode Hex '\\x{' xdigit+ '}' => { - string hex((const char *)ts + 3, te - ts - 4); - long int val = strtol(hex.c_str(), nullptr, 16); + string hex(ts + 3, te - ts - 4); + unsigned long val; + try { + val = stoul(hex, nullptr, 16); + } catch (const std::out_of_range &) { + val = MAX_UNICODE + 1; + } if (val > MAX_UNICODE) { throw LocatedParseError("Value in \\x{...} sequence is too large"); } @@ -1532,8 +1551,8 @@ unichar readUtf8CodePoint4c(const u8 *ts) { # A bunch of unsupported (for now) escapes escapedUnsupported => { ostringstream str; - str << "'\\" << (char)*(ts + 1) << "' at index " - << ts - ptr << " not supported."; + str << "'\\" << *(ts + 1) << "' at index " << ts - ptr + << " not supported."; throw ParseError(str.str()); }; @@ -1834,16 +1853,22 @@ unichar readUtf8CodePoint4c(const u8 *ts) { %% write data nofinal; /** \brief Main parser call, returns root Component or nullptr. */ -unique_ptr parse(const char *const c_ptr, ParseMode &globalMode) { - const u8 * const ptr = (const u8 * const)c_ptr; - const u8 *p = ptr; - const u8 *pe = ptr + strlen(c_ptr); - const u8 *eof = pe; +unique_ptr parse(const char *ptr, ParseMode &globalMode) { + assert(ptr); + + const char *p = ptr; + const char *pe = ptr + strlen(ptr); + + // First, read the control verbs, set any global mode flags and move the + // ptr forward. 
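
The `strtol` → `stoul` changes above close an overflow hole: `strtol` reports out-of-range values via `errno`/`LONG_MAX`, which the old code never checked, while `std::stoul` throws `std::out_of_range`, which is mapped to a sentinel just past the valid range so the existing "too large" error fires. The shape of the fix (`MAX_UNICODE` is assumed to be `0x10FFFF`, matching its usual definition):

```cpp
#include <cassert>
#include <stdexcept>
#include <string>

constexpr unsigned long MAX_UNICODE = 0x10FFFF;

unsigned long parseHexCapped(const std::string &hex) {
    try {
        return std::stoul(hex, nullptr, 16);
    } catch (const std::out_of_range &) {
        return MAX_UNICODE + 1; // guarantees the range check rejects it
    }
}

int main() {
    assert(parseHexCapped("10ffff") == MAX_UNICODE);
    assert(parseHexCapped("ffffffffffffffffffff") > MAX_UNICODE); // overflow
}
```
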
+ p = read_control_verbs(p, pe, 0, globalMode); + + const char *eof = pe; int cs; UNUSED int act; int top; vector stack; - const u8 *ts, *te; + const char *ts, *te; unichar accumulator = 0; unichar octAccumulator = 0; /* required as we are also accumulating for * back ref when looking for octals */ @@ -1889,9 +1914,7 @@ unique_ptr parse(const char *const c_ptr, ParseMode &globalMode) { bool inCharClassEarly = false; // Location at which the current character class began. - const u8 *currentClsBegin = p; - - const u8 *ucp_start_p = p; /* for (*UCP) verb */ + const char *currentClsBegin = p; // We throw exceptions on various parsing failures beyond this point: we // use a try/catch block here to clean up our allocated memory before we diff --git a/src/parser/buildstate.h b/src/parser/buildstate.h index 8a69f44fa..5ddaf9b23 100644 --- a/src/parser/buildstate.h +++ b/src/parser/buildstate.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,10 +35,10 @@ #include "ue2common.h" #include "position.h" +#include "util/noncopyable.h" #include #include -#include namespace ue2 { @@ -49,7 +49,7 @@ class PositionInfo; * * Abstract base class; use \ref makeGlushkovBuildState to get one of these you * can use. */ -class GlushkovBuildState : boost::noncopyable { +class GlushkovBuildState : noncopyable { public: /** \brief Represents an uninitialized state. */ static const Position POS_UNINITIALIZED; diff --git a/src/rose/rose_dump.h b/src/parser/control_verbs.h similarity index 77% rename from src/rose/rose_dump.h rename to src/parser/control_verbs.h index fe66302d2..58934ec2c 100644 --- a/src/rose/rose_dump.h +++ b/src/parser/control_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,25 +26,23 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef ROSE_DUMP_H -#define ROSE_DUMP_H - -#ifdef DUMP_SUPPORT +/** + * \file + * \brief Parser for control verbs that can occur at the beginning of a pattern. + */ -#include -#include +#ifndef CONTROL_VERBS_H +#define CONTROL_VERBS_H -struct RoseEngine; +#include "ue2common.h" namespace ue2 { -void roseDumpText(const RoseEngine *t, FILE *f); -void roseDumpInternals(const RoseEngine *t, const std::string &base); -void roseDumpComponents(const RoseEngine *t, bool dump_raw, - const std::string &base); -void roseDumpStructRaw(const RoseEngine *t, FILE *f); +struct ParseMode; + +const char *read_control_verbs(const char *ptr, const char *end, size_t start, + ParseMode &mode); } // namespace ue2 -#endif -#endif +#endif // CONTROL_VERBS_H diff --git a/src/parser/control_verbs.rl b/src/parser/control_verbs.rl new file mode 100644 index 000000000..1d3e33a9a --- /dev/null +++ b/src/parser/control_verbs.rl @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Parser for control verbs that can occur at the beginning of a pattern. + */ + +#include "parser/control_verbs.h" + +#include "parser/Parser.h" +#include "parser/parse_error.h" + +#include +#include + +using namespace std; + +namespace ue2 { + +const char *read_control_verbs(const char *ptr, const char *end, size_t start, + ParseMode &mode) { + const char *p = ptr; + const char *pe = end; + const char *eof = pe; + const char *ts, *te; + int cs; + UNUSED int act; + + %%{ + machine ControlVerbs; + + # Verbs that we recognise but do not support. + unhandledVerbs = '(*' ( + 'LIMIT_MATCH=' [0-9]+ | + 'LIMIT_RECURSION=' [0-9]+ | + 'NO_AUTO_POSSESS' | + 'NO_START_OPT' | + 'UTF16' | + 'UTF32' | + 'CR' | + 'LF' | + 'CRLF' | + 'ANYCRLF' | + 'ANY' | + 'BSR_ANYCRLF' | + 'BSR_UNICODE' + ) . ')'; + + main := |* + '(*UTF8)' | '(*UTF)' => { + mode.utf8 = true; + }; + + '(*UCP)' => { + mode.ucp = true; + }; + + unhandledVerbs => { + ostringstream str; + str << "Unsupported control verb " << string(ts, te - ts); + throw LocatedParseError(str.str()); + }; + + '(*' [^)]+ ')' => { + ostringstream str; + str << "Unknown control verb " << string(ts, te - ts); + throw LocatedParseError(str.str()); + }; + + # Anything else means we're done. 
+ any => { + fhold; + fbreak; + }; + *|; + + write data; + write init; + }%% + + try { + %% write exec; + } catch (LocatedParseError &error) { + if (ts >= ptr && ts <= pe) { + error.locate(ts - ptr + start); + } else { + error.locate(0); + } + throw; + } + + return p; +} + +} // namespace ue2 diff --git a/src/parser/parse_error.cpp b/src/parser/parse_error.cpp index 6245adb9f..e7f60b264 100644 --- a/src/parser/parse_error.cpp +++ b/src/parser/parse_error.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,9 +44,13 @@ ParseError::~ParseError() {} LocatedParseError::~LocatedParseError() {} void LocatedParseError::locate(size_t offset) { + if (finalized) { + return; + } std::ostringstream str; str << reason << " at index " << offset << "."; reason = str.str(); + finalized = true; } } diff --git a/src/parser/parse_error.h b/src/parser/parse_error.h index e727991db..4556ed5e0 100644 --- a/src/parser/parse_error.h +++ b/src/parser/parse_error.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,8 +30,8 @@ * \brief Parse/Compile exceptions. */ -#ifndef PARSE_ERROR_H_A02047D1AA16C9 -#define PARSE_ERROR_H_A02047D1AA16C9 +#ifndef PARSE_ERROR_H +#define PARSE_ERROR_H #include "util/compile_error.h" @@ -44,22 +44,24 @@ class ParseError : public CompileError { public: // Note: 'why' should describe why the error occurred and end with a // full stop, but no line break. - explicit ParseError(const std::string &why) : CompileError(why) {} + explicit ParseError(std::string why) : CompileError(std::move(why)) {} ~ParseError() override; }; class LocatedParseError : public ParseError { public: - explicit LocatedParseError(const std::string &why) : ParseError(".") { - reason = why; // don't use ParseError ctor + explicit LocatedParseError(std::string why) : ParseError(".") { + reason = std::move(why); // don't use ParseError ctor } ~LocatedParseError() override; void locate(size_t offset); +private: + bool finalized = false; //!< true when locate() has been called. }; } // namespace ue2 -#endif /* PARSE_ERROR_H_A02047D1AA16C9 */ +#endif /* PARSE_ERROR_H */ diff --git a/src/parser/shortcut_literal.cpp b/src/parser/shortcut_literal.cpp index 3f58d7526..4539836ab 100644 --- a/src/parser/shortcut_literal.cpp +++ b/src/parser/shortcut_literal.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -159,23 +159,26 @@ class ConstructLiteralVisitor : public ConstComponentVisitor { ConstructLiteralVisitor::~ConstructLiteralVisitor() {} /** \brief True if the literal expression \a expr could be added to Rose. 
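
On the `parse_error.cpp` hunk above: the new `finalized` flag makes `LocatedParseError::locate()` idempotent, so the first (innermost, most precise) location wins, and outer handlers that re-locate the exception while unwinding no longer append a second " at index N." suffix. A minimal stand-in:

```cpp
#include <cassert>
#include <cstddef>
#include <sstream>
#include <string>

struct LocatedParseError {
    std::string reason;
    bool finalized = false;
    void locate(std::size_t offset) {
        if (finalized) return;
        std::ostringstream str;
        str << reason << " at index " << offset << ".";
        reason = str.str();
        finalized = true;
    }
};

int main() {
    LocatedParseError e{"Unknown control verb"};
    e.locate(3);  // innermost handler locates the error
    e.locate(42); // outer handler: now a no-op
    assert(e.reason == "Unknown control verb at index 3.");
}
```
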
*/ -bool shortcutLiteral(NG &ng, const ParsedExpression &expr) { - assert(expr.component); +bool shortcutLiteral(NG &ng, const ParsedExpression &pe) { + assert(pe.component); if (!ng.cc.grey.allowLiteral) { return false; } + const auto &expr = pe.expr; + // XXX: don't shortcut literals with extended params (yet) - if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length) { + if (expr.min_offset || expr.max_offset != MAX_OFFSET || expr.min_length || + expr.edit_distance) { DEBUG_PRINTF("extended params not allowed\n"); return false; } ConstructLiteralVisitor vis; try { - assert(expr.component); - expr.component->accept(vis); + assert(pe.component); + pe.component->accept(vis); assert(vis.repeat_stack.empty()); } catch (const ConstructLiteralVisitor::NotLiteral&) { DEBUG_PRINTF("not a literal\n"); @@ -195,7 +198,8 @@ bool shortcutLiteral(NG &ng, const ParsedExpression &expr) { } DEBUG_PRINTF("constructed literal %s\n", dumpString(lit).c_str()); - return ng.addLiteral(lit, expr.index, expr.id, expr.highlander, expr.som); + return ng.addLiteral(lit, expr.index, expr.report, expr.highlander, + expr.som); } } // namespace ue2 diff --git a/src/rose/counting_miracle.h b/src/rose/counting_miracle.h index 76db5a77c..976208b73 100644 --- a/src/rose/counting_miracle.h +++ b/src/rose/counting_miracle.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -98,8 +98,8 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, for (; d + 16 <= d_end; d_end -= 16) { m128 data = loadu128(d_end - 16); - m128 c_lo = pshufb(mask_lo, GET_LO_4(data)); - m128 c_hi = pshufb(mask_hi, GET_HI_4(data)); + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(data)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(data)); m128 t = and128(c_lo, c_hi); u32 z1 = movemask128(eq128(t, zeroes)); count += popcount32(z1 ^ 0xffff); @@ -117,8 +117,8 @@ u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, memset(temp, poison, sizeof(temp)); memcpy(temp, d, d_end - d); m128 data = loadu128(temp); - m128 c_lo = pshufb(mask_lo, GET_LO_4(data)); - m128 c_hi = pshufb(mask_hi, GET_HI_4(data)); + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(data)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(data)); m128 t = and128(c_lo, c_hi); u32 z1 = movemask128(eq128(t, zeroes)); count += popcount32(z1 ^ 0xffff); diff --git a/src/rose/match.c b/src/rose/match.c index b641e39d8..daf81eac0 100644 --- a/src/rose/match.c +++ b/src/rose/match.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -85,19 +85,13 @@ hwlmcb_rv_t roseDelayRebuildCallback(size_t start, size_t end, u32 id, DEBUG_PRINTF("STATE groups=0x%016llx\n", tctx->groups); - const u32 *delayRebuildPrograms = - getByOffset(t, t->litDelayRebuildProgramOffset); - assert(id < t->literalCount); - const u32 program = delayRebuildPrograms[id]; - - if (program) { - const u64a som = 0; - const size_t match_len = end - start + 1; - const u8 flags = 0; - UNUSED hwlmcb_rv_t rv = roseRunProgram(t, scratch, program, som, - real_end, match_len, flags); - assert(rv != HWLM_TERMINATE_MATCHING); - } + assert(id && id < t->size); // id is a program offset + 
const u64a som = 0; + const size_t match_len = end - start + 1; + const u8 flags = 0; + UNUSED hwlmcb_rv_t rv = + roseRunProgram(t, scratch, id, som, real_end, match_len, flags); + assert(rv != HWLM_TERMINATE_MATCHING); /* we are just repopulating the delay queue, groups should be * already set from the original scan. */ @@ -156,7 +150,7 @@ hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, } if (top_squash_distance) { - assert(q->cur != q->end); + assert(q->cur < q->end); struct mq_item *last = &q->items[q->end - 1]; if (last->type == event && last->location >= loc - (s64a)top_squash_distance) { @@ -242,33 +236,13 @@ int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { */ static really_inline hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, - struct hs_scratch *scratch, u64a end, - size_t match_len, u32 id) { - DEBUG_PRINTF("id=%u\n", id); - const u32 *programs = getByOffset(t, t->litProgramOffset); - assert(id < t->literalCount); - const u64a som = 0; - const u8 flags = 0; - return roseRunProgram_i(t, scratch, programs[id], som, end, match_len, - flags); -} - -/** - * \brief Run the program for the given literal ID, with the interpreter - * out of line. - * - * Assumes not in_anchored. - */ -static really_inline -hwlmcb_rv_t roseProcessMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, u64a end, - size_t match_len, u32 id) { + struct hs_scratch *scratch, u64a end, + size_t match_len, u32 id) { DEBUG_PRINTF("id=%u\n", id); - const u32 *programs = getByOffset(t, t->litProgramOffset); - assert(id < t->literalCount); + assert(id && id < t->size); // id is an offset into bytecode const u64a som = 0; const u8 flags = 0; - return roseRunProgram(t, scratch, programs[id], som, end, match_len, flags); + return roseRunProgram_i(t, scratch, id, som, end, match_len, flags); } static rose_inline @@ -290,14 +264,17 @@ hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, roseFlushLastByteHistory(t, scratch, offset); tctxt->lastEndOffset = offset; + const u32 *programs = getByOffset(t, t->delayProgramOffset); + for (u32 it = fatbit_iterate(vicSlot, delay_count, MMB_INVALID); it != MMB_INVALID; it = fatbit_iterate(vicSlot, delay_count, it)) { - u32 literal_id = t->delay_base_id + it; - UNUSED rose_group old_groups = tctxt->groups; - DEBUG_PRINTF("DELAYED MATCH id=%u offset=%llu\n", literal_id, offset); - hwlmcb_rv_t rv = roseProcessMatch(t, scratch, offset, 0, literal_id); + DEBUG_PRINTF("DELAYED MATCH id=%u offset=%llu\n", it, offset); + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, offset, + 0, flags); DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); /* delayed literals can't safely set groups. 
@@ -322,16 +299,19 @@ hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t, struct fatbit *curr_row = getAnchoredLiteralLog(scratch)[curr_loc - 1]; u32 region_width = t->anchored_count; + const u32 *programs = getByOffset(t, t->anchoredProgramOffset); + DEBUG_PRINTF("report matches at curr loc\n"); for (u32 it = fatbit_iterate(curr_row, region_width, MMB_INVALID); it != MMB_INVALID; it = fatbit_iterate(curr_row, region_width, it)) { DEBUG_PRINTF("it = %u/%u\n", it, region_width); - u32 literal_id = t->anchored_base_id + it; rose_group old_groups = tctxt->groups; - DEBUG_PRINTF("ANCH REPLAY MATCH id=%u offset=%u\n", literal_id, - curr_loc); - hwlmcb_rv_t rv = roseProcessMatch(t, scratch, curr_loc, 0, literal_id); + DEBUG_PRINTF("ANCH REPLAY MATCH id=%u offset=%u\n", it, curr_loc); + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, curr_loc, + 0, flags); DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); /* anchored literals can't safely set groups. diff --git a/src/rose/program_runtime.h b/src/rose/program_runtime.h index e883c239e..b140a2bcd 100644 --- a/src/rose/program_runtime.h +++ b/src/rose/program_runtime.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,6 +41,7 @@ #include "miracle.h" #include "report.h" #include "rose.h" +#include "rose_common.h" #include "rose_internal.h" #include "rose_program.h" #include "rose_types.h" @@ -102,7 +103,7 @@ void rosePushDelayedMatch(const struct RoseEngine *t, static rose_inline void recordAnchoredLiteralMatch(const struct RoseEngine *t, - struct hs_scratch *scratch, u32 literal_id, + struct hs_scratch *scratch, u32 anch_id, u64a end) { assert(end); @@ -112,7 +113,7 @@ void recordAnchoredLiteralMatch(const struct RoseEngine *t, struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch); - DEBUG_PRINTF("record %u @ %llu\n", literal_id, end); + DEBUG_PRINTF("record %u (of %u) @ %llu\n", anch_id, t->anchored_count, end); if (!bf64_set(&scratch->al_log_sum, end - 1)) { // first time, clear row @@ -120,11 +121,8 @@ void recordAnchoredLiteralMatch(const struct RoseEngine *t, fatbit_clear(anchoredLiteralRows[end - 1]); } - u32 rel_idx = literal_id - t->anchored_base_id; - DEBUG_PRINTF("record %u @ %llu index %u/%u\n", literal_id, end, rel_idx, - t->anchored_count); - assert(rel_idx < t->anchored_count); - fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, rel_idx); + assert(anch_id < t->anchored_count); + fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, anch_id); } static rose_inline @@ -486,7 +484,6 @@ static rose_inline hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, u64a end, ReportID onmatch, s32 offset_adjust, u32 ekey) { - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); updateLastMatchOffset(&scratch->tctxt, end); @@ -520,13 +517,11 @@ hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, } static rose_inline -void roseHandleSom(UNUSED const struct RoseEngine *t, - struct hs_scratch *scratch, const struct som_operation *sr, +void roseHandleSom(struct hs_scratch *scratch, const struct som_operation *sr, u64a end) { DEBUG_PRINTF("end=%llu, minMatchOffset=%llu\n", end, 
scratch->tctxt.minMatchOffset); - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); updateLastMatchOffset(&scratch->tctxt, end); handleSomInternal(scratch, sr, end); } @@ -535,7 +530,6 @@ static rose_inline hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, struct hs_scratch *scratch, u64a start, u64a end, ReportID onmatch, s32 offset_adjust, u32 ekey) { - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); DEBUG_PRINTF("firing som callback onmatch=%u, start=%llu, end=%llu\n", onmatch, start, end); updateLastMatchOffset(&scratch->tctxt, end); @@ -555,13 +549,11 @@ hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, } static rose_inline -void roseHandleSomSom(UNUSED const struct RoseEngine *t, - struct hs_scratch *scratch, +void roseHandleSomSom(struct hs_scratch *scratch, const struct som_operation *sr, u64a start, u64a end) { DEBUG_PRINTF("start=%llu, end=%llu, minMatchOffset=%llu\n", start, end, scratch->tctxt.minMatchOffset); - assert(!t->needsCatchup || end == scratch->tctxt.minMatchOffset); updateLastMatchOffset(&scratch->tctxt, end); setSomFromSomAware(scratch, sr, start, end); } @@ -859,13 +851,13 @@ u32 getBufferDataComplex(const struct core_info *ci, const s64a loc, } static rose_inline -m128 getData128(const struct core_info *ci, s64a offset, u16 *valid_data_mask) { +m128 getData128(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { if (offset > 0 && offset + sizeof(m128) <= ci->len) { *valid_data_mask = 0xffff; return loadu128(ci->buf + offset); } ALIGN_DIRECTIVE u8 data[sizeof(m128)]; - *valid_data_mask = (u16)getBufferDataComplex(ci, offset, data, 16); + *valid_data_mask = getBufferDataComplex(ci, offset, data, 16); return *(m128 *)data; } @@ -894,7 +886,7 @@ int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, return 0; } - u16 valid_data_mask = 0; + u32 valid_data_mask = 0; m128 data = getData128(ci, offset, &valid_data_mask); if (unlikely(!valid_data_mask)) { return 1; @@ -926,7 +918,7 @@ int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, return 0; } - u16 valid_data_mask = 0; + u32 valid_data_mask = 0; m128 data = getData128(ci, offset, &valid_data_mask); if (unlikely(!valid_data_mask)) { return 1; @@ -1022,8 +1014,9 @@ int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, static rose_inline int roseCheckSingleLookaround(const struct RoseEngine *t, const struct hs_scratch *scratch, - s8 checkOffset, u32 lookaroundIndex, u64a end) { - assert(lookaroundIndex != MO_INVALID_IDX); + s8 checkOffset, u32 lookaroundReachIndex, + u64a end) { + assert(lookaroundReachIndex != MO_INVALID_IDX); const struct core_info *ci = &scratch->core_info; DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, ci->buf_offset, ci->buf_offset + ci->len); @@ -1038,8 +1031,7 @@ int roseCheckSingleLookaround(const struct RoseEngine *t, return 0; } - const u8 *reach_base = (const u8 *)t + t->lookaroundReachOffset; - const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN; + const u8 *reach = getByOffset(t, lookaroundReachIndex); u8 c; if (offset >= 0 && offset < (s64a)ci->len) { @@ -1065,23 +1057,22 @@ int roseCheckSingleLookaround(const struct RoseEngine *t, */ static rose_inline int roseCheckLookaround(const struct RoseEngine *t, - const struct hs_scratch *scratch, u32 lookaroundIndex, + const struct hs_scratch *scratch, + u32 lookaroundLookIndex, u32 lookaroundReachIndex, u32 lookaroundCount, u64a end) { - assert(lookaroundIndex != MO_INVALID_IDX); + 
assert(lookaroundLookIndex != MO_INVALID_IDX);
+    assert(lookaroundReachIndex != MO_INVALID_IDX);
     assert(lookaroundCount > 0);
 
     const struct core_info *ci = &scratch->core_info;
     DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end,
                  ci->buf_offset, ci->buf_offset + ci->len);
 
-    const u8 *base = (const u8 *)t;
-    const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset);
-    const s8 *look = look_base + lookaroundIndex;
+    const s8 *look = getByOffset(t, lookaroundLookIndex);
     const s8 *look_end = look + lookaroundCount;
     assert(look < look_end);
 
-    const u8 *reach_base = base + t->lookaroundReachOffset;
-    const u8 *reach = reach_base + lookaroundIndex * REACH_BITVECTOR_LEN;
+    const u8 *reach = getByOffset(t, lookaroundReachIndex);
 
     // The following code assumes that the lookaround structures are ordered by
     // increasing offset.
@@ -1153,6 +1144,357 @@ int roseCheckLookaround(const struct RoseEngine *t,
     return 1;
 }
 
+/**
+ * \brief Try to find a matching path using the corresponding path mask for
+ * each lookaround location.
+ */
+static rose_inline
+int roseMultipathLookaround(const struct RoseEngine *t,
+                            const struct hs_scratch *scratch,
+                            u32 multipathLookaroundLookIndex,
+                            u32 multipathLookaroundReachIndex,
+                            u32 multipathLookaroundCount,
+                            s32 last_start, const u8 *start_mask,
+                            u64a end) {
+    assert(multipathLookaroundCount > 0);
+
+    const struct core_info *ci = &scratch->core_info;
+    DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end,
+                 ci->buf_offset, ci->buf_offset + ci->len);
+
+    const s8 *look = getByOffset(t, multipathLookaroundLookIndex);
+    const s8 *look_end = look + multipathLookaroundCount;
+    assert(look < look_end);
+
+    const u8 *reach = getByOffset(t, multipathLookaroundReachIndex);
+
+    const s64a base_offset = (s64a)end - ci->buf_offset;
+    DEBUG_PRINTF("base_offset=%lld\n", base_offset);
+
+    u8 path = 0xff;
+
+    assert(last_start < 0);
+
+    if (unlikely((u64a)(0 - last_start) > end)) {
+        DEBUG_PRINTF("too early, fail\n");
+        return 0;
+    }
+
+    s8 base_look_offset = *look;
+    do {
+        s64a offset = base_offset + *look;
+        u32 start_offset = (u32)(*look - base_look_offset);
+        DEBUG_PRINTF("start_mask[%u] = %x\n", start_offset,
+                     start_mask[start_offset]);
+        path = start_mask[start_offset];
+        if (offset >= -(s64a)ci->hlen) {
+            break;
+        }
+        DEBUG_PRINTF("look=%d before history\n", *look);
+        look++;
+        reach += MULTI_REACH_BITVECTOR_LEN;
+    } while (look < look_end);
+
+    DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look);
+    for (; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) {
+        s64a offset = base_offset + *look;
+        DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset);
+
+        if (offset >= 0) {
+            DEBUG_PRINTF("in buffer\n");
+            break;
+        }
+
+        assert(offset >= -(s64a)ci->hlen && offset < 0);
+        u8 c = ci->hbuf[ci->hlen + offset];
+        path &= reach[c];
+        DEBUG_PRINTF("reach[%x] = %02x path = %02x\n", c, reach[c], path);
+        if (!path) {
+            DEBUG_PRINTF("char 0x%02x failed reach check\n", c);
+            return 0;
+        }
+    }
+
+    DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look);
+    for (; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) {
+        s64a offset = base_offset + *look;
+        DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset);
+
+        if (offset >= (s64a)ci->len) {
+            DEBUG_PRINTF("in the future\n");
+            break;
+        }
+
+        assert(offset >= 0 && offset < (s64a)ci->len);
+        u8 c = ci->buf[offset];
+        path &= reach[c];
+        DEBUG_PRINTF("reach[%x] = %02x path = %02x\n", c, reach[c], path);
+        if (!path) {
+            DEBUG_PRINTF("char 0x%02x failed reach 
check\n", c); + return 0; + } + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +static never_inline +int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + s32 checkOffset = ri->base_offset; + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_init = getData128(ci, offset, &valid_data_mask); + m128 data_select_mask = loadu128(ri->data_select_mask); + + u32 valid_path_mask = 0; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + m128 expand_valid; + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x2(valid_hi, valid_lo); + valid_path_mask = ~movemask128(pshufb_m128(expand_valid, + data_select_mask)); + } + + m128 data = pshufb_m128(data_init, data_select_mask); + m256 nib_mask = loadu256(ri->nib_mask); + m128 bucket_select_mask = loadu128(ri->bucket_select_mask); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask16x8(data, nib_mask, + bucket_select_mask, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-16x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + s32 checkOffset = ri->base_offset; + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_double = set2x128(data_m128); + m256 data_select_mask = loadu256(ri->data_select_mask); + + u32 valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + valid_path_mask = ~movemask256(pshufb_m256(expand_valid, + data_select_mask)); + } + + m256 data = pshufb_m256(data_double, 
data_select_mask); + m256 hi_mask = loadu2x128(ri->hi_mask); + m256 lo_mask = loadu2x128(ri->lo_mask); + m256 bucket_select_mask = loadu256(ri->bucket_select_mask); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask32x8(data, hi_mask, lo_mask, + bucket_select_mask, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-32x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + const s64a base_offset = (s64a)end - ci->buf_offset; + s32 checkOffset = ri->base_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_double = set2x128(data_m128); + m256 data_select_mask = loadu256(ri->data_select_mask); + + u32 valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + valid_path_mask = ~movemask256(pshufb_m256(expand_valid, + data_select_mask)); + } + + m256 data = pshufb_m256(data_double, data_select_mask); + + m256 hi_mask_1 = loadu2x128(ri->hi_mask); + m256 hi_mask_2 = loadu2x128(ri->hi_mask + 16); + m256 lo_mask_1 = loadu2x128(ri->lo_mask); + m256 lo_mask_2 = loadu2x128(ri->lo_mask + 16); + + m256 bucket_select_mask_hi = loadu256(ri->bucket_select_mask_hi); + m256 bucket_select_mask_lo = loadu256(ri->bucket_select_mask_lo); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask32x16(data, hi_mask_1, hi_mask_2, + lo_mask_1, lo_mask_2, + bucket_select_mask_hi, + bucket_select_mask_lo, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-32x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti64(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + const s64a base_offset = (s64a)end - ci->buf_offset; + s32 checkOffset = ri->base_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, 
&valid_data_mask); + m256 data_m256 = set2x128(data_m128); + m256 data_select_mask_1 = loadu256(ri->data_select_mask); + m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32); + + u64a valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + u32 valid_path_1 = movemask256(pshufb_m256(expand_valid, + data_select_mask_1)); + u32 valid_path_2 = movemask256(pshufb_m256(expand_valid, + data_select_mask_2)); + valid_path_mask = ~((u64a)valid_path_1 | (u64a)valid_path_2 << 32); + } + + m256 data_1 = pshufb_m256(data_m256, data_select_mask_1); + m256 data_2 = pshufb_m256(data_m256, data_select_mask_2); + + m256 hi_mask = loadu2x128(ri->hi_mask); + m256 lo_mask = loadu2x128(ri->lo_mask); + + m256 bucket_select_mask_1 = loadu256(ri->bucket_select_mask); + m256 bucket_select_mask_2 = loadu256(ri->bucket_select_mask + 32); + + u64a hi_bits_mask = ri->hi_bits_mask; + u64a lo_bits_mask = ri->lo_bits_mask; + u64a neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask64(data_1, data_2, hi_mask, lo_mask, + bucket_select_mask_1, + bucket_select_mask_2, hi_bits_mask, + lo_bits_mask, neg_mask, + valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-64 successfully\n"); + return 1; + } else { + return 0; + } +} + int roseNfaEarliestSom(u64a start, u64a end, ReportID id, void *context); static rose_inline @@ -1409,6 +1751,68 @@ int roseCheckLongLiteral(const struct RoseEngine *t, return 1; } +static rose_inline +int roseCheckMediumLiteral(const struct RoseEngine *t, + const struct hs_scratch *scratch, u64a end, + u32 lit_offset, u32 lit_length, char nocase) { + const struct core_info *ci = &scratch->core_info; + const u8 *lit = getByOffset(t, lit_offset); + + DEBUG_PRINTF("check lit at %llu, length %u\n", end, lit_length); + DEBUG_PRINTF("base buf_offset=%llu\n", ci->buf_offset); + + if (end < lit_length) { + DEBUG_PRINTF("too short!\n"); + return 0; + } + + // If any portion of the literal matched in the current buffer, check it. + if (end > ci->buf_offset) { + u32 scan_len = MIN(end - ci->buf_offset, lit_length); + u64a scan_start = end - ci->buf_offset - scan_len; + DEBUG_PRINTF("checking suffix (%u bytes) in buf[%llu:%llu]\n", scan_len, + scan_start, end); + if (cmpForward(ci->buf + scan_start, lit + lit_length - scan_len, + scan_len, nocase)) { + DEBUG_PRINTF("cmp of suffix failed\n"); + return 0; + } + } + + // If the entirety of the literal was in the current block, we are done. + if (end - lit_length >= ci->buf_offset) { + DEBUG_PRINTF("literal confirmed in current block\n"); + return 1; + } + + // We still have a prefix which we must test against the history buffer. + assert(t->mode != HS_MODE_BLOCK); + + u64a lit_start_offset = end - lit_length; + u32 prefix_len = MIN(lit_length, ci->buf_offset - lit_start_offset); + u32 hist_rewind = ci->buf_offset - lit_start_offset; + DEBUG_PRINTF("hlen=%zu, hist_rewind=%u\n", ci->hlen, hist_rewind); + + // History length check required for confirm in the EOD and delayed + // rebuild paths. 
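+    // For example (illustrative values): with buf_offset=100, end=103 and an
+    // 8-byte literal, the 3-byte suffix has already been checked against the
+    // current buffer above; lit_start_offset=95, so prefix_len=5 and
+    // hist_rewind=5, meaning at least 5 bytes of history must be available
+    // for the prefix check below to proceed.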
+ if (hist_rewind > ci->hlen) { + DEBUG_PRINTF("not enough history\n"); + return 0; + } + + DEBUG_PRINTF("check prefix len=%u from hist (len %zu, rewind %u)\n", + prefix_len, ci->hlen, hist_rewind); + assert(hist_rewind <= ci->hlen); + if (cmpForward(ci->hbuf + ci->hlen - hist_rewind, lit, prefix_len, + nocase)) { + DEBUG_PRINTF("cmp of prefix failed\n"); + return 0; + } + + DEBUG_PRINTF("cmp succeeded\n"); + return 1; +} + static void updateSeqPoint(struct RoseContext *tctxt, u64a offset, const char from_mpv) { @@ -1439,6 +1843,7 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, som, end, prog_flags); + assert(programOffset != ROSE_INVALID_PROG_OFFSET); assert(programOffset >= sizeof(struct RoseEngine)); assert(programOffset < t->size); @@ -1481,6 +1886,8 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, DEBUG_PRINTF("delay until playback\n"); tctxt->groups |= ri->groups; work_done = 1; + recordAnchoredLiteralMatch(t, scratch, ri->anch_id, end); + assert(ri->done_jump); // must progress pc += ri->done_jump; continue; @@ -1492,7 +1899,9 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, if (end < ri->min_offset) { DEBUG_PRINTF("halt: before min_offset=%u\n", ri->min_offset); - return HWLM_CONTINUE_MATCHING; + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; } } PROGRAM_NEXT_INSTRUCTION @@ -1551,8 +1960,8 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(CHECK_LOOKAROUND) { - if (!roseCheckLookaround(t, scratch, ri->index, ri->count, - end)) { + if (!roseCheckLookaround(t, scratch, ri->look_index, + ri->reach_index, ri->count, end)) { DEBUG_PRINTF("failed lookaround check\n"); assert(ri->fail_jump); // must progress pc += ri->fail_jump; @@ -1672,8 +2081,8 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, } PROGRAM_NEXT_INSTRUCTION - PROGRAM_CASE(RECORD_ANCHORED) { - recordAnchoredLiteralMatch(t, scratch, ri->id, end); + PROGRAM_CASE(DUMMY_NOP) { + assert(0); } PROGRAM_NEXT_INSTRUCTION @@ -1792,14 +2201,14 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, PROGRAM_CASE(REPORT_SOM_INT) { updateSeqPoint(tctxt, end, from_mpv); - roseHandleSom(t, scratch, &ri->som, end); + roseHandleSom(scratch, &ri->som, end); work_done = 1; } PROGRAM_NEXT_INSTRUCTION PROGRAM_CASE(REPORT_SOM_AWARE) { updateSeqPoint(tctxt, end, from_mpv); - roseHandleSomSom(t, scratch, &ri->som, som, end); + roseHandleSomSom(scratch, &ri->som, som, end); work_done = 1; } PROGRAM_NEXT_INSTRUCTION @@ -2060,8 +2469,10 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, const char nocase = 0; if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, ri->lit_length, nocase)) { - DEBUG_PRINTF("halt: failed long lit check\n"); - return HWLM_CONTINUE_MATCHING; + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; } } PROGRAM_NEXT_INSTRUCTION @@ -2070,8 +2481,93 @@ hwlmcb_rv_t roseRunProgram_i(const struct RoseEngine *t, const char nocase = 1; if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, ri->lit_length, nocase)) { - DEBUG_PRINTF("halt: failed nocase long lit check\n"); - return HWLM_CONTINUE_MATCHING; + DEBUG_PRINTF("failed nocase long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT) { + const char nocase = 0; + if (!roseCheckMediumLiteral(t, 
scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CLEAR_WORK_DONE) { + DEBUG_PRINTF("clear work_done flag\n"); + work_done = 0; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MULTIPATH_LOOKAROUND) { + if (!roseMultipathLookaround(t, scratch, ri->look_index, + ri->reach_index, ri->count, + ri->last_start, ri->start_mask, + end)) { + DEBUG_PRINTF("failed multi-path lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_16x8) { + if (!roseCheckMultipathShufti16x8(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 16x8 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x8) { + if (!roseCheckMultipathShufti32x8(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 32x8 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x16) { + if (!roseCheckMultipathShufti32x16(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 32x16 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_64) { + if (!roseCheckMultipathShufti64(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 64 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + continue; } } PROGRAM_NEXT_INSTRUCTION diff --git a/src/rose/rose_build.h b/src/rose/rose_build.h index c71671fa0..cbb925f79 100644 --- a/src/rose/rose_build.h +++ b/src/rose/rose_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,8 +40,9 @@ #include "ue2common.h" #include "rose_common.h" #include "rose_in_graph.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/charreach.h" +#include "util/noncopyable.h" #include "util/ue2_containers.h" #include "util/ue2string.h" @@ -50,8 +51,6 @@ #include #include -#include - struct NFA; struct SmallWriteEngine; struct RoseEngine; @@ -80,7 +79,7 @@ class RoseDedupeAux { /** \brief Abstract interface intended for callers from elsewhere in the tree, * real underlying implementation is RoseBuildImpl in rose_build_impl.h. 
*/ -class RoseBuild : boost::noncopyable { +class RoseBuild : noncopyable { public: virtual ~RoseBuild(); @@ -88,8 +87,7 @@ class RoseBuild : boost::noncopyable { virtual void add(bool anchored, bool eod, const ue2_literal &lit, const ue2::flat_set &ids) = 0; - virtual bool addRose(const RoseInGraph &ig, bool prefilter, - bool finalChance = false) = 0; + virtual bool addRose(const RoseInGraph &ig, bool prefilter) = 0; virtual bool addSombeRose(const RoseInGraph &ig) = 0; virtual bool addOutfix(const NGHolder &h) = 0; @@ -115,7 +113,7 @@ class RoseBuild : boost::noncopyable { bool eod) = 0; /** \brief Construct a runtime implementation. */ - virtual ue2::aligned_unique_ptr buildRose(u32 minWidth) = 0; + virtual bytecode_ptr buildRose(u32 minWidth) = 0; virtual std::unique_ptr generateDedupeAux() const = 0; @@ -136,8 +134,6 @@ std::unique_ptr makeRoseBuilder(ReportManager &rm, bool roseCheckRose(const RoseInGraph &ig, bool prefilter, const ReportManager &rm, const CompileContext &cc); -size_t roseSize(const RoseEngine *t); - /* used by heuristics to determine the small write engine. High numbers are * intended to indicate a lightweight rose. */ u32 roseQuality(const RoseEngine *t); diff --git a/src/rose/rose_build_add.cpp b/src/rose/rose_build_add.cpp index 8b10bc7dc..4c895cafc 100644 --- a/src/rose/rose_build_add.cpp +++ b/src/rose/rose_build_add.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,7 +46,6 @@ #include "nfagraph/ng_region.h" #include "nfagraph/ng_repeat.h" #include "nfagraph/ng_reports.h" -#include "nfagraph/ng_rose.h" #include "nfagraph/ng_util.h" #include "nfagraph/ng_width.h" #include "util/charreach.h" @@ -57,6 +56,7 @@ #include "util/dump_charclass.h" #include "util/graph_range.h" #include "util/make_unique.h" +#include "util/noncopyable.h" #include "util/order_check.h" #include "util/report_manager.h" #include "util/ue2string.h" @@ -69,8 +69,6 @@ #include #include -#include - using namespace std; namespace ue2 { @@ -78,16 +76,13 @@ namespace ue2 { /** * \brief Data used by most of the construction code in this file. */ -struct RoseBuildData : boost::noncopyable { +struct RoseBuildData : noncopyable { RoseBuildData(const RoseInGraph &ig_in, bool som_in) : ig(ig_in), som(som_in) {} /** Input rose graph. */ const RoseInGraph &ig; - /** Mapping from engine graph to constructed DFA for pre-build DFAs. */ - ue2::unordered_map > early_dfas; - /** Edges we've transformed (in \ref transformAnchoredLiteralOverlap) which * require ANCH history to prevent overlap. 
*/ ue2::unordered_set anch_history_edges; @@ -281,8 +276,8 @@ void createVertices(RoseBuildImpl *tbi, if (prefix_graph) { g[w].left.graph = prefix_graph; - if (contains(bd.early_dfas, prefix_graph.get())) { - g[w].left.dfa = bd.early_dfas.at(prefix_graph.get()); + if (edge_props.dfa) { + g[w].left.dfa = edge_props.dfa; } g[w].left.haig = edge_props.haig; g[w].left.lag = prefix_lag; @@ -300,7 +295,7 @@ void createVertices(RoseBuildImpl *tbi, if (bd.som && !g[w].left.haig) { /* no prefix - som based on literal start */ assert(!prefix_graph); - g[w].som_adjust = tbi->literals.right.at(literalId).elength(); + g[w].som_adjust = tbi->literals.at(literalId).elength(); DEBUG_PRINTF("set som_adjust to %u\n", g[w].som_adjust); } @@ -338,7 +333,7 @@ void createVertices(RoseBuildImpl *tbi, u32 ghostId = tbi->literal_info[literalId].undelayed_id; DEBUG_PRINTF("creating delay ghost vertex, id=%u\n", ghostId); assert(ghostId != literalId); - assert(tbi->literals.right.at(ghostId).delay == 0); + assert(tbi->literals.at(ghostId).delay == 0); // Adjust offsets, removing delay. u32 ghost_min = min_offset, ghost_max = max_offset; @@ -726,7 +721,9 @@ void makeEodEventLeftfix(RoseBuildImpl &build, RoseVertex u, RoseEdge e = add_edge(v, w, g); g[e].minBound = 0; g[e].maxBound = 0; - g[e].history = ROSE_ROLE_HISTORY_LAST_BYTE; + /* No need to set history as the event is only delivered at the last + * byte anyway - no need to invalidate stale entries. */ + g[e].history = ROSE_ROLE_HISTORY_NONE; DEBUG_PRINTF("accept eod vertex (index=%zu)\n", g[w].index); } } @@ -769,9 +766,9 @@ void doRoseAcceptVertex(RoseBuildImpl *tbi, assert(!g[u].suffix); if (ig[iv].type == RIV_ACCEPT) { assert(!tbi->isAnyStart(u)); - if (contains(bd.early_dfas, edge_props.graph.get())) { + if (edge_props.dfa) { DEBUG_PRINTF("adding early dfa suffix to i%zu\n", g[u].index); - g[u].suffix.rdfa = bd.early_dfas.at(edge_props.graph.get()); + g[u].suffix.rdfa = edge_props.dfa; g[u].suffix.dfa_min_width = findMinWidth(*edge_props.graph); g[u].suffix.dfa_max_width = findMaxWidth(*edge_props.graph); } else if (edge_props.graph) { @@ -1033,18 +1030,9 @@ bool empty(const GraphT &g) { return vi == ve; } -/* We only try to implement as a dfa if a non-nullptr as_dfa is provided to return - * the raw dfa to. 
*/ static -bool canImplementGraph(RoseBuildImpl *tbi, const RoseInGraph &in, NGHolder &h, - const vector &edges, bool prefilter, - const ReportManager &rm, const CompileContext &cc, - bool finalChance, unique_ptr *as_dfa) { - assert(!edges.empty()); - assert(&*in[edges[0]].graph == &h); - - assert(h.kind == whatRoseIsThis(in, edges[0])); - +bool canImplementGraph(NGHolder &h, bool prefilter, const ReportManager &rm, + const CompileContext &cc) { if (isImplementableNFA(h, &rm, cc)) { return true; } @@ -1061,64 +1049,6 @@ bool canImplementGraph(RoseBuildImpl *tbi, const RoseInGraph &in, NGHolder &h, } } - if (as_dfa) { - switch (h.kind) { - case NFA_OUTFIX: /* 'prefix' of eod */ - case NFA_PREFIX: - if (!cc.grey.earlyMcClellanPrefix) { - return false; - } - break; - case NFA_INFIX: - if (!cc.grey.earlyMcClellanInfix) { - return false; - } - break; - case NFA_SUFFIX: - if (!cc.grey.earlyMcClellanSuffix) { - return false; - } - break; - case NFA_EAGER_PREFIX: - case NFA_REV_PREFIX: - case NFA_OUTFIX_RAW: - DEBUG_PRINTF("kind %u\n", (u32)h.kind); - assert(0); - } - assert(!*as_dfa); - assert(tbi); - vector > triggers; - u32 min_offset = ~0U; - u32 max_offset = 0; - for (const auto &e : edges) { - RoseInVertex s = source(e, in); - RoseInVertex t = target(e, in); - if (in[s].type == RIV_LITERAL) { - triggers.push_back(as_cr_seq(in[s].s)); - } - if (in[t].type == RIV_ACCEPT_EOD) { - /* TODO: support eod prefixes */ - return false; - } - ENSURE_AT_LEAST(&max_offset, in[s].max_offset); - LIMIT_TO_AT_MOST(&min_offset, in[s].min_offset); - } - - if (!generates_callbacks(h)) { - setReportId(h, tbi->getNewNfaReport()); - } - - bool single_trigger = min_offset == max_offset; - - DEBUG_PRINTF("trying for mcclellan (%u, %u)\n", min_offset, max_offset); - *as_dfa = buildMcClellan(h, &rm, single_trigger, triggers, cc.grey, - finalChance); - - if (*as_dfa) { - return true; - } - } - DEBUG_PRINTF("unable to build engine\n"); return false; } @@ -1573,8 +1503,7 @@ bool validateKinds(const RoseInGraph &g) { } #endif -bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, - bool finalChance) { +bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter) { DEBUG_PRINTF("trying to rose\n"); assert(validateKinds(ig)); assert(hasCorrectlyNumberedVertices(ig)); @@ -1601,11 +1530,14 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, for (const auto &e : edges_range(in)) { if (!in[e].graph) { + assert(!in[e].dfa); + assert(!in[e].haig); continue; // no graph } - if (in[e].haig) { - // Haigs are always implementable (we've already built the raw DFA). + if (in[e].haig || in[e].dfa) { + /* Early DFAs/Haigs are always implementable (we've already built + * the raw DFA). */ continue; } @@ -1623,17 +1555,10 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, vector graph_edges; for (auto h : ordered_graphs) { - const vector &h_edges = graphs.at(h); - unique_ptr as_dfa; - /* allow finalChance as fallback is basically an outfix at this point */ - if (!canImplementGraph(this, in, *h, h_edges, prefilter, rm, cc, - finalChance, &as_dfa)) { + if (!canImplementGraph(*h, prefilter, rm, cc)) { return false; } - if (as_dfa) { - bd.early_dfas[h] = move(as_dfa); - } - insert(&graph_edges, graph_edges.end(), h_edges); + insert(&graph_edges, graph_edges.end(), graphs[h]); } /* we are now past the point of no return. 
We can start making irreversible @@ -1647,9 +1572,8 @@ bool RoseBuildImpl::addRose(const RoseInGraph &ig, bool prefilter, assert(allMatchStatesHaveReports(h)); if (!generates_callbacks(whatRoseIsThis(in, e)) - && !contains(bd.early_dfas, &h) && in[target(e, in)].type != RIV_ACCEPT_EOD) { - setReportId(h, getNewNfaReport()); + set_report(h, getNewNfaReport()); } } @@ -1692,7 +1616,7 @@ bool roseCheckRose(const RoseInGraph &ig, bool prefilter, return false; } - map> graphs; + vector graphs; for (const auto &e : edges_range(ig)) { if (!ig[e].graph) { @@ -1704,12 +1628,11 @@ bool roseCheckRose(const RoseInGraph &ig, bool prefilter, continue; } - graphs[ig[e].graph.get()].push_back(e); + graphs.push_back(ig[e].graph.get()); } - for (const auto &m : graphs) { - if (!canImplementGraph(nullptr, ig, *m.first, m.second, prefilter, rm, - cc, false, nullptr)) { + for (const auto &g : graphs) { + if (!canImplementGraph(*g, prefilter, rm, cc)) { return false; } } @@ -1775,9 +1698,70 @@ void populateOutfixInfo(OutfixInfo &outfix, const NGHolder &h, populateReverseAccelerationInfo(outfix.rev_info, h); } +static +bool addEodOutfix(RoseBuildImpl &build, const NGHolder &h) { + map, ReportID> report_remap; + shared_ptr eod_leftfix + = makeRoseEodPrefix(h, build, report_remap); + + bool nfa_ok = isImplementableNFA(h, &build.rm, build.cc); + + /* TODO: check if early dfa is possible */ + + if (!nfa_ok) { + DEBUG_PRINTF("could not build as NFA\n"); + return false; + } + + u32 eod_event = getEodEventID(build); + + auto &g = build.g; + for (const auto &report_mapping : report_remap) { + RoseVertex v = add_vertex(g); + g[v].literals.insert(eod_event); + build.literal_info[eod_event].vertices.insert(v); + + g[v].left.graph = eod_leftfix; + g[v].left.leftfix_report = report_mapping.second; + g[v].left.lag = 0; + RoseEdge e1 = add_edge(build.anchored_root, v, g); + g[e1].minBound = 0; + g[e1].maxBound = ROSE_BOUND_INF; + g[v].min_offset = findMinWidth(*eod_leftfix); + g[v].max_offset = ROSE_BOUND_INF; + + depth max_width = findMaxWidth(*g[v].left.graph); + if (max_width.is_finite() && isPureAnchored(*eod_leftfix)) { + g[e1].maxBound = max_width; + g[v].max_offset = max_width; + } + + g[e1].history = ROSE_ROLE_HISTORY_NONE; // handled by prefix + RoseVertex w = add_vertex(g); + g[w].eod_accept = true; + g[w].reports = report_mapping.first; + g[w].min_offset = g[v].min_offset; + g[w].max_offset = g[v].max_offset; + RoseEdge e = add_edge(v, w, g); + g[e].minBound = 0; + g[e].maxBound = 0; + g[e].history = ROSE_ROLE_HISTORY_NONE; + DEBUG_PRINTF("accept eod vertex (index=%zu)\n", g[w].index); + } + + return true; +} + bool RoseBuildImpl::addOutfix(const NGHolder &h) { DEBUG_PRINTF("%zu vertices, %zu edges\n", num_vertices(h), num_edges(h)); + /* TODO: handle more than one report */ + if (!in_degree(h.accept, h) + && all_reports(h).size() == 1 + && addEodOutfix(*this, h)) { + return true; + } + const u32 nfa_states = isImplementableNFA(h, &rm, cc); if (nfa_states) { DEBUG_PRINTF("implementable as an NFA in %u states\n", nfa_states); @@ -1923,16 +1907,20 @@ void removeAddedLiterals(RoseBuildImpl &tbi, const flat_set &lit_ids) { return; } + DEBUG_PRINTF("remove last %zu literals\n", lit_ids.size()); + // lit_ids should be a contiguous range. 
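+    // (For example, with a 10-entry literal table a valid lit_ids set is
+    // {7, 8, 9}: the asserts below check that the ids are contiguous and sit
+    // at the very end of the table before erase_back() trims them off.)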
assert(lit_ids.size() == *lit_ids.rbegin() - *lit_ids.begin() + 1); + assert(*lit_ids.rbegin() == tbi.literals.size() - 1); - for (const u32 &lit_id : lit_ids) { - assert(lit_id < tbi.literal_info.size()); - assert(tbi.literals.right.at(lit_id).table == ROSE_ANCHORED); - assert(tbi.literal_info[lit_id].vertices.empty()); + assert(all_of_in(lit_ids, [&](u32 lit_id) { + return lit_id < tbi.literal_info.size() && + tbi.literals.at(lit_id).table == ROSE_ANCHORED && + tbi.literal_info[lit_id].vertices.empty(); + })); - tbi.literals.right.erase(lit_id); - } + tbi.literals.erase_back(lit_ids.size()); + assert(tbi.literals.size() == *lit_ids.begin()); // lit_ids should be at the end of tbi.literal_info. assert(tbi.literal_info.size() == *lit_ids.rbegin() + 1); @@ -1940,8 +1928,7 @@ void removeAddedLiterals(RoseBuildImpl &tbi, const flat_set &lit_ids) { } bool RoseBuildImpl::addAnchoredAcyclic(const NGHolder &h) { - vector vertexDepths; - calcDepthsFrom(h, h.start, vertexDepths); + auto vertexDepths = calcDepthsFrom(h, h.start); map > reportMap; /* NFAVertex -> literal ids */ map depthMap; /* literal id -> min/max depth */ diff --git a/src/rose/rose_build_add_mask.cpp b/src/rose/rose_build_add_mask.cpp index de3bdf0a3..bd8eed0c0 100644 --- a/src/rose/rose_build_add_mask.cpp +++ b/src/rose/rose_build_add_mask.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -480,7 +480,7 @@ void addTransientMask(RoseBuildImpl &build, const vector &mask, // Everyone gets the same report ID. ReportID mask_report = build.getNewNfaReport(); - setReportId(*mask_graph, mask_report); + set_report(*mask_graph, mask_report); // Build the HWLM literal mask. vector msk, cmp; diff --git a/src/rose/rose_build_anchored.cpp b/src/rose/rose_build_anchored.cpp index 3d0affc6b..a2af160e4 100644 --- a/src/rose/rose_build_anchored.cpp +++ b/src/rose/rose_build_anchored.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ #include "grey.h" #include "rose_build_impl.h" +#include "rose_build_matchers.h" #include "rose_internal.h" #include "ue2common.h" #include "nfa/dfa_min.h" @@ -71,6 +72,8 @@ namespace ue2 { #define INIT_STATE (DEAD_STATE + 1) +#define NO_FRAG_ID (~0U) + // Adds a vertex with the given reach. static NFAVertex add_vertex(NGHolder &h, const CharReach &cr) { @@ -173,7 +176,7 @@ void mergeAnchoredDfas(vector> &dfas, } static -void remapAnchoredReports(raw_dfa &rdfa, const RoseBuildImpl &build) { +void remapAnchoredReports(raw_dfa &rdfa, const vector &frag_map) { for (dstate &ds : rdfa.states) { assert(ds.reports_eod.empty()); // Not used in anchored matcher. 
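+        // ds.reports still holds rose graph literal ids at this point; each
+        // id is rewritten below to the id of the fragment owning the literal.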
if (ds.reports.empty()) { @@ -182,33 +185,49 @@ void remapAnchoredReports(raw_dfa &rdfa, const RoseBuildImpl &build) { flat_set new_reports; for (auto id : ds.reports) { - assert(id < build.literal_info.size()); - new_reports.insert(build.literal_info.at(id).final_id); + assert(id < frag_map.size()); + new_reports.insert(frag_map[id]); } - ds.reports = move(new_reports); + ds.reports = std::move(new_reports); } } /** * \brief Replaces the report ids currently in the dfas (rose graph literal - * ids) with the final id for each literal. + * ids) with the fragment id for each literal. */ static -void remapAnchoredReports(RoseBuildImpl &build) { +void remapAnchoredReports(RoseBuildImpl &build, const vector &frag_map) { for (auto &m : build.anchored_nfas) { for (auto &rdfa : m.second) { assert(rdfa); - remapAnchoredReports(*rdfa, build); + remapAnchoredReports(*rdfa, frag_map); } } } +/** + * Returns mapping from literal ids to fragment ids. + */ +static +vector reverseFragMap(const RoseBuildImpl &build, + const vector &fragments) { + vector rev(build.literal_info.size(), NO_FRAG_ID); + for (const auto &f : fragments) { + for (u32 lit_id : f.lit_ids) { + assert(lit_id < rev.size()); + rev[lit_id] = f.fragment_id; + } + } + return rev; +} + /** * \brief Replace the reports (which are literal final_ids) in the given * raw_dfa with program offsets. */ static -void remapIdsToPrograms(raw_dfa &rdfa, const vector &litPrograms) { +void remapIdsToPrograms(const vector &fragments, raw_dfa &rdfa) { for (dstate &ds : rdfa.states) { assert(ds.reports_eod.empty()); // Not used in anchored matcher. if (ds.reports.empty()) { @@ -216,25 +235,27 @@ void remapIdsToPrograms(raw_dfa &rdfa, const vector &litPrograms) { } flat_set new_reports; - for (auto id : ds.reports) { - assert(id < litPrograms.size()); - new_reports.insert(litPrograms.at(id)); + for (auto fragment_id : ds.reports) { + const auto &frag = fragments.at(fragment_id); + new_reports.insert(frag.lit_program_offset); } - ds.reports = move(new_reports); + ds.reports = std::move(new_reports); } } static -void populate_holder(const simple_anchored_info &sai, const set &exit_ids, - NGHolder *h_in) { +unique_ptr populate_holder(const simple_anchored_info &sai, + const flat_set &exit_ids) { DEBUG_PRINTF("populating holder for ^.{%u,%u}%s\n", sai.min_bound, sai.max_bound, dumpString(sai.literal).c_str()); - NGHolder &h = *h_in; - set ends = addDotsToGraph(h, h.start, sai.min_bound, - sai.max_bound, CharReach::dot()); + auto h_ptr = make_unique(); + NGHolder &h = *h_ptr; + auto ends = addDotsToGraph(h, h.start, sai.min_bound, sai.max_bound, + CharReach::dot()); NFAVertex v = addToGraph(h, ends, sai.literal); add_edge(v, h.accept, h); h[v].reports.insert(exit_ids.begin(), exit_ids.end()); + return h_ptr; } u32 anchoredStateSize(const anchored_matcher_info &atable) { @@ -467,7 +488,7 @@ bool check_dupe(const raw_dfa &rdfa, } static -bool check_dupe_simple(const RoseBuildImpl &tbi, u32 min_bound, u32 max_bound, +bool check_dupe_simple(const RoseBuildImpl &build, u32 min_bound, u32 max_bound, const ue2_literal &lit, ReportID *remap) { if (!remap) { DEBUG_PRINTF("no remap\n"); @@ -475,8 +496,8 @@ bool check_dupe_simple(const RoseBuildImpl &tbi, u32 min_bound, u32 max_bound, } simple_anchored_info sai(min_bound, max_bound, lit); - if (contains(tbi.anchored_simple, sai)) { - *remap = *tbi.anchored_simple.at(sai).begin(); + if (contains(build.anchored_simple, sai)) { + *remap = *build.anchored_simple.at(sai).begin(); return true; } @@ -640,7 +661,7 @@ bool 
isSimple(const NGHolder &h, u32 *min_bound, u32 *max_bound, } static -int finalise_out(RoseBuildImpl &tbi, const NGHolder &h, +int finalise_out(RoseBuildImpl &build, const NGHolder &h, const Automaton_Holder &autom, unique_ptr out_dfa, ReportID *remap) { u32 min_bound = ~0U; @@ -649,12 +670,12 @@ int finalise_out(RoseBuildImpl &tbi, const NGHolder &h, u32 simple_report = MO_INVALID_IDX; if (isSimple(h, &min_bound, &max_bound, &lit, &simple_report)) { assert(simple_report != MO_INVALID_IDX); - if (check_dupe_simple(tbi, min_bound, max_bound, lit, remap)) { + if (check_dupe_simple(build, min_bound, max_bound, lit, remap)) { DEBUG_PRINTF("found duplicate remapping to %u\n", *remap); return ANCHORED_REMAP; } DEBUG_PRINTF("add with report %u\n", simple_report); - tbi.anchored_simple[simple_anchored_info(min_bound, max_bound, lit)] + build.anchored_simple[simple_anchored_info(min_bound, max_bound, lit)] .insert(simple_report); return ANCHORED_SUCCESS; } @@ -664,15 +685,15 @@ int finalise_out(RoseBuildImpl &tbi, const NGHolder &h, out_dfa->alpha_size = autom.alphasize; out_dfa->alpha_remap = autom.alpha; auto hash = hash_dfa_no_reports(*out_dfa); - if (check_dupe(*out_dfa, tbi.anchored_nfas[hash], remap)) { + if (check_dupe(*out_dfa, build.anchored_nfas[hash], remap)) { return ANCHORED_REMAP; } - tbi.anchored_nfas[hash].push_back(move(out_dfa)); + build.anchored_nfas[hash].push_back(move(out_dfa)); return ANCHORED_SUCCESS; } static -int addAutomaton(RoseBuildImpl &tbi, const NGHolder &h, ReportID *remap) { +int addAutomaton(RoseBuildImpl &build, const NGHolder &h, ReportID *remap) { if (num_vertices(h) > ANCHORED_NFA_STATE_LIMIT) { DEBUG_PRINTF("autom bad!\n"); return ANCHORED_FAIL; @@ -682,7 +703,7 @@ int addAutomaton(RoseBuildImpl &tbi, const NGHolder &h, ReportID *remap) { unique_ptr out_dfa = ue2::make_unique(NFA_OUTFIX_RAW); if (!determinise(autom, out_dfa->states, MAX_DFA_STATES)) { - return finalise_out(tbi, h, autom, move(out_dfa), remap); + return finalise_out(build, h, autom, move(out_dfa), remap); } DEBUG_PRINTF("determinise failed\n"); @@ -700,7 +721,7 @@ void setReports(NGHolder &h, const map> &reportMap, } } -int addAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &wrapper, +int addAnchoredNFA(RoseBuildImpl &build, const NGHolder &wrapper, const map> &reportMap) { NGHolder h; ue2::unordered_map orig_to_copy; @@ -711,10 +732,10 @@ int addAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &wrapper, clearReports(h); setReports(h, reportMap, orig_to_copy); - return addAutomaton(tbi, h, nullptr); + return addAutomaton(build, h, nullptr); } -int addToAnchoredMatcher(RoseBuildImpl &tbi, const NGHolder &anchored, +int addToAnchoredMatcher(RoseBuildImpl &build, const NGHolder &anchored, u32 exit_id, ReportID *remap) { NGHolder h; cloneHolder(h, anchored); @@ -725,23 +746,24 @@ int addToAnchoredMatcher(RoseBuildImpl &tbi, const NGHolder &anchored, h[v].reports.insert(exit_id); } - return addAutomaton(tbi, h, remap); + return addAutomaton(build, h, remap); } static -void buildSimpleDfas(const RoseBuildImpl &tbi, +void buildSimpleDfas(const RoseBuildImpl &build, const vector &frag_map, vector> *anchored_dfas) { /* we should have determinised all of these before so there should be no * chance of failure. 
*/ - for (const auto &simple : tbi.anchored_simple) { - set exit_ids; + flat_set exit_ids; + for (const auto &simple : build.anchored_simple) { + exit_ids.clear(); for (auto lit_id : simple.second) { - exit_ids.insert(tbi.literal_info[lit_id].final_id); + assert(lit_id < frag_map.size()); + exit_ids.insert(frag_map[lit_id]); } - NGHolder h; - populate_holder(simple.first, exit_ids, &h); - Automaton_Holder autom(h); - unique_ptr rdfa = ue2::make_unique(NFA_OUTFIX_RAW); + auto h = populate_holder(simple.first, exit_ids); + Automaton_Holder autom(*h); + auto rdfa = ue2::make_unique(NFA_OUTFIX_RAW); UNUSED int rv = determinise(autom, rdfa->states, MAX_DFA_STATES); assert(!rv); rdfa->start_anchored = INIT_STATE; @@ -758,7 +780,8 @@ void buildSimpleDfas(const RoseBuildImpl &tbi, * from RoseBuildImpl. */ static -vector> getAnchoredDfas(RoseBuildImpl &build) { +vector> getAnchoredDfas(RoseBuildImpl &build, + const vector &frag_map) { vector> dfas; // DFAs that already exist as raw_dfas. @@ -771,7 +794,7 @@ vector> getAnchoredDfas(RoseBuildImpl &build) { // DFAs we currently have as simple literals. if (!build.anchored_simple.empty()) { - buildSimpleDfas(build, &dfas); + buildSimpleDfas(build, frag_map, &dfas); build.anchored_simple.clear(); } @@ -790,7 +813,7 @@ vector> getAnchoredDfas(RoseBuildImpl &build) { */ static size_t buildNfas(vector &anchored_dfas, - vector> *nfas, + vector> *nfas, vector *start_offset, const CompileContext &cc, const ReportManager &rm) { const size_t num_dfas = anchored_dfas.size(); @@ -806,7 +829,7 @@ size_t buildNfas(vector &anchored_dfas, minimize_hopcroft(rdfa, cc.grey); - auto nfa = mcclellanCompile(rdfa, cc, rm); + auto nfa = mcclellanCompile(rdfa, cc, rm, false); if (!nfa) { assert(0); throw std::bad_alloc(); @@ -823,7 +846,8 @@ size_t buildNfas(vector &anchored_dfas, return total_size; } -vector buildAnchoredDfas(RoseBuildImpl &build) { +vector buildAnchoredDfas(RoseBuildImpl &build, + const vector &fragments) { vector dfas; if (build.anchored_nfas.empty() && build.anchored_simple.empty()) { @@ -831,9 +855,10 @@ vector buildAnchoredDfas(RoseBuildImpl &build) { return dfas; } - remapAnchoredReports(build); + const auto frag_map = reverseFragMap(build, fragments); + remapAnchoredReports(build, frag_map); - auto anch_dfas = getAnchoredDfas(build); + auto anch_dfas = getAnchoredDfas(build, frag_map); mergeAnchoredDfas(anch_dfas, build); dfas.reserve(anch_dfas.size()); @@ -844,22 +869,21 @@ vector buildAnchoredDfas(RoseBuildImpl &build) { return dfas; } -aligned_unique_ptr -buildAnchoredMatcher(RoseBuildImpl &build, vector &dfas, - const vector &litPrograms, size_t *asize) { +bytecode_ptr +buildAnchoredMatcher(RoseBuildImpl &build, const vector &fragments, + vector &dfas) { const CompileContext &cc = build.cc; if (dfas.empty()) { DEBUG_PRINTF("empty\n"); - *asize = 0; return nullptr; } for (auto &rdfa : dfas) { - remapIdsToPrograms(rdfa, litPrograms); + remapIdsToPrograms(fragments, rdfa); } - vector> nfas; + vector> nfas; vector start_offset; // start offset for each dfa (dots removed) size_t total_size = buildNfas(dfas, &nfas, &start_offset, cc, build.rm); @@ -867,8 +891,8 @@ buildAnchoredMatcher(RoseBuildImpl &build, vector &dfas, throw ResourceLimitError(); } - *asize = total_size; - auto atable = aligned_zmalloc_unique(total_size); + auto atable = + make_zeroed_bytecode_ptr(total_size, 64); char *curr = (char *)atable.get(); u32 state_offset = 0; @@ -894,7 +918,7 @@ buildAnchoredMatcher(RoseBuildImpl &build, vector &dfas, ami->anchoredMinDistance = 
start_offset[i]; } - DEBUG_PRINTF("success %zu\n", *asize); + DEBUG_PRINTF("success %zu\n", atable.size()); return atable; } diff --git a/src/rose/rose_build_anchored.h b/src/rose/rose_build_anchored.h index ef06fcbbe..37d268ac5 100644 --- a/src/rose/rose_build_anchored.h +++ b/src/rose/rose_build_anchored.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,9 +30,9 @@ #define ROSE_BUILD_ANCHORED #include "ue2common.h" -#include "rose_build.h" +#include "rose_build_impl.h" #include "nfagraph/ng_holder.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include #include @@ -44,11 +44,13 @@ namespace ue2 { class RoseBuildImpl; struct raw_dfa; +struct LitFragment; /** * \brief Construct a set of anchored DFAs from our anchored literals/engines. */ -std::vector buildAnchoredDfas(RoseBuildImpl &build); +std::vector buildAnchoredDfas(RoseBuildImpl &build, + const std::vector &fragments); /** * \brief Construct an anchored_matcher_info runtime structure from the given @@ -57,9 +59,10 @@ std::vector buildAnchoredDfas(RoseBuildImpl &build); * Remap the literal final_ids used for raw_dfa reports to the program offsets * given in litPrograms. */ -aligned_unique_ptr -buildAnchoredMatcher(RoseBuildImpl &build, std::vector &dfas, - const std::vector &litPrograms, size_t *asize); +bytecode_ptr +buildAnchoredMatcher(RoseBuildImpl &build, + const std::vector &fragments, + std::vector &dfas); u32 anchoredStateSize(const anchored_matcher_info &atable); @@ -67,10 +70,10 @@ u32 anchoredStateSize(const anchored_matcher_info &atable); #define ANCHORED_SUCCESS 1 #define ANCHORED_REMAP 2 -int addAnchoredNFA(RoseBuildImpl &tbi, const NGHolder &wrapper, +int addAnchoredNFA(RoseBuildImpl &build, const NGHolder &wrapper, const std::map> &reportMap); -int addToAnchoredMatcher(RoseBuildImpl &tbi, const NGHolder &anchored, +int addToAnchoredMatcher(RoseBuildImpl &build, const NGHolder &anchored, u32 exit_id, ReportID *remap); } // namespace ue2 diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index 9f4abcadf..4d0793bfe 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "hs_compile.h" // for HS_MODE_* #include "rose_build_add_internal.h" #include "rose_build_anchored.h" +#include "rose_build_dump.h" #include "rose_build_engine_blob.h" #include "rose_build_exclusive.h" #include "rose_build_groups.h" @@ -41,6 +42,7 @@ #include "rose_build_lookaround.h" #include "rose_build_matchers.h" #include "rose_build_program.h" +#include "rose_build_resources.h" #include "rose_build_scatter.h" #include "rose_build_util.h" #include "rose_build_width.h" @@ -73,7 +75,6 @@ #include "nfagraph/ng_width.h" #include "smallwrite/smallwrite_build.h" #include "som/slot_manager.h" -#include "util/alloc.h" #include "util/bitutils.h" #include "util/boundary_reports.h" #include "util/charreach.h" @@ -85,6 +86,7 @@ #include "util/graph_range.h" #include "util/make_unique.h" #include "util/multibit_build.h" +#include "util/noncopyable.h" #include "util/order_check.h" #include 
"util/popcount.h" #include "util/queue_index_factory.h" @@ -97,6 +99,7 @@ #include #include #include +#include #include #include #include @@ -129,140 +132,66 @@ namespace ue2 { namespace /* anon */ { -struct left_build_info { - // Constructor for an engine implementation. - left_build_info(u32 q, u32 l, u32 t, rose_group sm, - const std::vector &stops, u32 max_ql, u8 cm_count, - const CharReach &cm_cr) - : queue(q), lag(l), transient(t), squash_mask(sm), stopAlphabet(stops), - max_queuelen(max_ql), countingMiracleCount(cm_count), - countingMiracleReach(cm_cr) {} - - // Constructor for a lookaround implementation. - explicit left_build_info(const vector &look) - : has_lookaround(true), lookaround(look) {} - - u32 queue = 0; /* uniquely idents the left_build_info */ - u32 lag = 0; - u32 transient = 0; - rose_group squash_mask = ~rose_group{0}; - vector stopAlphabet; - u32 max_queuelen = 0; - u8 countingMiracleCount = 0; - CharReach countingMiracleReach; - u32 countingMiracleOffset = 0; /* populated later when laying out bytecode */ - bool has_lookaround = false; - vector lookaround; // alternative implementation to the NFA -}; - -/** - * \brief Structure tracking which resources are used by this Rose instance at - * runtime. - * - * We use this to control how much initialisation we need to do at the - * beginning of a stream/block at runtime. - */ -struct RoseResources { - bool has_outfixes = false; - bool has_suffixes = false; - bool has_leftfixes = false; - bool has_literals = false; - bool has_states = false; - bool checks_groups = false; - bool has_lit_delay = false; - bool has_lit_check = false; // long literal support - bool has_anchored = false; - bool has_eod = false; -}; - -struct build_context : boost::noncopyable { +struct build_context : noncopyable { /** \brief information about engines to the left of a vertex */ map leftfix_info; /** \brief mapping from suffix to queue index. */ map suffixes; - /** \brief Mapping from vertex to key, for vertices with a - * CHECK_NOT_HANDLED instruction. */ - ue2::unordered_map handledKeys; - - /** \brief Number of roles with a state bit. - * - * This is set by assignStateIndices() and should be constant throughout - * the rest of the compile. - */ - size_t numStates = 0; + /** \brief engine info by queue. */ + map engine_info_by_queue; /** \brief Simple cache of programs written to engine blob, used for * deduplication. */ ue2::unordered_map program_cache; - /** \brief LookEntry list cache, so that we don't have to go scanning - * through the full list to find cases we've used already. */ - ue2::unordered_map, size_t> lookaround_cache; - - /** \brief Lookaround table for Rose roles. */ - vector lookaround; - - /** \brief State indices, for those roles that have them. */ - ue2::unordered_map roleStateIndices; + /** \brief State indices, for those roles that have them. + * Each vertex present has a unique state index in the range + * [0, roleStateIndices.size()). */ + unordered_map roleStateIndices; /** \brief Mapping from queue index to bytecode offset for built engines * that have already been pushed into the engine_blob. */ ue2::unordered_map engineOffsets; - /** \brief Literal programs, indexed by final_id, after they have been - * written to the engine_blob. */ - vector litPrograms; - - /** \brief List of long literals (ones with CHECK_LITERAL instructions) + /** \brief List of long literals (ones with CHECK_LONG_LIT instructions) * that need hash table support. 
*/ vector longLiterals; - /** \brief Minimum offset of a match from the floating table. */ - u32 floatingMinLiteralMatchOffset = 0; - - /** \brief Long literal length threshold, used in streaming mode. */ - size_t longLitLengthThreshold = 0; - /** \brief Contents of the Rose bytecode immediately following the * RoseEngine. */ RoseEngineBlob engine_blob; - /** \brief True if reports need CATCH_UP instructions, to catch up anchored - * matches, suffixes, outfixes etc. */ - bool needs_catchup = false; - /** \brief True if this Rose engine has an MPV engine. */ bool needs_mpv_catchup = false; /** \brief Resources in use (tracked as programs are added). */ RoseResources resources; +}; - /** \brief Mapping from every vertex to the groups that must be on for that - * vertex to be reached. */ - ue2::unordered_map vertex_group_map; - - /** \brief Global bitmap of groups that can be squashed. */ - rose_group squashable_groups = 0; +/** \brief subengine info including built engine and +* corresponding triggering rose vertices */ +struct ExclusiveSubengine { + bytecode_ptr nfa; + vector vertices; }; -} +/** \brief exclusive info to build tamarama */ +struct ExclusiveInfo : noncopyable { + // subengine info + vector subengines; + // all the report in tamarama + set reports; + // assigned queue id + u32 queue; +}; -static -const NFA *get_nfa_from_blob(const build_context &bc, u32 qi) { - assert(contains(bc.engineOffsets, qi)); - u32 nfa_offset = bc.engineOffsets.at(qi); - assert(nfa_offset >= bc.engine_blob.base_offset); - const NFA *n = (const NFA *)(bc.engine_blob.data() + nfa_offset - - bc.engine_blob.base_offset); - assert(n->queueIndex == qi); - return n; } static -const NFA *add_nfa_to_blob(build_context &bc, NFA &nfa) { +void add_nfa_to_blob(build_context &bc, NFA &nfa) { u32 qi = nfa.queueIndex; u32 nfa_offset = bc.engine_blob.add(nfa, nfa.length); DEBUG_PRINTF("added nfa qi=%u, type=%u, length=%u at offset=%u\n", qi, @@ -270,10 +199,6 @@ const NFA *add_nfa_to_blob(build_context &bc, NFA &nfa) { assert(!contains(bc.engineOffsets, qi)); bc.engineOffsets.emplace(qi, nfa_offset); - - const NFA *n = get_nfa_from_blob(bc, qi); - assert(memcmp(&nfa, n, nfa.length) == 0); - return n; } static @@ -288,38 +213,39 @@ u32 countRosePrefixes(const vector &roses) { } /** - * \brief True if this Rose engine needs to run a catch up whenever a report is - * generated. + * \brief True if this Rose engine needs to run a catch up whenever a literal + * report is generated. * * Catch up is necessary if there are output-exposed engines (suffixes, - * outfixes) or an anchored table (anchored literals, acyclic DFAs). + * outfixes). */ static -bool needsCatchup(const RoseBuildImpl &build, - const vector &anchored_dfas) { +bool needsCatchup(const RoseBuildImpl &build) { + /* Note: we could be more selective about when we need to generate catch up + * instructions rather than just a boolean yes/no - for instance, if we know + * that a role can only match before the point that an outfix/suffix could + * match, we do not strictly need a catchup instruction. + * + * However, this would add a certain amount of complexity to the + * catchup logic and would likely have limited applicability - how many + * reporting roles have a fixed max offset and how much time is spent on + * catchup for these cases? 
+ */ + if (!build.outfixes.empty()) { + /* TODO: check that they have non-eod reports */ DEBUG_PRINTF("has outfixes\n"); return true; } - if (!anchored_dfas.empty()) { - DEBUG_PRINTF("has anchored dfas\n"); - return true; - } const RoseGraph &g = build.g; for (auto v : vertices_range(g)) { - if (build.root == v) { - continue; - } - if (build.anchored_root == v) { - continue; - } if (g[v].suffix) { + /* TODO: check that they have non-eod reports */ DEBUG_PRINTF("vertex %zu has suffix\n", g[v].index); return true; } - } DEBUG_PRINTF("no need for catch-up on report\n"); @@ -328,6 +254,11 @@ bool needsCatchup(const RoseBuildImpl &build, static bool isPureFloating(const RoseResources &resources, const CompileContext &cc) { + if (!resources.has_floating) { + DEBUG_PRINTF("no floating table\n"); + return false; + } + if (resources.has_outfixes || resources.has_suffixes || resources.has_leftfixes) { DEBUG_PRINTF("has engines\n"); @@ -355,8 +286,8 @@ bool isPureFloating(const RoseResources &resources, const CompileContext &cc) { } if (cc.streaming && resources.has_lit_check) { - DEBUG_PRINTF("has long literals in streaming mode, which needs " - "long literal table support\n"); + DEBUG_PRINTF("has long literals in streaming mode, which needs long " + "literal table support\n"); return false; } @@ -394,20 +325,21 @@ bool isSingleOutfix(const RoseBuildImpl &tbi) { } static -u8 pickRuntimeImpl(const RoseBuildImpl &build, const build_context &bc, +u8 pickRuntimeImpl(const RoseBuildImpl &build, const RoseResources &resources, UNUSED u32 outfixEndQueue) { - DEBUG_PRINTF("has_outfixes=%d\n", bc.resources.has_outfixes); - DEBUG_PRINTF("has_suffixes=%d\n", bc.resources.has_suffixes); - DEBUG_PRINTF("has_leftfixes=%d\n", bc.resources.has_leftfixes); - DEBUG_PRINTF("has_literals=%d\n", bc.resources.has_literals); - DEBUG_PRINTF("has_states=%d\n", bc.resources.has_states); - DEBUG_PRINTF("checks_groups=%d\n", bc.resources.checks_groups); - DEBUG_PRINTF("has_lit_delay=%d\n", bc.resources.has_lit_delay); - DEBUG_PRINTF("has_lit_check=%d\n", bc.resources.has_lit_check); - DEBUG_PRINTF("has_anchored=%d\n", bc.resources.has_anchored); - DEBUG_PRINTF("has_eod=%d\n", bc.resources.has_eod); - - if (isPureFloating(bc.resources, build.cc)) { + DEBUG_PRINTF("has_outfixes=%d\n", resources.has_outfixes); + DEBUG_PRINTF("has_suffixes=%d\n", resources.has_suffixes); + DEBUG_PRINTF("has_leftfixes=%d\n", resources.has_leftfixes); + DEBUG_PRINTF("has_literals=%d\n", resources.has_literals); + DEBUG_PRINTF("has_states=%d\n", resources.has_states); + DEBUG_PRINTF("checks_groups=%d\n", resources.checks_groups); + DEBUG_PRINTF("has_lit_delay=%d\n", resources.has_lit_delay); + DEBUG_PRINTF("has_lit_check=%d\n", resources.has_lit_check); + DEBUG_PRINTF("has_anchored=%d\n", resources.has_anchored); + DEBUG_PRINTF("has_floating=%d\n", resources.has_floating); + DEBUG_PRINTF("has_eod=%d\n", resources.has_eod); + + if (isPureFloating(resources, build.cc)) { return ROSE_RUNTIME_PURE_LITERAL; } @@ -444,7 +376,7 @@ bool needsMpvCatchup(const RoseBuildImpl &build) { } static -void fillStateOffsets(const RoseBuildImpl &tbi, u32 rolesWithStateCount, +void fillStateOffsets(const RoseBuildImpl &build, u32 rolesWithStateCount, u32 anchorStateSize, u32 activeArrayCount, u32 activeLeftCount, u32 laggedRoseCount, u32 longLitStreamStateRequired, u32 historyRequired, @@ -476,7 +408,7 @@ void fillStateOffsets(const RoseBuildImpl &tbi, u32 rolesWithStateCount, curr_offset += anchorStateSize; so->groups = curr_offset; - so->groups_size = (tbi.group_end + 
7) / 8; + so->groups_size = (build.group_end + 7) / 8; assert(so->groups_size <= sizeof(u64a)); curr_offset += so->groups_size; @@ -486,22 +418,22 @@ void fillStateOffsets(const RoseBuildImpl &tbi, u32 rolesWithStateCount, // Exhaustion multibit. so->exhausted = curr_offset; - curr_offset += mmbit_size(tbi.rm.numEkeys()); + curr_offset += mmbit_size(build.rm.numEkeys()); // SOM locations and valid/writeable multibit structures. - if (tbi.ssm.numSomSlots()) { - const u32 somWidth = tbi.ssm.somPrecision(); + if (build.ssm.numSomSlots()) { + const u32 somWidth = build.ssm.somPrecision(); if (somWidth) { // somWidth is zero in block mode. curr_offset = ROUNDUP_N(curr_offset, somWidth); so->somLocation = curr_offset; - curr_offset += tbi.ssm.numSomSlots() * somWidth; + curr_offset += build.ssm.numSomSlots() * somWidth; } else { so->somLocation = 0; } so->somValid = curr_offset; - curr_offset += mmbit_size(tbi.ssm.numSomSlots()); + curr_offset += mmbit_size(build.ssm.numSomSlots()); so->somWritable = curr_offset; - curr_offset += mmbit_size(tbi.ssm.numSomSlots()); + curr_offset += mmbit_size(build.ssm.numSomSlots()); } else { // No SOM handling, avoid growing the stream state any further. so->somLocation = 0; @@ -515,7 +447,10 @@ void fillStateOffsets(const RoseBuildImpl &tbi, u32 rolesWithStateCount, // Get the mask of initial vertices due to root and anchored_root. rose_group RoseBuildImpl::getInitialGroups() const { - rose_group groups = getSuccGroups(root) | getSuccGroups(anchored_root); + rose_group groups = getSuccGroups(root) + | getSuccGroups(anchored_root) + | boundary_group_mask; + DEBUG_PRINTF("initial groups = %016llx\n", groups); return groups; } @@ -599,8 +534,8 @@ void findFixedDepthTops(const RoseGraph &g, const set &triggers, * engine. */ static -aligned_unique_ptr pickImpl(aligned_unique_ptr dfa_impl, - aligned_unique_ptr nfa_impl) { +bytecode_ptr pickImpl(bytecode_ptr dfa_impl, + bytecode_ptr nfa_impl) { assert(nfa_impl); assert(dfa_impl); assert(isDfaType(dfa_impl->type)); @@ -652,7 +587,7 @@ aligned_unique_ptr pickImpl(aligned_unique_ptr dfa_impl, * otherwise a Castle. */ static -aligned_unique_ptr +bytecode_ptr buildRepeatEngine(const CastleProto &proto, const map>> &triggers, const CompileContext &cc, const ReportManager &rm) { @@ -668,11 +603,10 @@ buildRepeatEngine(const CastleProto &proto, } static -aligned_unique_ptr getDfa(raw_dfa &rdfa, bool is_transient, - const CompileContext &cc, - const ReportManager &rm) { +bytecode_ptr getDfa(raw_dfa &rdfa, bool is_transient, + const CompileContext &cc, const ReportManager &rm) { // Unleash the Sheng!! - auto dfa = shengCompile(rdfa, cc, rm); + auto dfa = shengCompile(rdfa, cc, rm, false); if (!dfa && !is_transient) { // Sheng wasn't successful, so unleash McClellan! /* We don't try the hybrid for transient prefixes due to the extra @@ -681,14 +615,14 @@ aligned_unique_ptr getDfa(raw_dfa &rdfa, bool is_transient, } if (!dfa) { // Sheng wasn't successful, so unleash McClellan! 
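The trailing comment above introduces the final fallback in `getDfa()`. A minimal, self-contained sketch of the three-stage fallback chain, with stub functions standing in for `shengCompile`, the hybrid compiler and `mcclellanCompile` (the middle hybrid step is an assumption based on the surrounding comment about transient prefixes):

```cpp
#include <memory>

struct Engine {}; // stand-in for NFA
using EnginePtr = std::unique_ptr<Engine>;

EnginePtr tryShengSketch() { return nullptr; }    // may fail
EnginePtr tryHybridSketch() { return nullptr; }   // may fail
EnginePtr tryMcClellanSketch() { return EnginePtr(new Engine); }

// Sheng first; the hybrid is skipped for transient prefixes (extra
// bytecode, usually run on small blocks); McClellan is the last resort.
EnginePtr getDfaSketch(bool is_transient) {
    auto dfa = tryShengSketch();
    if (!dfa && !is_transient) {
        dfa = tryHybridSketch();
    }
    if (!dfa) {
        dfa = tryMcClellanSketch();
    }
    return dfa;
}
```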
- dfa = mcclellanCompile(rdfa, cc, rm); + dfa = mcclellanCompile(rdfa, cc, rm, false); } return dfa; } /* builds suffix nfas */ static -aligned_unique_ptr +bytecode_ptr buildSuffix(const ReportManager &rm, const SomSlotManager &ssm, const map &fixed_depth_tops, const map>> &triggers, @@ -810,21 +744,21 @@ void findTriggerSequences(const RoseBuildImpl &tbi, const u32 top = e.first; const set &lit_ids = e.second; - for (u32 id : lit_ids) { - const rose_literal_id &lit = tbi.literals.right.at(id); + for (u32 id : lit_ids) { + const rose_literal_id &lit = tbi.literals.at(id); (*trigger_lits)[top].push_back(as_cr_seq(lit)); } } } -static aligned_unique_ptr -makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, - const bool is_prefix, const bool is_transient, - const map > &infixTriggers, - const CompileContext &cc) { +static +bytecode_ptr makeLeftNfa(const RoseBuildImpl &tbi, left_id &left, + const bool is_prefix, const bool is_transient, + const map> &infixTriggers, + const CompileContext &cc) { const ReportManager &rm = tbi.rm; - aligned_unique_ptr n; + bytecode_ptr n; // Should compress state if this rose is non-transient and we're in // streaming mode. @@ -969,8 +903,8 @@ u32 decreaseLag(const RoseBuildImpl &build, NGHolder &h, for (RoseVertex v : succs) { u32 lag = rg[v].left.lag; for (u32 lit_id : rg[v].literals) { - u32 delay = build.literals.right.at(lit_id).delay; - const ue2_literal &literal = build.literals.right.at(lit_id).s; + u32 delay = build.literals.at(lit_id).delay; + const ue2_literal &literal = build.literals.at(lit_id).s; assert(lag <= literal.length() + delay); size_t base = literal.length() + delay - lag; if (base >= literal.length()) { @@ -1105,6 +1039,31 @@ left_id updateLeftfixWithEager(RoseGraph &g, const eager_info &ei, return leftfix; } +static +void enforceEngineSizeLimit(const NFA *n, const Grey &grey) { + const size_t nfa_size = n->length; + // Global limit. + if (nfa_size > grey.limitEngineSize) { + throw ResourceLimitError(); + } + + // Type-specific limit checks follow. + + if (isDfaType(n->type)) { + if (nfa_size > grey.limitDFASize) { + throw ResourceLimitError(); + } + } else if (isNfaType(n->type)) { + if (nfa_size > grey.limitNFASize) { + throw ResourceLimitError(); + } + } else if (isLbrType(n->type)) { + if (nfa_size > grey.limitLBRSize) { + throw ResourceLimitError(); + } + } +} + static bool buildLeftfix(RoseBuildImpl &build, build_context &bc, bool prefix, u32 qi, const map > &infixTriggers, @@ -1125,7 +1084,7 @@ bool buildLeftfix(RoseBuildImpl &build, build_context &bc, bool prefix, u32 qi, leftfix = updateLeftfixWithEager(g, eager.at(leftfix), succs); } - aligned_unique_ptr nfa; + bytecode_ptr nfa; // Need to build NFA, which is either predestined to be a Haig (in SOM mode) // or could be all manner of things. 
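`bytecode_ptr<NFA>`, which replaces `aligned_unique_ptr<NFA>` throughout these hunks, carries the allocation's size and alignment along with the pointer, which is why out-parameters such as `*asize` disappear. A minimal sketch of the idea (not Hyperscan's real implementation; assumes `bytes` is a multiple of `align`, as `std::aligned_alloc` requires):

```cpp
#include <cstddef>
#include <cstdlib>

template <typename T>
class sized_ptr {
    T *ptr_ = nullptr;
    std::size_t bytes_ = 0;
public:
    sized_ptr() = default;
    sized_ptr(std::size_t bytes, std::size_t align)
        : ptr_(static_cast<T *>(std::aligned_alloc(align, bytes))),
          bytes_(bytes) {}
    ~sized_ptr() { std::free(ptr_); }
    sized_ptr(const sized_ptr &) = delete;
    sized_ptr &operator=(const sized_ptr &) = delete;
    T *get() const { return ptr_; }
    std::size_t size() const { return bytes_; } // replaces out-params
    explicit operator bool() const { return ptr_ != nullptr; }
};
```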
if (leftfix.haig()) { @@ -1142,8 +1101,10 @@ bool buildLeftfix(RoseBuildImpl &build, build_context &bc, bool prefix, u32 qi, setLeftNfaProperties(*nfa, leftfix); - build.leftfix_queue_map.emplace(leftfix, qi); nfa->queueIndex = qi; + enforceEngineSizeLimit(nfa.get(), cc.grey); + bc.engine_info_by_queue.emplace(nfa->queueIndex, + engine_info(nfa.get(), is_transient)); if (!prefix && !leftfix.haig() && leftfix.graph() && nfaStuckOn(*leftfix.graph())) { @@ -1171,7 +1132,7 @@ bool buildLeftfix(RoseBuildImpl &build, build_context &bc, bool prefix, u32 qi, for (RoseVertex v : succs) { for (auto u : inv_adjacent_vertices_range(v, g)) { for (u32 lit_id : g[u].literals) { - lits.insert(build.literals.right.at(lit_id).s); + lits.insert(build.literals.at(lit_id).s); } } } @@ -1241,12 +1202,10 @@ void updateTops(const RoseGraph &g, const TamaInfo &tamaInfo, for (const auto &n : tamaInfo.subengines) { for (const auto &v : subengines[i].vertices) { if (is_suffix) { - tamaProto.add(n, g[v].index, g[v].suffix.top, - out_top_remap); + tamaProto.add(n, g[v].index, g[v].suffix.top, out_top_remap); } else { for (const auto &e : in_edges_range(v, g)) { - tamaProto.add(n, g[v].index, g[e].rose_top, - out_top_remap); + tamaProto.add(n, g[v].index, g[e].rose_top, out_top_remap); } } } @@ -1259,32 +1218,34 @@ shared_ptr constructContainerEngine(const RoseGraph &g, build_context &bc, const ExclusiveInfo &info, const u32 queue, - const bool is_suffix) { + const bool is_suffix, + const Grey &grey) { const auto &subengines = info.subengines; - auto tamaInfo = - constructTamaInfo(g, subengines, is_suffix); + auto tamaInfo = constructTamaInfo(g, subengines, is_suffix); map, u32> out_top_remap; auto n = buildTamarama(*tamaInfo, queue, out_top_remap); + enforceEngineSizeLimit(n.get(), grey); + bc.engine_info_by_queue.emplace(n->queueIndex, engine_info(n.get(), false)); add_nfa_to_blob(bc, *n); DEBUG_PRINTF("queue id:%u\n", queue); shared_ptr tamaProto = make_shared(); tamaProto->reports = info.reports; - updateTops(g, *tamaInfo, *tamaProto, subengines, - out_top_remap, is_suffix); + updateTops(g, *tamaInfo, *tamaProto, subengines, out_top_remap, is_suffix); return tamaProto; } static void buildInfixContainer(RoseGraph &g, build_context &bc, - const vector &exclusive_info) { + const vector &exclusive_info, + const Grey &grey) { // Build tamarama engine for (const auto &info : exclusive_info) { const u32 queue = info.queue; const auto &subengines = info.subengines; auto tamaProto = - constructContainerEngine(g, bc, info, queue, false); + constructContainerEngine(g, bc, info, queue, false, grey); for (const auto &sub : subengines) { const auto &verts = sub.vertices; @@ -1298,13 +1259,14 @@ void buildInfixContainer(RoseGraph &g, build_context &bc, static void buildSuffixContainer(RoseGraph &g, build_context &bc, - const vector &exclusive_info) { + const vector &exclusive_info, + const Grey &grey) { // Build tamarama engine for (const auto &info : exclusive_info) { const u32 queue = info.queue; const auto &subengines = info.subengines; - auto tamaProto = - constructContainerEngine(g, bc, info, queue, true); + auto tamaProto = constructContainerEngine(g, bc, info, queue, true, + grey); for (const auto &sub : subengines) { const auto &verts = sub.vertices; for (const auto &v : verts) { @@ -1320,9 +1282,9 @@ void buildSuffixContainer(RoseGraph &g, build_context &bc, static void updateExclusiveInfixProperties(const RoseBuildImpl &build, - build_context &bc, - const vector &exclusive_info, - set *no_retrigger_queues) { + const 
vector &exclusive_info, + map &leftfix_info, + set *no_retrigger_queues) { const RoseGraph &g = build.g; for (const auto &info : exclusive_info) { // Set leftfix optimisations, disabled for tamarama subengines @@ -1351,7 +1313,7 @@ void updateExclusiveInfixProperties(const RoseBuildImpl &build, set lits; for (auto u : inv_adjacent_vertices_range(v, build.g)) { for (u32 lit_id : build.g[u].literals) { - lits.insert(build.literals.right.at(lit_id).s); + lits.insert(build.literals.at(lit_id).s); } } DEBUG_PRINTF("%zu literals\n", lits.size()); @@ -1372,9 +1334,10 @@ void updateExclusiveInfixProperties(const RoseBuildImpl &build, const auto &verts = sub.vertices; for (const auto &v : verts) { u32 lag = g[v].left.lag; - bc.leftfix_info.emplace( - v, left_build_info(qi, lag, max_width, squash_mask, stop, - max_queuelen, cm_count, cm_cr)); + leftfix_info.emplace(v, left_build_info(qi, lag, max_width, + squash_mask, stop, + max_queuelen, cm_count, + cm_cr)); } } } @@ -1436,9 +1399,9 @@ void buildExclusiveInfixes(RoseBuildImpl &build, build_context &bc, info.queue = qif.get_queue(); exclusive_info.push_back(move(info)); } - updateExclusiveInfixProperties(build, bc, exclusive_info, + updateExclusiveInfixProperties(build, exclusive_info, bc.leftfix_info, no_retrigger_queues); - buildInfixContainer(g, bc, exclusive_info); + buildInfixContainer(g, bc, exclusive_info, build.cc.grey); } static @@ -1510,8 +1473,7 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, findInfixTriggers(tbi, &infixTriggers); if (cc.grey.allowTamarama && cc.streaming && !do_prefix) { - findExclusiveInfixes(tbi, bc, qif, infixTriggers, - no_retrigger_queues); + findExclusiveInfixes(tbi, bc, qif, infixTriggers, no_retrigger_queues); } for (auto v : vertices_range(g)) { @@ -1540,7 +1502,7 @@ bool buildLeftfixes(RoseBuildImpl &tbi, build_context &bc, // TODO: Handle SOM-tracking cases as well. if (cc.grey.roseLookaroundMasks && is_transient && !g[v].left.tracksSom()) { - vector lookaround; + vector> lookaround; if (makeLeftfixLookaround(tbi, v, lookaround)) { DEBUG_PRINTF("implementing as lookaround!\n"); bc.leftfix_info.emplace(v, left_build_info(lookaround)); @@ -1613,26 +1575,26 @@ bool hasNonSmallBlockOutfix(const vector &outfixes) { } namespace { -class OutfixBuilder : public boost::static_visitor> { +class OutfixBuilder : public boost::static_visitor> { public: explicit OutfixBuilder(const RoseBuildImpl &build_in) : build(build_in) {} - aligned_unique_ptr operator()(boost::blank&) const { + bytecode_ptr operator()(boost::blank&) const { return nullptr; }; - aligned_unique_ptr operator()(unique_ptr &rdfa) const { + bytecode_ptr operator()(unique_ptr &rdfa) const { // Unleash the mighty DFA! return getDfa(*rdfa, false, build.cc, build.rm); } - aligned_unique_ptr operator()(unique_ptr &haig) const { + bytecode_ptr operator()(unique_ptr &haig) const { // Unleash the Goughfish! return goughCompile(*haig, build.ssm.somPrecision(), build.cc, build.rm); } - aligned_unique_ptr operator()(unique_ptr &holder) const { + bytecode_ptr operator()(unique_ptr &holder) const { const CompileContext &cc = build.cc; const ReportManager &rm = build.rm; @@ -1661,7 +1623,7 @@ class OutfixBuilder : public boost::static_visitor> { return n; } - aligned_unique_ptr operator()(UNUSED MpvProto &mpv) const { + bytecode_ptr operator()(UNUSED MpvProto &mpv) const { // MPV construction handled separately. 
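`OutfixBuilder` above follows the `boost::static_visitor` pattern: one overload per alternative stored in the outfix's variant `proto`, dispatched by `boost::apply_visitor`. A toy, self-contained illustration of the pattern (stand-in types, not Hyperscan's):

```cpp
#include <boost/variant.hpp>

// Stand-ins: the real variant holds raw_dfa, raw_som_dfa, NGHolder and
// MpvProto alternatives.
struct DfaProto {};
struct GraphProto {};
using Proto = boost::variant<DfaProto, GraphProto>;

struct Builder : boost::static_visitor<int> {
    int operator()(const DfaProto &) const { return 1; }   // "build a DFA"
    int operator()(const GraphProto &) const { return 2; } // "build an NFA"
};

int buildKind(const Proto &p) {
    return boost::apply_visitor(Builder(), p);
}
```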
assert(mpv.puffettes.empty()); return nullptr; @@ -1673,7 +1635,7 @@ class OutfixBuilder : public boost::static_visitor> { } static -aligned_unique_ptr buildOutfix(RoseBuildImpl &build, OutfixInfo &outfix) { +bytecode_ptr buildOutfix(const RoseBuildImpl &build, OutfixInfo &outfix) { assert(!outfix.is_dead()); // should not be marked dead. auto n = boost::apply_visitor(OutfixBuilder(build), outfix.proto); @@ -1719,6 +1681,9 @@ void prepMpv(RoseBuildImpl &tbi, build_context &bc, size_t *historyRequired, u32 qi = mpv_outfix->get_queue(tbi.qif); nfa->queueIndex = qi; + enforceEngineSizeLimit(nfa.get(), tbi.cc.grey); + bc.engine_info_by_queue.emplace(nfa->queueIndex, + engine_info(nfa.get(), false)); DEBUG_PRINTF("built mpv\n"); @@ -1777,6 +1742,9 @@ bool prepOutfixes(RoseBuildImpl &tbi, build_context &bc, setOutfixProperties(*n, out); n->queueIndex = out.get_queue(tbi.qif); + enforceEngineSizeLimit(n.get(), tbi.cc.grey); + bc.engine_info_by_queue.emplace(n->queueIndex, + engine_info(n.get(), false)); if (!*historyRequired && requires_decompress_key(*n)) { *historyRequired = 1; @@ -1789,7 +1757,7 @@ bool prepOutfixes(RoseBuildImpl &tbi, build_context &bc, } static -void assignSuffixQueues(RoseBuildImpl &build, build_context &bc) { +void assignSuffixQueues(RoseBuildImpl &build, map &suffixes) { const RoseGraph &g = build.g; for (auto v : vertices_range(g)) { @@ -1802,14 +1770,13 @@ void assignSuffixQueues(RoseBuildImpl &build, build_context &bc) { DEBUG_PRINTF("vertex %zu triggers suffix %p\n", g[v].index, s.graph()); // We may have already built this NFA. - if (contains(bc.suffixes, s)) { + if (contains(suffixes, s)) { continue; } u32 queue = build.qif.get_queue(); DEBUG_PRINTF("assigning %p to queue %u\n", s.graph(), queue); - bc.suffixes.emplace(s, queue); - build.suffix_queue_map.emplace(s, queue); + suffixes.emplace(s, queue); } } @@ -1875,14 +1842,14 @@ void buildExclusiveSuffixes(RoseBuildImpl &build, build_context &bc, } updateExclusiveSuffixProperties(build, exclusive_info, no_retrigger_queues); - buildSuffixContainer(g, bc, exclusive_info); + buildSuffixContainer(g, bc, exclusive_info, build.cc.grey); } static void findExclusiveSuffixes(RoseBuildImpl &tbi, build_context &bc, - QueueIndexFactory &qif, - map> &suffixTriggers, - set *no_retrigger_queues) { + QueueIndexFactory &qif, + map> &suffixTriggers, + set *no_retrigger_queues) { const RoseGraph &g = tbi.g; map suffixes; @@ -1972,6 +1939,10 @@ bool buildSuffixes(const RoseBuildImpl &tbi, build_context &bc, setSuffixProperties(*n, s, tbi.rm); n->queueIndex = queue; + enforceEngineSizeLimit(n.get(), tbi.cc.grey); + bc.engine_info_by_queue.emplace(n->queueIndex, + engine_info(n.get(), false)); + if (s.graph() && nfaStuckOn(*s.graph())) { /* todo: have corresponding * haig analysis */ assert(!s.haig()); @@ -2042,7 +2013,7 @@ bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, no_retrigger_queues); } - assignSuffixQueues(tbi, bc); + assignSuffixQueues(tbi, bc.suffixes); if (!buildSuffixes(tbi, bc, no_retrigger_queues, suffixTriggers)) { return false; @@ -2065,65 +2036,47 @@ bool buildNfas(RoseBuildImpl &tbi, build_context &bc, QueueIndexFactory &qif, } static -void allocateStateSpace(const NFA *nfa, const set &transient_queues, - RoseStateOffsets *so, NfaInfo *nfa_infos, - u32 *currFullStateSize, u32 *maskStateSize, - u32 *tStateSize) { - u32 qi = nfa->queueIndex; - bool transient = transient_queues.find(qi) != transient_queues.end(); - u32 stateSize = verify_u32(nfa->streamStateSize); - +void 
allocateStateSpace(const engine_info &eng_info, NfaInfo &nfa_info, + RoseStateOffsets *so, u32 *scratchStateSize, + u32 *streamStateSize, u32 *transientStateSize) { u32 state_offset; - if (transient) { - state_offset = *tStateSize; - *tStateSize += stateSize; + if (eng_info.transient) { + // Transient engines do not use stream state, but must have room in + // transient state (stored in scratch). + state_offset = *transientStateSize; + *transientStateSize += eng_info.stream_size; } else { - // Pack NFA state on to the end of the Rose state. + // Pack NFA stream state on to the end of the Rose stream state. state_offset = so->end; - so->end += stateSize; - *maskStateSize += stateSize; + so->end += eng_info.stream_size; + *streamStateSize += eng_info.stream_size; } - nfa_infos[qi].stateOffset = state_offset; + nfa_info.stateOffset = state_offset; - // Uncompressed state must be aligned. - u32 scratchStateSize = verify_u32(nfa->scratchStateSize); - u32 alignReq = state_alignment(*nfa); - assert(alignReq); - while (*currFullStateSize % alignReq) { - (*currFullStateSize)++; - } - nfa_infos[qi].fullStateOffset = *currFullStateSize; - *currFullStateSize += scratchStateSize; + // Uncompressed state in scratch must be aligned. + *scratchStateSize = ROUNDUP_N(*scratchStateSize, eng_info.scratch_align); + nfa_info.fullStateOffset = *scratchStateSize; + *scratchStateSize += eng_info.scratch_size; } static -void findTransientQueues(const map &leftfix_info, - set *out) { - DEBUG_PRINTF("curating transient queues\n"); - for (const auto &build : leftfix_info | map_values) { - if (build.transient) { - DEBUG_PRINTF("q %u is transient\n", build.queue); - out->insert(build.queue); - } +void updateNfaState(const build_context &bc, vector &nfa_infos, + RoseStateOffsets *so, u32 *scratchStateSize, + u32 *streamStateSize, u32 *transientStateSize) { + if (nfa_infos.empty()) { + return; } -} - -static -void updateNfaState(const build_context &bc, RoseStateOffsets *so, - NfaInfo *nfa_infos, u32 *fullStateSize, u32 *nfaStateSize, - u32 *tStateSize) { - *nfaStateSize = 0; - *tStateSize = 0; - *fullStateSize = 0; - set transient_queues; - findTransientQueues(bc.leftfix_info, &transient_queues); + *streamStateSize = 0; + *transientStateSize = 0; + *scratchStateSize = 0; - for (const auto &m : bc.engineOffsets) { - const NFA *n = get_nfa_from_blob(bc, m.first); - allocateStateSpace(n, transient_queues, so, nfa_infos, fullStateSize, - nfaStateSize, tStateSize); + for (u32 qi = 0; qi < nfa_infos.size(); qi++) { + NfaInfo &nfa_info = nfa_infos[qi]; + const auto &eng_info = bc.engine_info_by_queue.at(qi); + allocateStateSpace(eng_info, nfa_info, so, scratchStateSize, + streamStateSize, transientStateSize); } } @@ -2162,9 +2115,8 @@ u32 RoseBuildImpl::calcHistoryRequired() const { } // Delayed literals contribute to history requirement as well. 
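The reworked `allocateStateSpace()` above splits each engine's state between the Rose stream state (non-transient engines) and aligned scratch space (uncompressed state). A toy worked example of the packing arithmetic, with invented sizes:

```cpp
#include <cassert>

// ROUNDUP_N-style helper: round v up to a multiple of n.
static unsigned roundup_n(unsigned v, unsigned n) {
    return (v + n - 1) / n * n;
}

int main() {
    unsigned stream_end = 100; // so->end: current end of stream state
    unsigned scratch = 13;     // current scratch ("full") state size

    // One non-transient engine: stream_size 8, scratch_size 32, align 16.
    unsigned stateOffset = stream_end; // packed at end of stream state
    stream_end += 8;

    scratch = roundup_n(scratch, 16);  // uncompressed state is aligned
    unsigned fullStateOffset = scratch;
    scratch += 32;

    assert(stateOffset == 100 && stream_end == 108);
    assert(fullStateOffset == 16 && scratch == 48);
}
```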
- for (const auto &e : literals.right) { - const u32 id = e.first; - const auto &lit = e.second; + for (u32 id = 0; id < literals.size(); id++) { + const auto &lit = literals.at(id); if (lit.delay) { // If the literal is delayed _and_ has a mask that is longer than // the literal, we need enough history to match the whole mask as @@ -2203,6 +2155,7 @@ u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { auto it = bc.roleStateIndices.find(v); if (it != end(bc.roleStateIndices)) { lb_roles.push_back(it->second); + DEBUG_PRINTF("last byte %u\n", it->second); } } @@ -2210,35 +2163,10 @@ u32 buildLastByteIter(const RoseGraph &g, build_context &bc) { return 0; /* invalid offset */ } - vector iter; - mmbBuildSparseIterator(iter, lb_roles, bc.numStates); + auto iter = mmbBuildSparseIterator(lb_roles, bc.roleStateIndices.size()); return bc.engine_blob.add_iterator(iter); } -static -void enforceEngineSizeLimit(const NFA *n, const size_t nfa_size, const Grey &grey) { - // Global limit. - if (nfa_size > grey.limitEngineSize) { - throw ResourceLimitError(); - } - - // Type-specific limit checks follow. - - if (isDfaType(n->type)) { - if (nfa_size > grey.limitDFASize) { - throw ResourceLimitError(); - } - } else if (isNfaType(n->type)) { - if (nfa_size > grey.limitNFASize) { - throw ResourceLimitError(); - } - } else if (isLbrType(n->type)) { - if (nfa_size > grey.limitLBRSize) { - throw ResourceLimitError(); - } - } -} - static u32 findMinFloatingLiteralMatch(const RoseBuildImpl &build, const vector &anchored_dfas) { @@ -2269,17 +2197,16 @@ u32 findMinFloatingLiteralMatch(const RoseBuildImpl &build, } static -void buildSuffixEkeyLists(const RoseBuildImpl &tbi, build_context &bc, - const QueueIndexFactory &qif, - vector *out) { - out->resize(qif.allocated_count()); +vector buildSuffixEkeyLists(const RoseBuildImpl &build, build_context &bc, + const QueueIndexFactory &qif) { + vector out(qif.allocated_count()); - map > qi_to_ekeys; /* for determinism */ + map> qi_to_ekeys; /* for determinism */ for (const auto &e : bc.suffixes) { const suffix_id &s = e.first; u32 qi = e.second; - set ekeys = reportsToEkeys(all_reports(s), tbi.rm); + set ekeys = reportsToEkeys(all_reports(s), build.rm); if (!ekeys.empty()) { qi_to_ekeys[qi] = {ekeys.begin(), ekeys.end()}; @@ -2287,9 +2214,9 @@ void buildSuffixEkeyLists(const RoseBuildImpl &tbi, build_context &bc, } /* for each outfix also build elists */ - for (const auto &outfix : tbi.outfixes) { + for (const auto &outfix : build.outfixes) { u32 qi = outfix.get_queue(); - set ekeys = reportsToEkeys(all_reports(outfix), tbi.rm); + set ekeys = reportsToEkeys(all_reports(outfix), build.rm); if (!ekeys.empty()) { qi_to_ekeys[qi] = {ekeys.begin(), ekeys.end()}; @@ -2297,11 +2224,14 @@ void buildSuffixEkeyLists(const RoseBuildImpl &tbi, build_context &bc, } for (auto &e : qi_to_ekeys) { - assert(!e.second.empty()); - e.second.push_back(INVALID_EKEY); /* terminator */ - (*out)[e.first] = bc.engine_blob.add(e.second.begin(), - e.second.end()); + u32 qi = e.first; + auto &ekeys = e.second; + assert(!ekeys.empty()); + ekeys.push_back(INVALID_EKEY); /* terminator */ + out[qi] = bc.engine_blob.add_range(ekeys); } + + return out; } /** Returns sparse iter offset in engine blob. 
*/ @@ -2309,8 +2239,8 @@ static u32 buildEodNfaIterator(build_context &bc, const u32 activeQueueCount) { vector keys; for (u32 qi = 0; qi < activeQueueCount; ++qi) { - const NFA *n = get_nfa_from_blob(bc, qi); - if (nfaAcceptsEod(n)) { + const auto &eng_info = bc.engine_info_by_queue.at(qi); + if (eng_info.accepts_eod) { DEBUG_PRINTF("nfa qi=%u accepts eod\n", qi); keys.push_back(qi); } @@ -2322,8 +2252,7 @@ u32 buildEodNfaIterator(build_context &bc, const u32 activeQueueCount) { DEBUG_PRINTF("building iter for %zu nfas\n", keys.size()); - vector iter; - mmbBuildSparseIterator(iter, keys, activeQueueCount); + auto iter = mmbBuildSparseIterator(keys, activeQueueCount); return bc.engine_blob.add_iterator(iter); } @@ -2368,42 +2297,6 @@ bool anyEndfixMpvTriggers(const RoseBuildImpl &tbi) { return false; } -static -void populateNfaInfoBasics(const RoseBuildImpl &build, const build_context &bc, - const vector &outfixes, - const vector &ekeyListOffsets, - const set &no_retrigger_queues, - NfaInfo *infos) { - const u32 num_queues = build.qif.allocated_count(); - for (u32 qi = 0; qi < num_queues; qi++) { - const NFA *n = get_nfa_from_blob(bc, qi); - enforceEngineSizeLimit(n, n->length, build.cc.grey); - - NfaInfo &info = infos[qi]; - info.nfaOffset = bc.engineOffsets.at(qi); - info.ekeyListOffset = ekeyListOffsets[qi]; - info.no_retrigger = contains(no_retrigger_queues, qi) ? 1 : 0; - } - - // Mark outfixes that are in the small block matcher. - for (const auto &out : outfixes) { - const u32 qi = out.get_queue(); - infos[qi].in_sbmatcher = out.in_sbmatcher; - } - - // Mark suffixes triggered by EOD table literals. - const RoseGraph &g = build.g; - for (auto v : vertices_range(g)) { - if (!g[v].suffix) { - continue; - } - u32 qi = bc.suffixes.at(g[v].suffix); - if (build.isInETable(v)) { - infos[qi].eod = 1; - } - } -} - struct DerivedBoundaryReports { explicit DerivedBoundaryReports(const BoundaryReports &boundary) { insert(&report_at_0_eod_full, boundary.report_at_0_eod); @@ -2414,144 +2307,33 @@ struct DerivedBoundaryReports { }; static -void prepSomRevNfas(const SomSlotManager &ssm, u32 *rev_nfa_table_offset, - vector *nfa_offsets, u32 *currOffset) { - const deque> &nfas = ssm.getRevNfas(); - - *currOffset = ROUNDUP_N(*currOffset, alignof(u32)); - *rev_nfa_table_offset = *currOffset; - *currOffset += sizeof(u32) * nfas.size(); - - *currOffset = ROUNDUP_CL(*currOffset); - for (const auto &n : nfas) { - u32 bs_offset; - bs_offset = *currOffset; - nfa_offsets->push_back(bs_offset); - *currOffset += ROUNDUP_CL(n->length); +void addSomRevNfas(build_context &bc, RoseEngine &proto, + const SomSlotManager &ssm) { + const auto &nfas = ssm.getRevNfas(); + vector nfa_offsets; + nfa_offsets.reserve(nfas.size()); + for (const auto &nfa : nfas) { + assert(nfa); + u32 offset = bc.engine_blob.add(*nfa, nfa->length); + DEBUG_PRINTF("wrote SOM rev NFA %zu (len %u) to offset %u\n", + nfa_offsets.size(), nfa->length, offset); + nfa_offsets.push_back(offset); /* note: som rev nfas don't need a queue assigned as only run in block * mode reverse */ } - assert(nfa_offsets->size() == nfas.size()); -} - -static -void fillInSomRevNfas(RoseEngine *engine, const SomSlotManager &ssm, - u32 rev_nfa_table_offset, - const vector &nfa_offsets) { - const deque> &nfas = ssm.getRevNfas(); - assert(nfa_offsets.size() == nfas.size()); - - engine->somRevCount = (u32)nfas.size(); - engine->somRevOffsetOffset = rev_nfa_table_offset; - - if (nfas.empty()) { - return; - } - - char *out = (char *)engine + rev_nfa_table_offset; - size_t 
table_size = sizeof(u32) * nfa_offsets.size(); - memcpy(out, nfa_offsets.data(), table_size); - out = (char *)engine + ROUNDUP_CL(rev_nfa_table_offset + table_size); - - // Write the SOM reverse NFAs into place. - UNUSED size_t i = 0; - for (const auto &n : nfas) { - assert(n != nullptr); - assert(out == (char *)engine + nfa_offsets[i]); - - memcpy(out, n.get(), n->length); - out += ROUNDUP_CL(n->length); - DEBUG_PRINTF("wrote som rev nfa with len %u\n", n->length); - ++i; - } -} - -static -vector -getLiteralInfoByFinalId(const RoseBuildImpl &build, u32 final_id) { - vector out; - - const auto &final_id_to_literal = build.final_id_to_literal; - assert(contains(final_id_to_literal, final_id)); - - const auto &lits = final_id_to_literal.find(final_id)->second; - assert(!lits.empty()); - - for (const auto &lit_id : lits) { - const rose_literal_info &li = build.literal_info[lit_id]; - assert(li.final_id == final_id); - out.push_back(&li); - } - - return out; -} - -static -void applyFinalSpecialisation(RoseProgram &program) { - assert(!program.empty()); - assert(program.back().code() == ROSE_INSTR_END); - if (program.size() < 2) { - return; - } - - /* Replace the second-to-last instruction (before END) with a one-shot - * specialisation if available. */ - auto it = next(program.rbegin()); - if (auto *ri = dynamic_cast(it->get())) { - DEBUG_PRINTF("replacing REPORT with FINAL_REPORT\n"); - program.replace(it, make_unique( - ri->onmatch, ri->offset_adjust)); - } -} - -static -void recordResources(RoseResources &resources, const RoseProgram &program) { - for (const auto &ri : program) { - switch (ri->code()) { - case ROSE_INSTR_TRIGGER_SUFFIX: - resources.has_suffixes = true; - break; - case ROSE_INSTR_TRIGGER_INFIX: - case ROSE_INSTR_CHECK_INFIX: - case ROSE_INSTR_CHECK_PREFIX: - case ROSE_INSTR_SOM_LEFTFIX: - resources.has_leftfixes = true; - break; - case ROSE_INSTR_SET_STATE: - case ROSE_INSTR_CHECK_STATE: - case ROSE_INSTR_SPARSE_ITER_BEGIN: - case ROSE_INSTR_SPARSE_ITER_NEXT: - resources.has_states = true; - break; - case ROSE_INSTR_CHECK_GROUPS: - resources.checks_groups = true; - break; - case ROSE_INSTR_PUSH_DELAYED: - resources.has_lit_delay = true; - break; - case ROSE_INSTR_CHECK_LONG_LIT: - case ROSE_INSTR_CHECK_LONG_LIT_NOCASE: - resources.has_lit_check = true; - break; - default: - break; - } - } + proto.somRevCount = verify_u32(nfas.size()); + proto.somRevOffsetOffset = bc.engine_blob.add_range(nfa_offsets); } static -void recordResources(RoseResources &resources, - const RoseBuildImpl &build) { +void recordResources(RoseResources &resources, const RoseBuildImpl &build, + const vector &fragments) { if (!build.outfixes.empty()) { resources.has_outfixes = true; } - for (u32 i = 0; i < build.literal_info.size(); i++) { - if (build.hasFinalId(i)) { - resources.has_literals = true; - break; - } - } + + resources.has_literals = !fragments.empty(); const auto &g = build.g; for (const auto &v : vertices_range(g)) { @@ -2566,25 +2348,6 @@ void recordResources(RoseResources &resources, } } -static -void recordLongLiterals(build_context &bc, const RoseProgram &program) { - for (const auto &ri : program) { - if (const auto *ri_check = - dynamic_cast(ri.get())) { - DEBUG_PRINTF("found CHECK_LITERAL for string '%s'\n", - escapeString(ri_check->literal).c_str()); - bc.longLiterals.emplace_back(ri_check->literal, false); - continue; - } - if (const auto *ri_check = - dynamic_cast(ri.get())) { - DEBUG_PRINTF("found CHECK_LITERAL_NOCASE for string '%s'\n", - escapeString(ri_check->literal).c_str()); 
- bc.longLiterals.emplace_back(ri_check->literal, true); - } - } -} - static u32 writeProgram(build_context &bc, RoseProgram &&program) { if (program.empty()) { @@ -2592,6 +2355,8 @@ u32 writeProgram(build_context &bc, RoseProgram &&program) { return 0; } + applyFinalSpecialisation(program); + auto it = bc.program_cache.find(program); if (it != end(bc.program_cache)) { DEBUG_PRINTF("reusing cached program at %u\n", it->second); @@ -2599,73 +2364,43 @@ u32 writeProgram(build_context &bc, RoseProgram &&program) { } recordResources(bc.resources, program); - recordLongLiterals(bc, program); + recordLongLiterals(bc.longLiterals, program); - u32 len = 0; - auto prog_bytecode = writeProgram(bc.engine_blob, program, &len); - u32 offset = bc.engine_blob.add(prog_bytecode.get(), len, - ROSE_INSTR_MIN_ALIGN); - DEBUG_PRINTF("prog len %u written at offset %u\n", len, offset); + auto prog_bytecode = writeProgram(bc.engine_blob, program); + u32 offset = bc.engine_blob.add(prog_bytecode); + DEBUG_PRINTF("prog len %zu written at offset %u\n", prog_bytecode.size(), + offset); bc.program_cache.emplace(move(program), offset); return offset; } static -void buildActiveLeftIter(const vector &leftTable, - vector &out) { +u32 writeActiveLeftIter(RoseEngineBlob &engine_blob, + const vector &leftInfoTable) { vector keys; - for (size_t i = 0; i < leftTable.size(); i++) { - if (!leftTable[i].transient) { - DEBUG_PRINTF("rose %zu is active\n", i); + for (size_t i = 0; i < leftInfoTable.size(); i++) { + if (!leftInfoTable[i].transient) { + DEBUG_PRINTF("leftfix %zu is active\n", i); keys.push_back(verify_u32(i)); } } - DEBUG_PRINTF("%zu active roses\n", keys.size()); + DEBUG_PRINTF("%zu active leftfixes\n", keys.size()); if (keys.empty()) { - out.clear(); - return; - } - - mmbBuildSparseIterator(out, keys, leftTable.size()); -} - -static -bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) { - const auto &g = build.g; - const auto v = target(e, g); - - if (!build.g[v].eod_accept) { - return false; - } - - // If there's a graph between us and EOD, we shouldn't be eager. - if (build.g[v].left) { - return false; - } - - // Must be exactly at EOD. - if (g[e].minBound != 0 || g[e].maxBound != 0) { - return false; - } - - // In streaming mode, we can only eagerly report EOD for literals in the - // EOD-anchored table, as that's the only time we actually know where EOD - // is. In block mode, we always have this information. 
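`writeProgram()` earlier in this hunk deduplicates identical Rose programs through `bc.program_cache`, so equivalent roles share one copy of bytecode in the engine blob. A toy model of that caching (the real cache is keyed on the hashed `RoseProgram` itself, not on a string):

```cpp
#include <cassert>
#include <string>
#include <unordered_map>

int main() {
    std::unordered_map<std::string, unsigned> cache;
    unsigned next_offset = 64;
    auto write = [&](const std::string &prog) -> unsigned {
        auto it = cache.find(prog);
        if (it != cache.end()) {
            return it->second; // reuse previously written program
        }
        unsigned offset = next_offset;
        next_offset += (unsigned)prog.size();
        cache.emplace(prog, offset);
        return offset;
    };
    assert(write("CHECK_GROUPS;REPORT;END") ==
           write("CHECK_GROUPS;REPORT;END")); // deduplicated
}
```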
- const auto u = source(e, g); - if (build.cc.streaming && !build.isInETable(u)) { - return false; + return 0; } - return true; + auto iter = mmbBuildSparseIterator(keys, verify_u32(leftInfoTable.size())); + return engine_blob.add_iterator(iter); } static bool hasEodAnchors(const RoseBuildImpl &build, const build_context &bc, u32 outfixEndQueue) { for (u32 i = 0; i < outfixEndQueue; i++) { - if (nfaAcceptsEod(get_nfa_from_blob(bc, i))) { + const auto &eng_info = bc.engine_info_by_queue.at(i); + if (eng_info.accepts_eod) { DEBUG_PRINTF("outfix has eod\n"); return true; } @@ -2692,23 +2427,72 @@ bool hasEodAnchors(const RoseBuildImpl &build, const build_context &bc, } static -void fillLookaroundTables(char *look_base, char *reach_base, - const vector &look_vec) { - DEBUG_PRINTF("%zu lookaround table entries\n", look_vec.size()); +void writeDkeyInfo(const ReportManager &rm, RoseEngineBlob &engine_blob, + RoseEngine &proto) { + const auto inv_dkeys = rm.getDkeyToReportTable(); + proto.invDkeyOffset = engine_blob.add_range(inv_dkeys); + proto.dkeyCount = rm.numDkeys(); + proto.dkeyLogSize = fatbit_size(proto.dkeyCount); +} + +static +void writeLeftInfo(RoseEngineBlob &engine_blob, RoseEngine &proto, + const vector &leftInfoTable) { + proto.leftOffset = engine_blob.add_range(leftInfoTable); + proto.activeLeftIterOffset + = writeActiveLeftIter(engine_blob, leftInfoTable); + proto.roseCount = verify_u32(leftInfoTable.size()); + proto.activeLeftCount = verify_u32(leftInfoTable.size()); + proto.rosePrefixCount = countRosePrefixes(leftInfoTable); +} + +static +void writeNfaInfo(const RoseBuildImpl &build, build_context &bc, + RoseEngine &proto, const set &no_retrigger_queues) { + const u32 queue_count = build.qif.allocated_count(); + if (!queue_count) { + return; + } + + auto ekey_lists = buildSuffixEkeyLists(build, bc, build.qif); - s8 *look = (s8 *)look_base; - u8 *reach = (u8 *)reach_base; // base for 256-bit bitvectors + vector infos(queue_count); + memset(infos.data(), 0, sizeof(NfaInfo) * queue_count); - for (const auto &le : look_vec) { - *look = verify_s8(le.offset); - const CharReach &cr = le.reach; + for (u32 qi = 0; qi < queue_count; qi++) { + NfaInfo &info = infos[qi]; + info.nfaOffset = bc.engineOffsets.at(qi); + assert(qi < ekey_lists.size()); + info.ekeyListOffset = ekey_lists.at(qi); + info.no_retrigger = contains(no_retrigger_queues, qi) ? 1 : 0; + } - assert(cr.any()); // Should be at least one character! - fill_bitvector(cr, reach); + // Mark outfixes that are in the small block matcher. + for (const auto &out : build.outfixes) { + const u32 qi = out.get_queue(); + assert(qi < infos.size()); + infos.at(qi).in_sbmatcher = out.in_sbmatcher; + } - ++look; - reach += REACH_BITVECTOR_LEN; + // Mark suffixes triggered by EOD table literals. + const RoseGraph &g = build.g; + for (auto v : vertices_range(g)) { + if (!g[v].suffix) { + continue; + } + u32 qi = bc.suffixes.at(g[v].suffix); + assert(qi < infos.size()); + if (build.isInETable(v)) { + infos.at(qi).eod = 1; + } } + + // Update state offsets to do with NFAs in proto and in the NfaInfo + // structures. + updateNfaState(bc, infos, &proto.stateOffsets, &proto.scratchStateSize, + &proto.nfaStateSize, &proto.tStateSize); + + proto.nfaInfoOffset = bc.engine_blob.add_range(infos); } static @@ -2729,1150 +2513,31 @@ bool hasBoundaryReports(const BoundaryReports &boundary) { return false; } -/** - * \brief True if the given vertex is a role that can only be switched on at - * EOD. 
- */ static -bool onlyAtEod(const RoseBuildImpl &tbi, RoseVertex v) { - const RoseGraph &g = tbi.g; - - // All such roles have only (0,0) edges to vertices with the eod_accept - // property, and no other effects (suffixes, ordinary reports, etc, etc). - - if (isLeafNode(v, g) || !g[v].reports.empty() || g[v].suffix) { - return false; - } - - for (const auto &e : out_edges_range(v, g)) { - RoseVertex w = target(e, g); - if (!g[w].eod_accept) { - return false; - } - assert(!g[w].reports.empty()); - assert(g[w].literals.empty()); +void makeBoundaryPrograms(const RoseBuildImpl &build, build_context &bc, + const BoundaryReports &boundary, + const DerivedBoundaryReports &dboundary, + RoseBoundaryReports &out) { + DEBUG_PRINTF("report ^: %zu\n", boundary.report_at_0.size()); + DEBUG_PRINTF("report $: %zu\n", boundary.report_at_eod.size()); + DEBUG_PRINTF("report ^$: %zu\n", dboundary.report_at_0_eod_full.size()); - if (g[e].minBound || g[e].maxBound) { - return false; - } - } + auto eod_prog = makeBoundaryProgram(build, boundary.report_at_eod); + out.reportEodOffset = writeProgram(bc, move(eod_prog)); - /* There is no pointing enforcing this check at runtime if - * this role is only fired by the eod event literal */ - if (tbi.eod_event_literal_id != MO_INVALID_IDX && - g[v].literals.size() == 1 && - *g[v].literals.begin() == tbi.eod_event_literal_id) { - return false; - } + auto zero_prog = makeBoundaryProgram(build, boundary.report_at_0); + out.reportZeroOffset = writeProgram(bc, move(zero_prog)); - return true; + auto zeod_prog = makeBoundaryProgram(build, dboundary.report_at_0_eod_full); + out.reportZeroEodOffset = writeProgram(bc, move(zeod_prog)); } static -u32 addLookaround(build_context &bc, const vector &look) { - // Check the cache. - auto it = bc.lookaround_cache.find(look); - if (it != bc.lookaround_cache.end()) { - DEBUG_PRINTF("reusing look at idx %zu\n", it->second); - return verify_u32(it->second); - } - - // Linear scan for sequence. - auto seq_it = search(begin(bc.lookaround), end(bc.lookaround), begin(look), - end(look)); - if (seq_it != end(bc.lookaround)) { - size_t idx = distance(begin(bc.lookaround), seq_it); - DEBUG_PRINTF("linear scan found look at idx %zu\n", idx); - bc.lookaround_cache.emplace(look, idx); - return verify_u32(idx); - } - - // New sequence. - size_t idx = bc.lookaround.size(); - bc.lookaround_cache.emplace(look, idx); - insert(&bc.lookaround, bc.lookaround.end(), look); - DEBUG_PRINTF("adding look at idx %zu\n", idx); - return verify_u32(idx); -} - -static -bool checkReachMask(const CharReach &cr, u8 &andmask, u8 &cmpmask) { - size_t reach_size = cr.count(); - assert(reach_size > 0); - // check whether entry_size is some power of 2. 
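The reach-as-mask logic in `checkReachMask()` (continued below) rests on two bit tricks: `(n - 1) & n == 0` tests that the reach size is a power of two, and a class of that shape can be matched with a single AND-and-compare. A worked, runnable example for the class {'A', 'a'}:

```cpp
#include <cassert>

int main() {
    // {'A' (0x41), 'a' (0x61)} differ only in bit 0x20, so:
    unsigned andmask = 0xdf; // ~0x20 & 0xff: ignore the case bit
    unsigned cmpmask = 0x41;
    int matches = 0;
    for (unsigned c = 0; c < 256; c++) {
        if ((c & andmask) == cmpmask) {
            matches++;
            assert(c == 0x41 || c == 0x61);
        }
    }
    assert(matches == 2);       // reach size 2: a power of two
    assert(((2 - 1) & 2) == 0); // the power-of-two test from the code
}
```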
- if ((reach_size - 1) & reach_size) { - return false; - } - make_and_cmp_mask(cr, &andmask, &cmpmask); - if ((1 << popcount32((u8)(~andmask))) ^ reach_size) { - return false; - } - return true; -} - -static -bool checkReachWithFlip(const CharReach &cr, u8 &andmask, - u8 &cmpmask, u8 &flip) { - if (checkReachMask(cr, andmask, cmpmask)) { - flip = 0; - return true; - } - if (checkReachMask(~cr, andmask, cmpmask)) { - flip = 1; - return true; - } - return false; -} - -static -bool makeRoleByte(const vector &look, RoseProgram &program) { - if (look.size() == 1) { - const auto &entry = look[0]; - u8 andmask_u8, cmpmask_u8; - u8 flip; - if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) { - return false; - } - s32 checkbyte_offset = verify_s32(entry.offset); - DEBUG_PRINTF("CHECK BYTE offset=%d\n", checkbyte_offset); - const auto *end_inst = program.end_instruction(); - auto ri = make_unique(andmask_u8, cmpmask_u8, flip, - checkbyte_offset, end_inst); - program.add_before_end(move(ri)); - return true; - } - return false; -} - -static -bool makeRoleMask(const vector &look, RoseProgram &program) { - if (look.back().offset < look.front().offset + 8) { - s32 base_offset = verify_s32(look.front().offset); - u64a and_mask = 0; - u64a cmp_mask = 0; - u64a neg_mask = 0; - for (const auto &entry : look) { - u8 andmask_u8, cmpmask_u8, flip; - if (!checkReachWithFlip(entry.reach, andmask_u8, - cmpmask_u8, flip)) { - return false; - } - DEBUG_PRINTF("entry offset %d\n", entry.offset); - u32 shift = (entry.offset - base_offset) << 3; - and_mask |= (u64a)andmask_u8 << shift; - cmp_mask |= (u64a)cmpmask_u8 << shift; - if (flip) { - neg_mask |= 0xffLLU << shift; - } - } - DEBUG_PRINTF("CHECK MASK and_mask=%llx cmp_mask=%llx\n", - and_mask, cmp_mask); - const auto *end_inst = program.end_instruction(); - auto ri = make_unique(and_mask, cmp_mask, neg_mask, - base_offset, end_inst); - program.add_before_end(move(ri)); - return true; - } - return false; -} - -static UNUSED -string convertMaskstoString(u8 *p, int byte_len) { - string s; - for (int i = 0; i < byte_len; i++) { - u8 hi = *p >> 4; - u8 lo = *p & 0xf; - s += (char)(hi + (hi < 10 ? 48 : 87)); - s += (char)(lo + (lo < 10 ? 48 : 87)); - p++; - } - return s; -} - -static -bool makeRoleMask32(const vector &look, - RoseProgram &program) { - if (look.back().offset >= look.front().offset + 32) { - return false; - } - s32 base_offset = verify_s32(look.front().offset); - array and_mask, cmp_mask; - and_mask.fill(0); - cmp_mask.fill(0); - u32 neg_mask = 0; - for (const auto &entry : look) { - u8 andmask_u8, cmpmask_u8, flip; - if (!checkReachWithFlip(entry.reach, andmask_u8, - cmpmask_u8, flip)) { - return false; - } - u32 shift = entry.offset - base_offset; - assert(shift < 32); - and_mask[shift] = andmask_u8; - cmp_mask[shift] = cmpmask_u8; - if (flip) { - neg_mask |= 1 << shift; - } - } - - DEBUG_PRINTF("and_mask %s\n", - convertMaskstoString(and_mask.data(), 32).c_str()); - DEBUG_PRINTF("cmp_mask %s\n", - convertMaskstoString(cmp_mask.data(), 32).c_str()); - DEBUG_PRINTF("neg_mask %08x\n", neg_mask); - DEBUG_PRINTF("base_offset %d\n", base_offset); - - const auto *end_inst = program.end_instruction(); - auto ri = make_unique(and_mask, cmp_mask, neg_mask, - base_offset, end_inst); - program.add_before_end(move(ri)); - return true; -} - -// Sorting by the size of every bucket. -// Used in map, cmpNibble>. 
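Before turning to the nibble buckets, here is a runnable miniature of the 8-byte packing done by `makeRoleMask()` above: per-offset AND/compare byte masks are shifted into one `u64` pair, so up to eight lookaround checks collapse into a single load, AND and compare (the `neg_mask` flip handling is omitted for brevity):

```cpp
#include <cassert>
typedef unsigned long long u64;

int main() {
    u64 and_mask = 0, cmp_mask = 0;
    // Offset 0 must be 'a'/'A' (mask 0xdf/0x41); offset 3 must be '0'.
    and_mask |= (u64)0xdf << (0 * 8);
    cmp_mask |= (u64)0x41 << (0 * 8);
    and_mask |= (u64)0xff << (3 * 8);
    cmp_mask |= (u64)'0' << (3 * 8);

    const char data[8] = {'A', 'x', 'y', '0', 0, 0, 0, 0};
    u64 v = 0;
    for (int i = 0; i < 8; i++) {
        v |= (u64)(unsigned char)data[i] << (i * 8);
    }
    assert((v & and_mask) == cmp_mask); // the role's lookaround passes
}
```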
-struct cmpNibble { - bool operator()(const u32 data1, const u32 data2) const{ - u32 size1 = popcount32(data1 >> 16) * popcount32(data1 << 16); - u32 size2 = popcount32(data2 >> 16) * popcount32(data2 << 16); - return std::tie(size1, data1) < std::tie(size2, data2); - } -}; - -// Insert all pairs of bucket and offset into buckets. -static really_inline -void getAllBuckets(const vector &look, - map, cmpNibble> &buckets, u32 &neg_mask) { - s32 base_offset = verify_s32(look.front().offset); - for (const auto &entry : look) { - CharReach cr = entry.reach; - // Flip heavy character classes to save buckets. - if (cr.count() > 128 ) { - cr.flip(); - } else { - neg_mask ^= 1 << (entry.offset - base_offset); - } - map lo2hi; - // We treat Ascii Table as a 16x16 grid. - // Push every row in cr into lo2hi and mark the row number. - for (size_t i = cr.find_first(); i != CharReach::npos;) { - u8 it_hi = i >> 4; - u16 low_encode = 0; - while (i != CharReach::npos && (i >> 4) == it_hi) { - low_encode |= 1 << (i & 0xf); - i = cr.find_next(i); - } - lo2hi[low_encode] |= 1 << it_hi; - } - for (const auto &it : lo2hi) { - u32 hi_lo = (it.second << 16) | it.first; - buckets[hi_lo].push_back(entry.offset); - } - } -} - -// Once we have a new bucket, we'll try to combine it with all old buckets. -static really_inline -void nibUpdate(map &nib, u32 hi_lo) { - u16 hi = hi_lo >> 16; - u16 lo = hi_lo & 0xffff; - for (const auto pairs : nib) { - u32 old = pairs.first; - if ((old >> 16) == hi || (old & 0xffff) == lo) { - if (!nib[old | hi_lo]) { - nib[old | hi_lo] = nib[old] | nib[hi_lo]; - } - } - } -} - -static really_inline -void nibMaskUpdate(array &mask, u32 data, u8 bit_index) { - for (u8 index = 0; data > 0; data >>= 1, index++) { - if (data & 1) { - // 0 ~ 7 bucket in first 16 bytes, - // 8 ~ 15 bucket in second 16 bytes. - if (bit_index >= 8) { - mask[index + 16] |= 1 << (bit_index - 8); - } else { - mask[index] |= 1 << bit_index; - } - } - } -} - -static -bool makeRoleShufti(const vector &look, - RoseProgram &program) { - - s32 base_offset = verify_s32(look.front().offset); - if (look.back().offset >= base_offset + 32) { - return false; - } - array hi_mask, lo_mask; - hi_mask.fill(0); - lo_mask.fill(0); - array bucket_select_hi, bucket_select_lo; - bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8. - bucket_select_lo.fill(0); - u8 bit_index = 0; // number of buckets - map nib; // map every bucket to its bucket number. - map, cmpNibble> bucket2offsets; - u32 neg_mask = ~0u; - - getAllBuckets(look, bucket2offsets, neg_mask); - - for (const auto &it : bucket2offsets) { - u32 hi_lo = it.first; - // New bucket. - if (!nib[hi_lo]) { - if (bit_index >= 16) { - return false; - } - nib[hi_lo] = 1 << bit_index; - - nibUpdate(nib, hi_lo); - nibMaskUpdate(hi_mask, hi_lo >> 16, bit_index); - nibMaskUpdate(lo_mask, hi_lo & 0xffff, bit_index); - bit_index++; - } - - DEBUG_PRINTF("hi_lo %x bucket %x\n", hi_lo, nib[hi_lo]); - - // Update bucket_select_mask. 
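`getAllBuckets()` and `makeRoleShufti()` above treat the byte as a 16x16 grid indexed by high and low nibble; a character belongs to a bucket iff the bucket's bit survives the AND of both nibble-table entries. This is the PSHUFB-style shufti test, modelled here in scalar code. ('a' 0x61 and 'c' 0x63 share a row, so this example has no cross-product false positives; in general extra candidates can pass and need confirmation.)

```cpp
#include <cassert>

static unsigned char hi_tbl[16], lo_tbl[16];

// Bucket bit 0: set iff the bit appears in both row and column entries.
static bool in_bucket(unsigned char c) {
    return (hi_tbl[c >> 4] & lo_tbl[c & 0xf] & 1) != 0;
}

int main() {
    const unsigned char chars[] = {'a', 'c'};
    for (unsigned char c : chars) {
        hi_tbl[c >> 4] |= 1;  // row: high nibble
        lo_tbl[c & 0xf] |= 1; // column: low nibble
    }
    assert(in_bucket('a') && in_bucket('c'));
    assert(!in_bucket('b') && !in_bucket('q'));
}
```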
- u8 nib_hi = nib[hi_lo] >> 8; - u8 nib_lo = nib[hi_lo] & 0xff; - for (const auto offset : it.second) { - bucket_select_hi[offset - base_offset] |= nib_hi; - bucket_select_lo[offset - base_offset] |= nib_lo; - } - } - - DEBUG_PRINTF("hi_mask %s\n", - convertMaskstoString(hi_mask.data(), 32).c_str()); - DEBUG_PRINTF("lo_mask %s\n", - convertMaskstoString(lo_mask.data(), 32).c_str()); - DEBUG_PRINTF("bucket_select_hi %s\n", - convertMaskstoString(bucket_select_hi.data(), 32).c_str()); - DEBUG_PRINTF("bucket_select_lo %s\n", - convertMaskstoString(bucket_select_lo.data(), 32).c_str()); - - const auto *end_inst = program.end_instruction(); - if (bit_index < 8) { - if (look.back().offset < base_offset + 16) { - neg_mask &= 0xffff; - array nib_mask; - array bucket_select_mask_16; - copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin()); - copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16); - copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16, - bucket_select_mask_16.begin()); - auto ri = make_unique - (nib_mask, bucket_select_mask_16, - neg_mask, base_offset, end_inst); - program.add_before_end(move(ri)); - } else { - array hi_mask_16; - array lo_mask_16; - copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_16.begin()); - copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_16.begin()); - auto ri = make_unique - (hi_mask_16, lo_mask_16, bucket_select_lo, - neg_mask, base_offset, end_inst); - program.add_before_end(move(ri)); - } - } else { - if (look.back().offset < base_offset + 16) { - neg_mask &= 0xffff; - array bucket_select_mask_32; - copy(bucket_select_lo.begin(), bucket_select_lo.begin() + 16, - bucket_select_mask_32.begin()); - copy(bucket_select_hi.begin(), bucket_select_hi.begin() + 16, - bucket_select_mask_32.begin() + 16); - auto ri = make_unique - (hi_mask, lo_mask, bucket_select_mask_32, - neg_mask, base_offset, end_inst); - program.add_before_end(move(ri)); - } else { - auto ri = make_unique - (hi_mask, lo_mask, bucket_select_hi, bucket_select_lo, - neg_mask, base_offset, end_inst); - program.add_before_end(move(ri)); - } - } - return true; -} - -/** - * Builds a lookaround instruction, or an appropriate specialization if one is - * available. - */ -static -void makeLookaroundInstruction(build_context &bc, const vector &look, - RoseProgram &program) { - assert(!look.empty()); - - if (makeRoleByte(look, program)) { - return; - } - - if (look.size() == 1) { - s8 offset = look.begin()->offset; - u32 look_idx = addLookaround(bc, look); - auto ri = make_unique(offset, look_idx, - program.end_instruction()); - program.add_before_end(move(ri)); - return; - } - - if (makeRoleMask(look, program)) { - return; - } - - if (makeRoleMask32(look, program)) { - return; - } - - if (makeRoleShufti(look, program)) { - return; - } - - u32 look_idx = addLookaround(bc, look); - u32 look_count = verify_u32(look.size()); - - auto ri = make_unique(look_idx, look_count, - program.end_instruction()); - program.add_before_end(move(ri)); -} - -static -void makeRoleLookaround(RoseBuildImpl &build, build_context &bc, RoseVertex v, - RoseProgram &program) { - if (!build.cc.grey.roseLookaroundMasks) { - return; - } - - vector look; - - // Lookaround from leftfix (mandatory). - if (contains(bc.leftfix_info, v) && bc.leftfix_info.at(v).has_lookaround) { - DEBUG_PRINTF("using leftfix lookaround\n"); - look = bc.leftfix_info.at(v).lookaround; - } - - // We may be able to find more lookaround info (advisory) and merge it - // in. 
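`makeRoleLookaround()` above assembles a sequence of (relative offset, character class) checks around a literal match. A toy model of evaluating such a sequence, with function pointers standing in for `CharReach`:

```cpp
#include <cassert>
#include <cctype>
#include <cstddef>

struct Look {
    int offset;
    bool (*reach)(unsigned char);
};

static bool is_digit(unsigned char c) { return std::isdigit(c) != 0; }
static bool is_space(unsigned char c) { return std::isspace(c) != 0; }

int main() {
    const char *buf = "id 42;";
    const std::size_t match_end = 5; // literal "42" matched over [3, 5)
    const Look look[] = {{-3, is_space}, {-2, is_digit}, {-1, is_digit}};
    for (const Look &l : look) {
        assert(l.reach((unsigned char)buf[match_end + l.offset]));
    }
}
```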
- vector look_more; - findLookaroundMasks(build, v, look_more); - mergeLookaround(look, look_more); - - if (look.empty()) { - return; - } - - makeLookaroundInstruction(bc, look, program); -} - -static -void makeRoleCheckLeftfix(RoseBuildImpl &build, build_context &bc, RoseVertex v, - RoseProgram &program) { - auto it = bc.leftfix_info.find(v); - if (it == end(bc.leftfix_info)) { - return; - } - const left_build_info &lni = it->second; - if (lni.has_lookaround) { - return; // Leftfix completely implemented by lookaround. - } - - assert(!build.cc.streaming || - build.g[v].left.lag <= MAX_STORED_LEFTFIX_LAG); - - bool is_prefix = build.isRootSuccessor(v); - const auto *end_inst = program.end_instruction(); - - unique_ptr ri; - if (is_prefix) { - ri = make_unique(lni.queue, build.g[v].left.lag, - build.g[v].left.leftfix_report, - end_inst); - } else { - ri = make_unique(lni.queue, build.g[v].left.lag, - build.g[v].left.leftfix_report, - end_inst); - } - program.add_before_end(move(ri)); -} - -static -void makeRoleAnchoredDelay(RoseBuildImpl &build, build_context &bc, - RoseVertex v, RoseProgram &program) { - // Only relevant for roles that can be triggered by the anchored table. - if (!build.isAnchored(v)) { - return; - } - - // If this match cannot occur after floatingMinLiteralMatchOffset, we do - // not need this check. - if (build.g[v].max_offset <= bc.floatingMinLiteralMatchOffset) { - return; - } - - const auto *end_inst = program.end_instruction(); - auto ri = make_unique(build.g[v].groups, end_inst); - program.add_before_end(move(ri)); -} - -static -void makeDedupe(const RoseBuildImpl &build, const Report &report, - RoseProgram &program) { - const auto *end_inst = program.end_instruction(); - auto ri = - make_unique(report.quashSom, build.rm.getDkey(report), - report.offsetAdjust, end_inst); - program.add_before_end(move(ri)); -} - -static -void makeDedupeSom(const RoseBuildImpl &build, const Report &report, - RoseProgram &program) { - const auto *end_inst = program.end_instruction(); - auto ri = make_unique(report.quashSom, - build.rm.getDkey(report), - report.offsetAdjust, end_inst); - program.add_before_end(move(ri)); -} - -static -void makeCatchup(RoseBuildImpl &build, build_context &bc, - const flat_set &reports, RoseProgram &program) { - if (!bc.needs_catchup) { - return; - } - - // Everything except the INTERNAL_ROSE_CHAIN report needs catchup to run - // before reports are triggered. 
- - auto report_needs_catchup = [&](const ReportID &id) { - const Report &report = build.rm.getReport(id); - return report.type != INTERNAL_ROSE_CHAIN; - }; - - if (!any_of(begin(reports), end(reports), report_needs_catchup)) { - DEBUG_PRINTF("none of the given reports needs catchup\n"); - return; - } - - program.add_before_end(make_unique()); -} - -static -void makeCatchupMpv(RoseBuildImpl &build, build_context &bc, ReportID id, - RoseProgram &program) { - if (!bc.needs_mpv_catchup) { - return; - } - - const Report &report = build.rm.getReport(id); - if (report.type == INTERNAL_ROSE_CHAIN) { - return; - } - - program.add_before_end(make_unique()); -} - -static -void writeSomOperation(const Report &report, som_operation *op) { - assert(op); - - memset(op, 0, sizeof(*op)); - - switch (report.type) { - case EXTERNAL_CALLBACK_SOM_REL: - op->type = SOM_EXTERNAL_CALLBACK_REL; - break; - case INTERNAL_SOM_LOC_SET: - op->type = SOM_INTERNAL_LOC_SET; - break; - case INTERNAL_SOM_LOC_SET_IF_UNSET: - op->type = SOM_INTERNAL_LOC_SET_IF_UNSET; - break; - case INTERNAL_SOM_LOC_SET_IF_WRITABLE: - op->type = SOM_INTERNAL_LOC_SET_IF_WRITABLE; - break; - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA: - op->type = SOM_INTERNAL_LOC_SET_REV_NFA; - break; - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET: - op->type = SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET; - break; - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE: - op->type = SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE; - break; - case INTERNAL_SOM_LOC_COPY: - op->type = SOM_INTERNAL_LOC_COPY; - break; - case INTERNAL_SOM_LOC_COPY_IF_WRITABLE: - op->type = SOM_INTERNAL_LOC_COPY_IF_WRITABLE; - break; - case INTERNAL_SOM_LOC_MAKE_WRITABLE: - op->type = SOM_INTERNAL_LOC_MAKE_WRITABLE; - break; - case EXTERNAL_CALLBACK_SOM_STORED: - op->type = SOM_EXTERNAL_CALLBACK_STORED; - break; - case EXTERNAL_CALLBACK_SOM_ABS: - op->type = SOM_EXTERNAL_CALLBACK_ABS; - break; - case EXTERNAL_CALLBACK_SOM_REV_NFA: - op->type = SOM_EXTERNAL_CALLBACK_REV_NFA; - break; - case INTERNAL_SOM_LOC_SET_FROM: - op->type = SOM_INTERNAL_LOC_SET_FROM; - break; - case INTERNAL_SOM_LOC_SET_FROM_IF_WRITABLE: - op->type = SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE; - break; - default: - // This report doesn't correspond to a SOM operation. - assert(0); - throw CompileError("Unable to generate bytecode."); - } - - op->onmatch = report.onmatch; - - switch (report.type) { - case EXTERNAL_CALLBACK_SOM_REV_NFA: - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA: - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET: - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE: - op->aux.revNfaIndex = report.revNfaIndex; - break; - default: - op->aux.somDistance = report.somDistance; - break; - } -} - -static -void makeReport(RoseBuildImpl &build, const ReportID id, - const bool has_som, RoseProgram &program) { - assert(id < build.rm.numReports()); - const Report &report = build.rm.getReport(id); - - RoseProgram report_block; - const RoseInstruction *end_inst = report_block.end_instruction(); - - // Handle min/max offset checks. - if (report.minOffset > 0 || report.maxOffset < MAX_OFFSET) { - auto ri = make_unique(report.minOffset, - report.maxOffset, end_inst); - report_block.add_before_end(move(ri)); - } - - // If this report has an exhaustion key, we can check it in the program - // rather than waiting until we're in the callback adaptor. 
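// [Editor's aside: illustrative sketch, not part of the original patch.]
// Hoisting the exhaustion-key test into the report program means an
// exhausted report's whole block is skipped inside the interpreter rather
// than being filtered later in the callback adaptor. A sketch of the
// check, with illustrative stand-ins for the scratch state and
// INVALID_EKEY:
#include <cstdint>
#include <vector>

bool reportStillLive(const std::vector<bool> &exhausted, uint32_t ekey) {
    const uint32_t kInvalidEkey = ~0U; // stand-in for INVALID_EKEY
    return ekey == kInvalidEkey || !exhausted[ekey];
}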
- if (report.ekey != INVALID_EKEY) { - auto ri = make_unique(report.ekey, end_inst); - report_block.add_before_end(move(ri)); - } - - // External SOM reports that aren't passthrough need their SOM value - // calculated. - if (isExternalSomReport(report) && - report.type != EXTERNAL_CALLBACK_SOM_PASS) { - auto ri = make_unique(); - writeSomOperation(report, &ri->som); - report_block.add_before_end(move(ri)); - } - - // Min length constraint. - if (report.minLength > 0) { - assert(build.hasSom); - auto ri = make_unique( - report.offsetAdjust, report.minLength, end_inst); - report_block.add_before_end(move(ri)); - } - - if (report.quashSom) { - report_block.add_before_end(make_unique()); - } - - switch (report.type) { - case EXTERNAL_CALLBACK: - if (!has_som) { - // Dedupe is only necessary if this report has a dkey, or if there - // are SOM reports to catch up. - bool needs_dedupe = build.rm.getDkey(report) != ~0U || build.hasSom; - if (report.ekey == INVALID_EKEY) { - if (needs_dedupe) { - report_block.add_before_end( - make_unique( - report.quashSom, build.rm.getDkey(report), - report.onmatch, report.offsetAdjust, end_inst)); - } else { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); - } - } else { - if (needs_dedupe) { - makeDedupe(build, report, report_block); - } - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); - } - } else { // has_som - makeDedupeSom(build, report, report_block); - if (report.ekey == INVALID_EKEY) { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); - } else { - report_block.add_before_end( - make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); - } - } - break; - case INTERNAL_SOM_LOC_SET: - case INTERNAL_SOM_LOC_SET_IF_UNSET: - case INTERNAL_SOM_LOC_SET_IF_WRITABLE: - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA: - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET: - case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE: - case INTERNAL_SOM_LOC_COPY: - case INTERNAL_SOM_LOC_COPY_IF_WRITABLE: - case INTERNAL_SOM_LOC_MAKE_WRITABLE: - case INTERNAL_SOM_LOC_SET_FROM: - case INTERNAL_SOM_LOC_SET_FROM_IF_WRITABLE: - if (has_som) { - auto ri = make_unique(); - writeSomOperation(report, &ri->som); - report_block.add_before_end(move(ri)); - } else { - auto ri = make_unique(); - writeSomOperation(report, &ri->som); - report_block.add_before_end(move(ri)); - } - break; - case INTERNAL_ROSE_CHAIN: { - report_block.add_before_end(make_unique( - report.onmatch, report.topSquashDistance)); - break; - } - case EXTERNAL_CALLBACK_SOM_REL: - case EXTERNAL_CALLBACK_SOM_STORED: - case EXTERNAL_CALLBACK_SOM_ABS: - case EXTERNAL_CALLBACK_SOM_REV_NFA: - makeDedupeSom(build, report, report_block); - if (report.ekey == INVALID_EKEY) { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); - } else { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); - } - break; - case EXTERNAL_CALLBACK_SOM_PASS: - makeDedupeSom(build, report, report_block); - if (report.ekey == INVALID_EKEY) { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust)); - } else { - report_block.add_before_end(make_unique( - report.onmatch, report.offsetAdjust, report.ekey)); - } - break; - - default: - assert(0); - throw CompileError("Unable to generate bytecode."); - } - - assert(!report_block.empty()); - program.add_block(move(report_block)); -} - -static -void makeRoleReports(RoseBuildImpl &build, 
build_context &bc, RoseVertex v, - RoseProgram &program) { - const auto &g = build.g; - - /* we are a suffaig - need to update role to provide som to the - * suffix. */ - bool has_som = false; - if (g[v].left.tracksSom()) { - assert(contains(bc.leftfix_info, v)); - const left_build_info &lni = bc.leftfix_info.at(v); - program.add_before_end( - make_unique(lni.queue, g[v].left.lag)); - has_som = true; - } else if (g[v].som_adjust) { - program.add_before_end( - make_unique(g[v].som_adjust)); - has_som = true; - } - - const auto &reports = g[v].reports; - makeCatchup(build, bc, reports, program); - - RoseProgram report_block; - for (ReportID id : reports) { - makeReport(build, id, has_som, report_block); - } - program.add_before_end(move(report_block)); -} - -static -void makeRoleSuffix(RoseBuildImpl &build, build_context &bc, RoseVertex v, - RoseProgram &program) { - const auto &g = build.g; - if (!g[v].suffix) { - return; - } - assert(contains(bc.suffixes, g[v].suffix)); - u32 qi = bc.suffixes.at(g[v].suffix); - assert(contains(bc.engineOffsets, qi)); - const NFA *nfa = get_nfa_from_blob(bc, qi); - u32 suffixEvent; - if (isContainerType(nfa->type)) { - auto tamaProto = g[v].suffix.tamarama.get(); - assert(tamaProto); - u32 top = (u32)MQE_TOP_FIRST + - tamaProto->top_remap.at(make_pair(g[v].index, - g[v].suffix.top)); - assert(top < MQE_INVALID); - suffixEvent = top; - } else if (isMultiTopType(nfa->type)) { - assert(!g[v].suffix.haig); - u32 top = (u32)MQE_TOP_FIRST + g[v].suffix.top; - assert(top < MQE_INVALID); - suffixEvent = top; - } else { - // DFAs/Puffs have no MQE_TOP_N support, so they get a classic TOP - // event. - assert(!g[v].suffix.graph || onlyOneTop(*g[v].suffix.graph)); - suffixEvent = MQE_TOP; - } - program.add_before_end( - make_unique(qi, suffixEvent)); -} - -static -void makeRoleGroups(RoseBuildImpl &build, build_context &bc, RoseVertex v, - RoseProgram &program) { - const auto &g = build.g; - rose_group groups = g[v].groups; - if (!groups) { - return; - } - - // The set of "already on" groups as we process this vertex is the - // intersection of the groups set by our predecessors. - assert(in_degree(v, g) > 0); - rose_group already_on = ~rose_group{0}; - for (const auto &u : inv_adjacent_vertices_range(v, g)) { - already_on &= bc.vertex_group_map.at(u); - } - - DEBUG_PRINTF("already_on=0x%llx\n", already_on); - DEBUG_PRINTF("squashable=0x%llx\n", bc.squashable_groups); - DEBUG_PRINTF("groups=0x%llx\n", groups); - - already_on &= ~bc.squashable_groups; - DEBUG_PRINTF("squashed already_on=0x%llx\n", already_on); - - // We don't *have* to mask off the groups that we know are already on, but - // this will make bugs more apparent. - groups &= ~already_on; - - if (!groups) { - DEBUG_PRINTF("no new groups to set, skipping\n"); - return; - } - - program.add_before_end(make_unique(groups)); -} - -static -void makeRoleInfixTriggers(RoseBuildImpl &build, build_context &bc, - RoseVertex u, RoseProgram &program) { - const auto &g = build.g; - - vector infix_program; - - for (const auto &e : out_edges_range(u, g)) { - RoseVertex v = target(e, g); - if (!g[v].left) { - continue; - } - - assert(contains(bc.leftfix_info, v)); - const left_build_info &lbi = bc.leftfix_info.at(v); - if (lbi.has_lookaround) { - continue; - } - - const NFA *nfa = get_nfa_from_blob(bc, lbi.queue); - - // DFAs have no TOP_N support, so they get a classic MQE_TOP event. 
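// [Editor's aside: illustrative sketch, not part of the original patch.]
// makeRoleGroups() above only emits SET_GROUPS for groups that are not
// provably on already: a group is provably on iff every predecessor sets
// it and it cannot be squashed in between. Stand-alone model with u64
// bitmasks, as used by rose_group:
#include <cstdint>
#include <vector>

uint64_t groupsToSet(uint64_t groups,
                     const std::vector<uint64_t> &pred_groups,
                     uint64_t squashable) {
    uint64_t already_on = ~uint64_t{0};
    for (uint64_t pg : pred_groups) {
        already_on &= pg; // intersection over all predecessors
    }
    already_on &= ~squashable; // squashable groups may have gone off again
    return groups & ~already_on;
}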
- u32 top; - if (isContainerType(nfa->type)) { - auto tamaProto = g[v].left.tamarama.get(); - assert(tamaProto); - top = MQE_TOP_FIRST + tamaProto->top_remap.at( - make_pair(g[v].index, g[e].rose_top)); - assert(top < MQE_INVALID); - } else if (!isMultiTopType(nfa->type)) { - assert(num_tops(g[v].left) == 1); - top = MQE_TOP; - } else { - top = MQE_TOP_FIRST + g[e].rose_top; - assert(top < MQE_INVALID); - } - - infix_program.emplace_back(g[e].rose_cancel_prev_top, lbi.queue, top); - } - - if (infix_program.empty()) { - return; - } - - // Order, de-dupe and add instructions to the end of program. - sort(begin(infix_program), end(infix_program), - [](const RoseInstrTriggerInfix &a, const RoseInstrTriggerInfix &b) { - return tie(a.cancel, a.queue, a.event) < - tie(b.cancel, b.queue, b.event); - }); - infix_program.erase(unique(begin(infix_program), end(infix_program)), - end(infix_program)); - for (const auto &ri : infix_program) { - program.add_before_end(make_unique(ri)); - } -} - -static -void makeRoleSetState(const build_context &bc, RoseVertex v, - RoseProgram &program) { - // We only need this instruction if a state index has been assigned to this - // vertex. - auto it = bc.roleStateIndices.find(v); - if (it == end(bc.roleStateIndices)) { - return; - } - program.add_before_end(make_unique(it->second)); -} - -static -void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v, - const RoseEdge &e, RoseProgram &program) { - const RoseGraph &g = build.g; - const RoseVertex u = source(e, g); - - // We know that we can trust the anchored table (DFA) to always deliver us - // literals at the correct offset. - if (build.isAnchored(v)) { - DEBUG_PRINTF("literal in anchored table, skipping bounds check\n"); - return; - } - - // Use the minimum literal length. - u32 lit_length = g[v].eod_accept ? 0 : verify_u32(build.minLiteralLen(v)); - - u64a min_bound = g[e].minBound + lit_length; - u64a max_bound = g[e].maxBound == ROSE_BOUND_INF - ? ROSE_BOUND_INF - : g[e].maxBound + lit_length; - - if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { - assert(g[u].fixedOffset()); - // Make offsets absolute. - min_bound += g[u].max_offset; - if (max_bound != ROSE_BOUND_INF) { - max_bound += g[u].max_offset; - } - } - - assert(max_bound <= ROSE_BOUND_INF); - assert(min_bound <= max_bound); - - // CHECK_BOUNDS instruction uses 64-bit bounds, so we can use MAX_OFFSET - // (max value of a u64a) to represent ROSE_BOUND_INF. - if (max_bound == ROSE_BOUND_INF) { - max_bound = MAX_OFFSET; - } - - // This instruction should be doing _something_ -- bounds should be tighter - // than just {length, inf}. 
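// [Editor's aside: illustrative sketch, not part of the original patch.]
// The bounds arithmetic above in one place: edge bounds are relative to
// the end of the literal, so the minimum literal length is added, and for
// anchored history the fixed predecessor offset makes them absolute.
// ROSE_BOUND_INF becomes the maximum u64 offset, since CHECK_BOUNDS has
// no separate "unbounded" encoding. Constants are stand-ins:
#include <cstdint>
#include <utility>

std::pair<uint64_t, uint64_t>
absoluteBounds(uint32_t min_bound, uint32_t max_bound, uint32_t lit_len,
               uint64_t pred_offset, bool anchored_history) {
    const uint32_t kBoundInf = ~0U;    // stand-in for ROSE_BOUND_INF
    const uint64_t kMaxOffset = ~0ULL; // stand-in for MAX_OFFSET
    uint64_t lo = uint64_t{min_bound} + lit_len;
    uint64_t hi = max_bound == kBoundInf ? kMaxOffset
                                         : uint64_t{max_bound} + lit_len;
    if (anchored_history) {
        lo += pred_offset;
        if (hi != kMaxOffset) {
            hi += pred_offset;
        }
    }
    return {lo, hi};
}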
- assert(min_bound > lit_length || max_bound < MAX_OFFSET); - - const auto *end_inst = program.end_instruction(); - program.add_before_end( - make_unique(min_bound, max_bound, end_inst)); -} - -static -void makeRoleCheckNotHandled(build_context &bc, RoseVertex v, - RoseProgram &program) { - u32 handled_key; - if (contains(bc.handledKeys, v)) { - handled_key = bc.handledKeys.at(v); - } else { - handled_key = verify_u32(bc.handledKeys.size()); - bc.handledKeys.emplace(v, handled_key); - } - - const auto *end_inst = program.end_instruction(); - auto ri = make_unique(handled_key, end_inst); - program.add_before_end(move(ri)); -} - -static -void makeRoleEagerEodReports(RoseBuildImpl &build, build_context &bc, - RoseVertex v, RoseProgram &program) { - RoseProgram eod_program; - - for (const auto &e : out_edges_range(v, build.g)) { - if (canEagerlyReportAtEod(build, e)) { - RoseProgram block; - makeRoleReports(build, bc, target(e, build.g), block); - eod_program.add_block(move(block)); - } - } - - if (eod_program.empty()) { - return; - } - - if (!onlyAtEod(build, v)) { - // The rest of our program wasn't EOD anchored, so we need to guard - // these reports with a check. - const auto *end_inst = eod_program.end_instruction(); - eod_program.insert(begin(eod_program), - make_unique(end_inst)); - } - - program.add_before_end(move(eod_program)); -} - -static -RoseProgram makeProgram(RoseBuildImpl &build, build_context &bc, - const RoseEdge &e) { - const RoseGraph &g = build.g; - auto v = target(e, g); - - RoseProgram program; - - // First, add program instructions that enforce preconditions without - // effects. - - makeRoleAnchoredDelay(build, bc, v, program); - - if (onlyAtEod(build, v)) { - DEBUG_PRINTF("only at eod\n"); - const auto *end_inst = program.end_instruction(); - program.add_before_end(make_unique(end_inst)); - } - - if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { - makeRoleCheckBounds(build, v, e, program); - } - - // This program may be triggered by different predecessors, with different - // offset bounds. We must ensure we put this check/set operation after the - // bounds check to deal with this case. - if (in_degree(v, g) > 1) { - makeRoleCheckNotHandled(bc, v, program); - } - - makeRoleLookaround(build, bc, v, program); - makeRoleCheckLeftfix(build, bc, v, program); - - // Next, we can add program instructions that have effects. This must be - // done as a series of blocks, as some of them (like reports) are - // escapable. - - RoseProgram effects_block; - - RoseProgram reports_block; - makeRoleReports(build, bc, v, reports_block); - effects_block.add_block(move(reports_block)); - - RoseProgram infix_block; - makeRoleInfixTriggers(build, bc, v, infix_block); - effects_block.add_block(move(infix_block)); - - // Note: SET_GROUPS instruction must be after infix triggers, as an infix - // going dead may switch off groups. - RoseProgram groups_block; - makeRoleGroups(build, bc, v, groups_block); - effects_block.add_block(move(groups_block)); - - RoseProgram suffix_block; - makeRoleSuffix(build, bc, v, suffix_block); - effects_block.add_block(move(suffix_block)); - - RoseProgram state_block; - makeRoleSetState(bc, v, state_block); - effects_block.add_block(move(state_block)); - - // Note: EOD eager reports may generate a CHECK_ONLY_EOD instruction (if - // the program doesn't have one already). 
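// [Editor's aside: illustrative sketch, not part of the original patch.]
// The overall shape makeProgram() gives a role program: a chain of
// precondition checks that branch to END on failure, followed by effect
// blocks (reports, infix triggers, groups, suffix top, state). Simplified
// model that ignores the per-block escapes of the real interpreter:
#include <functional>
#include <vector>

void runRoleProgram(const std::vector<std::function<bool()>> &checks,
                    const std::vector<std::function<void()>> &effects) {
    for (const auto &check : checks) {
        if (!check()) {
            return; // CHECK_* failed: jump to END, role does not fire
        }
    }
    for (const auto &effect : effects) {
        effect();
    }
}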
- RoseProgram eod_block; - makeRoleEagerEodReports(build, bc, v, eod_block); - effects_block.add_block(move(eod_block)); - - program.add_before_end(move(effects_block)); - return program; -} - -static -u32 writeBoundaryProgram(RoseBuildImpl &build, build_context &bc, - const set &reports) { - if (reports.empty()) { - return 0; - } - - // Note: no CATCHUP instruction is necessary in the boundary case, as we - // should always be caught up (and may not even have the resources in - // scratch to support it). - - const bool has_som = false; - RoseProgram program; - for (const auto &id : reports) { - makeReport(build, id, has_som, program); - } - applyFinalSpecialisation(program); - return writeProgram(bc, move(program)); -} - -static -RoseBoundaryReports -makeBoundaryPrograms(RoseBuildImpl &build, build_context &bc, - const BoundaryReports &boundary, - const DerivedBoundaryReports &dboundary) { - RoseBoundaryReports out; - memset(&out, 0, sizeof(out)); - - DEBUG_PRINTF("report ^: %zu\n", boundary.report_at_0.size()); - DEBUG_PRINTF("report $: %zu\n", boundary.report_at_eod.size()); - DEBUG_PRINTF("report ^$: %zu\n", dboundary.report_at_0_eod_full.size()); - - out.reportEodOffset = - writeBoundaryProgram(build, bc, boundary.report_at_eod); - out.reportZeroOffset = - writeBoundaryProgram(build, bc, boundary.report_at_0); - out.reportZeroEodOffset = - writeBoundaryProgram(build, bc, dboundary.report_at_0_eod_full); - - return out; -} - -static -void assignStateIndices(const RoseBuildImpl &build, build_context &bc) { - const auto &g = build.g; +unordered_map assignStateIndices(const RoseBuildImpl &build) { + const auto &g = build.g; u32 state = 0; - + unordered_map roleStateIndices; for (auto v : vertices_range(g)) { // Virtual vertices (starts, EOD accept vertices) never need state // indices. @@ -3895,12 +2560,13 @@ void assignStateIndices(const RoseBuildImpl &build, build_context &bc) { } /* TODO: also don't need a state index if all edges are nfa based */ - bc.roleStateIndices.emplace(v, state++); + roleStateIndices.emplace(v, state++); } DEBUG_PRINTF("assigned %u states (from %zu vertices)\n", state, num_vertices(g)); - bc.numStates = state; + + return roleStateIndices; } static @@ -3915,10 +2581,9 @@ bool hasUsefulStops(const left_build_info &build) { static void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, - const set &eager_queues, - u32 leftfixBeginQueue, u32 leftfixCount, - vector &leftTable, u32 *laggedRoseCount, - size_t *history) { + const set &eager_queues, u32 leftfixBeginQueue, + u32 leftfixCount, vector &leftTable, + u32 *laggedRoseCount, size_t *history) { const RoseGraph &g = tbi.g; const CompileContext &cc = tbi.cc; @@ -3965,8 +2630,7 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, if (hasUsefulStops(lbi)) { assert(lbi.stopAlphabet.size() == N_CHARS); - left.stopTable = bc.engine_blob.add(lbi.stopAlphabet.begin(), - lbi.stopAlphabet.end()); + left.stopTable = bc.engine_blob.add_range(lbi.stopAlphabet); } assert(lbi.countingMiracleOffset || !lbi.countingMiracleCount); @@ -3985,11 +2649,11 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, } else { left.lagIndex = ROSE_OFFSET_INVALID; } - - DEBUG_PRINTF("rose %u is %s\n", left_index, - left.infix ? "infix" : "prefix"); } + DEBUG_PRINTF("rose %u is %s\n", left_index, + left.infix ? "infix" : "prefix"); + // Update squash mask. 
left.squash_mask &= lbi.squash_mask; @@ -4006,568 +2670,299 @@ void buildLeftInfoTable(const RoseBuildImpl &tbi, build_context &bc, } static -void addPredBlockSingle(u32 pred_state, RoseProgram &pred_block, - RoseProgram &program) { - // Prepend an instruction to check the pred state is on. - const auto *end_inst = pred_block.end_instruction(); - pred_block.insert(begin(pred_block), - make_unique(pred_state, end_inst)); - program.add_block(move(pred_block)); -} - -static -void addPredBlocksAny(build_context &bc, map &pred_blocks, - RoseProgram &program) { - RoseProgram sparse_program; - - vector keys; - for (const u32 &key : pred_blocks | map_keys) { - keys.push_back(key); - } - - const RoseInstruction *end_inst = sparse_program.end_instruction(); - auto ri = make_unique(bc.numStates, keys, end_inst); - sparse_program.add_before_end(move(ri)); - - RoseProgram &block = pred_blocks.begin()->second; - sparse_program.add_before_end(move(block)); - program.add_block(move(sparse_program)); -} - -static -void addPredBlocksMulti(build_context &bc, map &pred_blocks, - RoseProgram &program) { - assert(!pred_blocks.empty()); - - RoseProgram sparse_program; - const RoseInstruction *end_inst = sparse_program.end_instruction(); - vector> jump_table; - - // BEGIN instruction. - auto ri_begin = - make_unique(bc.numStates, end_inst); - RoseInstrSparseIterBegin *begin_inst = ri_begin.get(); - sparse_program.add_before_end(move(ri_begin)); - - // NEXT instructions, one per pred program. - u32 prev_key = pred_blocks.begin()->first; - for (auto it = next(begin(pred_blocks)); it != end(pred_blocks); ++it) { - auto ri = make_unique(prev_key, begin_inst, - end_inst); - sparse_program.add_before_end(move(ri)); - prev_key = it->first; - } - - // Splice in each pred program after its BEGIN/NEXT. - auto out_it = begin(sparse_program); - for (auto &m : pred_blocks) { - u32 key = m.first; - RoseProgram &flat_prog = m.second; - assert(!flat_prog.empty()); - const size_t block_len = flat_prog.size() - 1; // without INSTR_END. - - assert(dynamic_cast(out_it->get()) || - dynamic_cast(out_it->get())); - out_it = sparse_program.insert(++out_it, move(flat_prog)); - - // Jump table target for this key is the beginning of the block we just - // spliced in. - jump_table.emplace_back(key, out_it->get()); - - assert(distance(begin(sparse_program), out_it) + block_len <= - sparse_program.size()); - advance(out_it, block_len); - } - - // Write the jump table back into the SPARSE_ITER_BEGIN instruction. - begin_inst->jump_table = move(jump_table); - - program.add_block(move(sparse_program)); -} - -static -void addPredBlocks(build_context &bc, map &pred_blocks, - RoseProgram &program) { - // Trim empty blocks, if any exist. - for (auto it = pred_blocks.begin(); it != pred_blocks.end();) { - if (it->second.empty()) { - it = pred_blocks.erase(it); - } else { - ++it; - } - } - - const size_t num_preds = pred_blocks.size(); - if (num_preds == 0) { - return; - } - - if (num_preds == 1) { - const auto head = pred_blocks.begin(); - addPredBlockSingle(head->first, head->second, program); - return; - } - - // First, see if all our blocks are equivalent, in which case we can - // collapse them down into one. 
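// [Editor's aside: illustrative sketch, not part of the original patch.]
// The runtime effect of the SPARSE_ITER_BEGIN/NEXT pair built by
// addPredBlocksMulti() above: walk the "on" role states in key order and
// run the block spliced in for each matching predecessor. The container
// types stand in for the multibit iterator machinery:
#include <cstdint>
#include <functional>
#include <map>
#include <set>

void dispatchPredBlocks(
        const std::set<uint32_t> &on_states,
        const std::map<uint32_t, std::function<void()>> &blocks) {
    for (const auto &kv : blocks) { // keyed by predecessor state index
        if (on_states.count(kv.first)) {
            kv.second();
        }
    }
}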
- const auto &blocks = pred_blocks | map_values; - if (all_of(begin(blocks), end(blocks), [&](const RoseProgram &block) { - return RoseProgramEquivalence()(*begin(blocks), block); - })) { - DEBUG_PRINTF("all blocks equiv\n"); - addPredBlocksAny(bc, pred_blocks, program); - return; - } - - addPredBlocksMulti(bc, pred_blocks, program); -} - -static -void makePushDelayedInstructions(const RoseBuildImpl &build, u32 final_id, - RoseProgram &program) { - const auto &lit_infos = getLiteralInfoByFinalId(build, final_id); - const auto &arb_lit_info = **lit_infos.begin(); - if (arb_lit_info.delayed_ids.empty()) { - return; - } - - for (const auto &int_id : arb_lit_info.delayed_ids) { - const auto &child_literal = build.literals.right.at(int_id); - u32 child_id = build.literal_info[int_id].final_id; - u32 delay_index = child_id - build.delay_base_id; - - DEBUG_PRINTF("final_id=%u delay=%u child_id=%u\n", final_id, - child_literal.delay, child_id); - - auto ri = make_unique( - verify_u8(child_literal.delay), delay_index); - program.add_before_end(move(ri)); - } -} - -static -rose_group getFinalIdGroupsUnion(const RoseBuildImpl &build, u32 final_id) { - assert(contains(build.final_id_to_literal, final_id)); - const auto &lit_infos = getLiteralInfoByFinalId(build, final_id); - - rose_group groups = 0; - for (const auto &li : lit_infos) { - groups |= li->group_mask; - } - return groups; -} - -static -void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 final_id, - RoseProgram &program) { - rose_group groups = getFinalIdGroupsUnion(build, final_id); - if (!groups) { - return; - } - program.add_before_end(make_unique(groups)); -} - -static -void makeCheckLitMaskInstruction(const RoseBuildImpl &build, build_context &bc, - u32 final_id, RoseProgram &program) { - assert(contains(build.final_id_to_literal, final_id)); - const auto &lit_infos = getLiteralInfoByFinalId(build, final_id); - assert(!lit_infos.empty()); - - if (!lit_infos.front()->requires_benefits) { - return; - } - - vector look; - - assert(build.final_id_to_literal.at(final_id).size() == 1); - u32 lit_id = *build.final_id_to_literal.at(final_id).begin(); - const ue2_literal &s = build.literals.right.at(lit_id).s; - DEBUG_PRINTF("building mask for lit %u (final id %u) %s\n", lit_id, - final_id, dumpString(s).c_str()); - assert(s.length() <= MAX_MASK2_WIDTH); - s32 i = 0 - s.length(); - for (const auto &e : s) { - if (!e.nocase) { - look.emplace_back(verify_s8(i), e); - } - i++; - } - - assert(!look.empty()); - makeLookaroundInstruction(bc, look, program); -} - -static -void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 final_id, - RoseProgram &program) { - assert(contains(build.final_id_to_literal, final_id)); - const auto &lit_infos = getLiteralInfoByFinalId(build, final_id); - - if (!lit_infos.front()->squash_group) { - return; - } - - rose_group groups = getFinalIdGroupsUnion(build, final_id); - if (!groups) { - return; - } - - DEBUG_PRINTF("final_id %u squashes 0x%llx\n", final_id, groups); - program.add_before_end( - make_unique(~groups)); // Note negated. 
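// [Editor's aside: illustrative sketch, not part of the original patch.]
// Because makeGroupSquashInstruction() stores ~groups ("Note negated."
// above), the runtime can squash with a single AND:
#include <cstdint>

uint64_t squashGroups(uint64_t active_groups, uint64_t stored_mask) {
    return active_groups & stored_mask; // clears exactly the squashed groups
}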
-} - -static -u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) { - const auto &lit_vertices = build.literal_info.at(lit_id).vertices; - assert(!lit_vertices.empty()); +RoseProgram makeLiteralProgram(const RoseBuildImpl &build, build_context &bc, + ProgramBuild &prog_build, u32 lit_id, + const map> &lit_edge_map, + bool is_anchored_replay_program) { + const vector no_edges; - u32 max_offset = 0; - for (const auto &v : lit_vertices) { - max_offset = max(max_offset, build.g[v].max_offset); + DEBUG_PRINTF("lit_id=%u\n", lit_id); + const vector *edges_ptr; + if (contains(lit_edge_map, lit_id)) { + edges_ptr = &lit_edge_map.at(lit_id); + } else { + /* literal may happen only in a delay context */ + edges_ptr = &no_edges; } - return max_offset; + return makeLiteralProgram(build, bc.leftfix_info, bc.suffixes, + bc.engine_info_by_queue, + bc.roleStateIndices, prog_build, lit_id, + *edges_ptr, is_anchored_replay_program); } static -void makeRecordAnchoredInstruction(const RoseBuildImpl &build, - build_context &bc, u32 final_id, - RoseProgram &program) { - assert(contains(build.final_id_to_literal, final_id)); - const auto &lit_ids = build.final_id_to_literal.at(final_id); - - // Must be anchored. +RoseProgram makeFragmentProgram(const RoseBuildImpl &build, build_context &bc, + ProgramBuild &prog_build, + const vector &lit_ids, + const map> &lit_edge_map) { assert(!lit_ids.empty()); - if (build.literals.right.at(*begin(lit_ids)).table != ROSE_ANCHORED) { - return; - } - - // If this anchored literal can never match past - // floatingMinLiteralMatchOffset, we will never have to record it. - u32 max_offset = 0; - for (u32 lit_id : lit_ids) { - assert(build.literals.right.at(lit_id).table == ROSE_ANCHORED); - max_offset = max(max_offset, findMaxOffset(build, lit_id)); - } - - if (max_offset <= bc.floatingMinLiteralMatchOffset) { - return; - } - - program.add_before_end(make_unique(final_id)); -} - -static -u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { - const auto &lit_vertices = build.literal_info.at(lit_id).vertices; - assert(!lit_vertices.empty()); - u32 min_offset = UINT32_MAX; - for (const auto &v : lit_vertices) { - min_offset = min(min_offset, build.g[v].min_offset); + vector blocks; + for (const auto &lit_id : lit_ids) { + auto prog = makeLiteralProgram(build, bc, prog_build, lit_id, + lit_edge_map, false); + blocks.push_back(move(prog)); } - return min_offset; + return assembleProgramBlocks(move(blocks)); } +/** + * \brief Returns a map from literal ID to a list of edges leading into + * vertices with that literal ID. + */ static -void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, build_context &bc, - u32 final_id, - const vector &lit_edges, - RoseProgram &program) { - if (lit_edges.empty()) { - return; - } - - if (bc.floatingMinLiteralMatchOffset == 0) { - return; - } - - RoseVertex v = target(lit_edges.front(), build.g); - if (!build.isFloating(v)) { - return; - } - - const auto &lit_ids = build.final_id_to_literal.at(final_id); - if (lit_ids.empty()) { - return; - } +map> findEdgesByLiteral(const RoseBuildImpl &build) { + // Use a set of edges while building the map to cull duplicates. 
+ map> unique_lit_edge_map; - size_t min_len = SIZE_MAX; - u32 min_offset = UINT32_MAX; - for (u32 lit_id : lit_ids) { - const auto &lit = build.literals.right.at(lit_id); - size_t lit_min_len = lit.elength(); - u32 lit_min_offset = findMinOffset(build, lit_id); - DEBUG_PRINTF("lit_id=%u has min_len=%zu, min_offset=%u\n", lit_id, - lit_min_len, lit_min_offset); - min_len = min(min_len, lit_min_len); - min_offset = min(min_offset, lit_min_offset); + const auto &g = build.g; + for (const auto &e : edges_range(g)) { + const auto &v = target(e, g); + for (const auto &lit_id : g[v].literals) { + unique_lit_edge_map[lit_id].insert(e); + } } - DEBUG_PRINTF("final_id=%u has min_len=%zu, min_offset=%u, " - "global min is %u\n", final_id, min_len, min_offset, - bc.floatingMinLiteralMatchOffset); - - // If we can't match before the min offset, we don't need the check. - if (min_len >= bc.floatingMinLiteralMatchOffset) { - DEBUG_PRINTF("no need for check, min is %u\n", - bc.floatingMinLiteralMatchOffset); - return; + // Build output map, sorting edges by (source, target) vertex index. + map> lit_edge_map; + for (const auto &m : unique_lit_edge_map) { + auto edge_list = vector(begin(m.second), end(m.second)); + sort(begin(edge_list), end(edge_list), + [&g](const RoseEdge &a, const RoseEdge &b) { + return tie(g[source(a, g)].index, g[target(a, g)].index) < + tie(g[source(b, g)].index, g[target(b, g)].index); + }); + lit_edge_map.emplace(m.first, std::move(edge_list)); } - assert(min_offset >= bc.floatingMinLiteralMatchOffset); - assert(min_offset < UINT32_MAX); - - DEBUG_PRINTF("adding lit early check, min_offset=%u\n", min_offset); - program.add_before_end(make_unique(min_offset)); + return lit_edge_map; } static -void makeCheckLiteralInstruction(const RoseBuildImpl &build, - const build_context &bc, u32 final_id, - RoseProgram &program) { - const auto &lits = build.final_id_to_literal.at(final_id); - if (lits.size() != 1) { - // Long literals should not share a final_id. - assert(all_of(begin(lits), end(lits), [&](u32 lit_id) { - const rose_literal_id &lit = build.literals.right.at(lit_id); - return lit.table != ROSE_FLOATING || - lit.s.length() <= bc.longLitLengthThreshold; - })); - return; - } - - u32 lit_id = *lits.begin(); - if (build.isDelayed(lit_id)) { - return; - } - - const rose_literal_id &lit = build.literals.right.at(lit_id); - if (lit.table != ROSE_FLOATING) { - return; - } - assert(bc.longLitLengthThreshold > 0); - if (lit.s.length() <= bc.longLitLengthThreshold) { - return; - } - - // Check resource limits as well. 
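// [Editor's aside: illustrative sketch, not part of the original patch.]
// The fragment rule used by getFragment() above: a literal longer than
// the short-literal limit contributes only its trailing bytes to the
// matcher, and the full literal is confirmed afterwards by the Rose
// program. std::string stands in for ue2_literal; the default limit of 8
// matches the changelog's "literals of at most eight characters":
#include <cstddef>
#include <string>

std::string fragmentOf(const std::string &lit, size_t len_max = 8) {
    if (lit.size() <= len_max) {
        return lit; // whole literal fits in the matcher
    }
    return lit.substr(lit.size() - len_max); // trailing bytes only
}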
- if (lit.s.length() > build.cc.grey.limitLiteralLength) { - throw ResourceLimitError(); - } - - unique_ptr ri; - if (lit.s.any_nocase()) { - ri = make_unique(lit.s.get_string()); - } else { - ri = make_unique(lit.s.get_string()); +bool isUsedLiteral(const RoseBuildImpl &build, u32 lit_id) { + assert(lit_id < build.literal_info.size()); + const auto &info = build.literal_info[lit_id]; + if (!info.vertices.empty()) { + return true; } - program.add_before_end(move(ri)); -} -static -bool hasDelayedLiteral(RoseBuildImpl &build, - const vector &lit_edges) { - auto is_delayed = bind(&RoseBuildImpl::isDelayed, &build, _1); - for (const auto &e : lit_edges) { - auto v = target(e, build.g); - const auto &lits = build.g[v].literals; - if (any_of(begin(lits), end(lits), is_delayed)) { + for (const u32 &delayed_id : info.delayed_ids) { + assert(delayed_id < build.literal_info.size()); + const rose_literal_info &delayed_info = build.literal_info[delayed_id]; + if (!delayed_info.vertices.empty()) { return true; } } + + DEBUG_PRINTF("literal %u has no refs\n", lit_id); return false; } static -RoseProgram buildLitInitialProgram(RoseBuildImpl &build, build_context &bc, - u32 final_id, - const vector &lit_edges) { - RoseProgram program; - - // No initial program for EOD. - if (final_id == MO_INVALID_IDX) { - return program; +rose_literal_id getFragment(const rose_literal_id &lit) { + if (lit.s.length() <= ROSE_SHORT_LITERAL_LEN_MAX) { + DEBUG_PRINTF("whole lit is frag\n"); + return lit; } - DEBUG_PRINTF("final_id %u\n", final_id); + rose_literal_id frag = lit; + frag.s = frag.s.substr(frag.s.length() - ROSE_SHORT_LITERAL_LEN_MAX); - // Check long literal info. - makeCheckLiteralInstruction(build, bc, final_id, program); - - // Check lit mask. - makeCheckLitMaskInstruction(build, bc, final_id, program); - - // Check literal groups. This is an optimisation that we only perform for - // delayed literals, as their groups may be switched off; ordinarily, we - // can trust the HWLM matcher. - if (hasDelayedLiteral(build, lit_edges)) { - makeGroupCheckInstruction(build, final_id, program); - } - - // Add instructions for pushing delayed matches, if there are any. - makePushDelayedInstructions(build, final_id, program); - - // Add pre-check for early literals in the floating table. - makeCheckLitEarlyInstruction(build, bc, final_id, lit_edges, program); - - return program; + DEBUG_PRINTF("fragment: %s\n", dumpString(frag.s).c_str()); + return frag; } static -RoseProgram buildLiteralProgram(RoseBuildImpl &build, build_context &bc, - u32 final_id, - const vector &lit_edges) { - const auto &g = build.g; +vector groupByFragment(const RoseBuildImpl &build) { + vector fragments; + u32 frag_id = 0; - DEBUG_PRINTF("final id %u, %zu lit edges\n", final_id, lit_edges.size()); + struct FragmentInfo { + vector lit_ids; + rose_group groups = 0; + }; - RoseProgram program; + map frag_info; - // Predecessor state id -> program block. - map pred_blocks; - - // Construct sparse iter sub-programs. - for (const auto &e : lit_edges) { - const auto &u = source(e, g); - if (build.isAnyStart(u)) { - continue; // Root roles are not handled with sparse iterator. 
- } - DEBUG_PRINTF("sparse iter edge (%zu,%zu)\n", g[u].index, - g[target(e, g)].index); - assert(contains(bc.roleStateIndices, u)); - u32 pred_state = bc.roleStateIndices.at(u); - pred_blocks[pred_state].add_block(makeProgram(build, bc, e)); - } + for (u32 lit_id = 0; lit_id < build.literals.size(); lit_id++) { + const auto &lit = build.literals.at(lit_id); + const auto &info = build.literal_info.at(lit_id); - // Add blocks to deal with non-root edges (triggered by sparse iterator or - // mmbit_isset checks). - addPredBlocks(bc, pred_blocks, program); + if (!isUsedLiteral(build, lit_id)) { + DEBUG_PRINTF("lit %u is unused\n", lit_id); + continue; + } - // Add blocks to handle root roles. - for (const auto &e : lit_edges) { - const auto &u = source(e, g); - if (!build.isAnyStart(u)) { + if (lit.table == ROSE_EVENT) { + DEBUG_PRINTF("lit %u is an event\n", lit_id); continue; } - DEBUG_PRINTF("root edge (%zu,%zu)\n", g[u].index, - g[target(e, g)].index); - program.add_block(makeProgram(build, bc, e)); - } - if (final_id != MO_INVALID_IDX) { - RoseProgram root_block; + auto groups = info.group_mask; - // Literal may squash groups. - makeGroupSquashInstruction(build, final_id, root_block); + if (lit.s.length() < ROSE_SHORT_LITERAL_LEN_MAX) { + fragments.emplace_back(frag_id, groups, lit_id); + frag_id++; + continue; + } - // Literal may be anchored and need to be recorded. - makeRecordAnchoredInstruction(build, bc, final_id, root_block); + DEBUG_PRINTF("fragment candidate: lit_id=%u %s\n", lit_id, + dumpString(lit.s).c_str()); + auto &fi = frag_info[getFragment(lit)]; + fi.lit_ids.push_back(lit_id); + fi.groups |= groups; + } - program.add_block(move(root_block)); + for (auto &m : frag_info) { + auto &fi = m.second; + DEBUG_PRINTF("frag %s -> ids: %s\n", dumpString(m.first.s).c_str(), + as_string_list(fi.lit_ids).c_str()); + fragments.emplace_back(frag_id, fi.groups, move(fi.lit_ids)); + frag_id++; + assert(frag_id == fragments.size()); } - // Construct initial program up front, as its early checks must be able to - // jump to end and terminate processing for this literal. - auto lit_program = buildLitInitialProgram(build, bc, final_id, lit_edges); - lit_program.add_before_end(move(program)); - return lit_program; + return fragments; } +/** + * \brief Build the interpreter programs for each literal. + */ static -u32 writeLiteralProgram(RoseBuildImpl &build, build_context &bc, u32 final_id, - const vector &lit_edges) { - RoseProgram program = buildLiteralProgram(build, bc, final_id, lit_edges); - if (program.empty()) { - return 0; - } - applyFinalSpecialisation(program); - return writeProgram(bc, move(program)); -} +void buildLiteralPrograms(const RoseBuildImpl &build, + vector &fragments, build_context &bc, + ProgramBuild &prog_build) { + DEBUG_PRINTF("%zu fragments\n", fragments.size()); + auto lit_edge_map = findEdgesByLiteral(build); -static -u32 buildDelayRebuildProgram(RoseBuildImpl &build, build_context &bc, - u32 final_id) { - const auto &lit_infos = getLiteralInfoByFinalId(build, final_id); - const auto &arb_lit_info = **lit_infos.begin(); - if (arb_lit_info.delayed_ids.empty()) { - return 0; // No delayed IDs, no work to do. 
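// [Editor's aside: illustrative sketch, not part of the original patch.]
// writeDelayPrograms() and writeAnchoredPrograms() later in this change
// both use the same interning idiom: equal programs are written to equal
// bytecode offsets, so a map from offset to id collapses duplicates to a
// single delay/anchored id. Generic form:
#include <cstdint>
#include <unordered_map>
#include <vector>

uint32_t internProgram(uint32_t offset, std::vector<uint32_t> &programs,
                       std::unordered_map<uint32_t, uint32_t> &cache) {
    auto it = cache.find(offset);
    if (it != cache.end()) {
        return it->second; // reuse the id assigned to this offset
    }
    uint32_t id = static_cast<uint32_t>(programs.size());
    programs.push_back(offset);
    cache.emplace(offset, id);
    return id;
}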
- } + for (auto &frag : fragments) { + DEBUG_PRINTF("frag_id=%u, lit_ids=[%s]\n", frag.fragment_id, + as_string_list(frag.lit_ids).c_str()); - RoseProgram program; - makeCheckLitMaskInstruction(build, bc, final_id, program); - makePushDelayedInstructions(build, final_id, program); - assert(!program.empty()); - applyFinalSpecialisation(program); - return writeProgram(bc, move(program)); + auto lit_prog = makeFragmentProgram(build, bc, prog_build, frag.lit_ids, + lit_edge_map); + frag.lit_program_offset = writeProgram(bc, move(lit_prog)); + + // We only do delayed rebuild in streaming mode. + if (!build.cc.streaming) { + continue; + } + + auto rebuild_prog = makeDelayRebuildProgram(build, prog_build, + frag.lit_ids); + frag.delay_program_offset = writeProgram(bc, move(rebuild_prog)); + } } +/** + * \brief Write delay replay programs to the bytecode. + * + * Returns the offset of the beginning of the program array, and the number of + * programs. + */ static -map> findEdgesByLiteral(const RoseBuildImpl &build) { - // Use a set of edges while building the map to cull duplicates. - map> unique_lit_edge_map; +pair writeDelayPrograms(const RoseBuildImpl &build, + const vector &fragments, + build_context &bc, + ProgramBuild &prog_build) { + auto lit_edge_map = findEdgesByLiteral(build); - const auto &g = build.g; - for (const auto &e : edges_range(g)) { - const auto &v = target(e, g); - for (const auto &lit_id : g[v].literals) { - assert(lit_id < build.literal_info.size()); - u32 final_id = build.literal_info.at(lit_id).final_id; - if (final_id == MO_INVALID_IDX) { - // Unused, special report IDs are handled elsewhere. - continue; + vector programs; // program offsets indexed by (delayed) lit id + unordered_map cache; // program offsets we have already seen + + for (const auto &frag : fragments) { + for (const u32 lit_id : frag.lit_ids) { + const auto &info = build.literal_info.at(lit_id); + + for (const auto &delayed_lit_id : info.delayed_ids) { + DEBUG_PRINTF("lit id %u delay id %u\n", lit_id, delayed_lit_id); + auto prog = makeLiteralProgram(build, bc, prog_build, + delayed_lit_id, lit_edge_map, + false); + u32 offset = writeProgram(bc, move(prog)); + + u32 delay_id; + auto it = cache.find(offset); + if (it != end(cache)) { + delay_id = it->second; + DEBUG_PRINTF("reusing delay_id %u for offset %u\n", + delay_id, offset); + } else { + delay_id = verify_u32(programs.size()); + programs.push_back(offset); + cache.emplace(offset, delay_id); + DEBUG_PRINTF("assigned new delay_id %u for offset %u\n", + delay_id, offset); + } + prog_build.delay_programs.emplace(delayed_lit_id, delay_id); } - unique_lit_edge_map[final_id].insert(e); } } - // Build output map, sorting edges by (source, target) vertex index. - map> lit_edge_map; - for (const auto &m : unique_lit_edge_map) { - auto edge_list = vector(begin(m.second), end(m.second)); - sort(begin(edge_list), end(edge_list), - [&g](const RoseEdge &a, const RoseEdge &b) { - return tie(g[source(a, g)].index, g[target(a, g)].index) < - tie(g[source(b, g)].index, g[target(b, g)].index); - }); - lit_edge_map.emplace(m.first, edge_list); - } - - return lit_edge_map; + DEBUG_PRINTF("%zu delay programs\n", programs.size()); + return {bc.engine_blob.add_range(programs), verify_u32(programs.size())}; } /** - * \brief Build the interpreter programs for each literal. + * \brief Write anchored replay programs to the bytecode. * - * Returns the base of the literal program list and the base of the delay - * rebuild program list. 
+ * Returns the offset of the beginning of the program array, and the number of + * programs. */ static -pair buildLiteralPrograms(RoseBuildImpl &build, build_context &bc) { - const u32 num_literals = build.final_id_to_literal.size(); +pair writeAnchoredPrograms(const RoseBuildImpl &build, + const vector &fragments, + build_context &bc, + ProgramBuild &prog_build) { auto lit_edge_map = findEdgesByLiteral(build); - bc.litPrograms.resize(num_literals); - vector delayRebuildPrograms(num_literals); + vector programs; // program offsets indexed by anchored id + unordered_map cache; // program offsets we have already seen - for (u32 finalId = 0; finalId != num_literals; ++finalId) { - const auto &lit_edges = lit_edge_map[finalId]; + for (const auto &frag : fragments) { + for (const u32 lit_id : frag.lit_ids) { + const auto &lit = build.literals.at(lit_id); - bc.litPrograms[finalId] = - writeLiteralProgram(build, bc, finalId, lit_edges); - delayRebuildPrograms[finalId] = - buildDelayRebuildProgram(build, bc, finalId); - } + if (lit.table != ROSE_ANCHORED) { + continue; + } + + // If this anchored literal can never match past + // floatingMinLiteralMatchOffset, we will never have to record it. + if (findMaxOffset(build, lit_id) + <= prog_build.floatingMinLiteralMatchOffset) { + DEBUG_PRINTF("can never match after " + "floatingMinLiteralMatchOffset=%u\n", + prog_build.floatingMinLiteralMatchOffset); + continue; + } - u32 litProgramsOffset = - bc.engine_blob.add(begin(bc.litPrograms), end(bc.litPrograms)); - u32 delayRebuildProgramsOffset = bc.engine_blob.add( - begin(delayRebuildPrograms), end(delayRebuildPrograms)); + auto prog = makeLiteralProgram(build, bc, prog_build, lit_id, + lit_edge_map, true); + u32 offset = writeProgram(bc, move(prog)); + DEBUG_PRINTF("lit_id=%u -> anch prog at %u\n", lit_id, offset); + + u32 anch_id; + auto it = cache.find(offset); + if (it != end(cache)) { + anch_id = it->second; + DEBUG_PRINTF("reusing anch_id %u for offset %u\n", anch_id, + offset); + } else { + anch_id = verify_u32(programs.size()); + programs.push_back(offset); + cache.emplace(offset, anch_id); + DEBUG_PRINTF("assigned new anch_id %u for offset %u\n", anch_id, + offset); + } + prog_build.anchored_programs.emplace(lit_id, anch_id); + } + } - return {litProgramsOffset, delayRebuildProgramsOffset}; + DEBUG_PRINTF("%zu anchored programs\n", programs.size()); + return {bc.engine_blob.add_range(programs), verify_u32(programs.size())}; } /** @@ -4598,17 +2993,14 @@ set findEngineReports(const RoseBuildImpl &build) { } static -pair buildReportPrograms(RoseBuildImpl &build, build_context &bc) { +pair buildReportPrograms(const RoseBuildImpl &build, + build_context &bc) { const auto reports = findEngineReports(build); vector programs; programs.reserve(reports.size()); for (ReportID id : reports) { - RoseProgram program; - const bool has_som = false; - makeCatchupMpv(build, bc, id, program); - makeReport(build, id, has_som, program); - applyFinalSpecialisation(program); + auto program = makeReportProgram(build, bc.needs_mpv_catchup, id); u32 offset = writeProgram(bc, move(program)); programs.push_back(offset); build.rm.setProgramOffset(id, offset); @@ -4616,41 +3008,11 @@ pair buildReportPrograms(RoseBuildImpl &build, build_context &bc) { programs.back(), program.size()); } - u32 offset = bc.engine_blob.add(begin(programs), end(programs)); + u32 offset = bc.engine_blob.add_range(programs); u32 count = verify_u32(programs.size()); return {offset, count}; } -static -RoseProgram makeEodAnchorProgram(RoseBuildImpl 
&build, build_context &bc, - const RoseEdge &e, const bool multiple_preds) { - const RoseGraph &g = build.g; - const RoseVertex v = target(e, g); - - RoseProgram program; - - if (g[e].history == ROSE_ROLE_HISTORY_ANCH) { - makeRoleCheckBounds(build, v, e, program); - } - - if (multiple_preds) { - // Only necessary when there is more than one pred. - makeRoleCheckNotHandled(bc, v, program); - } - - const auto &reports = g[v].reports; - makeCatchup(build, bc, reports, program); - - const bool has_som = false; - RoseProgram report_block; - for (const auto &id : reports) { - makeReport(build, id, has_som, report_block); - } - program.add_before_end(move(report_block)); - - return program; -} - static bool hasEodAnchoredSuffix(const RoseBuildImpl &build) { const RoseGraph &g = build.g; @@ -4677,8 +3039,9 @@ bool hasEodMatcher(const RoseBuildImpl &build) { } static -void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, - bool in_etable, RoseProgram &program) { +void addEodAnchorProgram(const RoseBuildImpl &build, const build_context &bc, + ProgramBuild &prog_build, bool in_etable, + RoseProgram &program) { const RoseGraph &g = build.g; // Predecessor state id -> program block. @@ -4701,7 +3064,8 @@ void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, continue; } if (canEagerlyReportAtEod(build, e)) { - DEBUG_PRINTF("already done report for vertex %zu\n", g[u].index); + DEBUG_PRINTF("already done report for vertex %zu\n", + g[u].index); continue; } edge_list.push_back(e); @@ -4713,16 +3077,16 @@ void addEodAnchorProgram(RoseBuildImpl &build, build_context &bc, assert(contains(bc.roleStateIndices, u)); u32 pred_state = bc.roleStateIndices.at(u); pred_blocks[pred_state].add_block( - makeEodAnchorProgram(build, bc, e, multiple_preds)); + makeEodAnchorProgram(build, prog_build, e, multiple_preds)); } } - addPredBlocks(bc, pred_blocks, program); + addPredBlocks(pred_blocks, bc.roleStateIndices.size(), program); } static -void addEodEventProgram(RoseBuildImpl &build, build_context &bc, - RoseProgram &program) { +void addEodEventProgram(const RoseBuildImpl &build, build_context &bc, + ProgramBuild &prog_build, RoseProgram &program) { if (build.eod_event_literal_id == MO_INVALID_IDX) { return; } @@ -4748,61 +3112,31 @@ void addEodEventProgram(RoseBuildImpl &build, build_context &bc, tie(g[source(b, g)].index, g[target(b, g)].index); }); - program.add_block( - buildLiteralProgram(build, bc, MO_INVALID_IDX, edge_list)); -} - -static -void addEnginesEodProgram(u32 eodNfaIterOffset, RoseProgram &program) { - if (!eodNfaIterOffset) { - return; - } - - RoseProgram block; - block.add_before_end(make_unique(eodNfaIterOffset)); - program.add_block(move(block)); -} - -static -void addSuffixesEodProgram(const RoseBuildImpl &build, RoseProgram &program) { - if (!hasEodAnchoredSuffix(build)) { - return; - } - - RoseProgram block; - block.add_before_end(make_unique()); - program.add_block(move(block)); -} - -static -void addMatcherEodProgram(const RoseBuildImpl &build, RoseProgram &program) { - if (!hasEodMatcher(build)) { - return; - } - - RoseProgram block; - block.add_before_end(make_unique()); + auto block = makeLiteralProgram(build, bc.leftfix_info, bc.suffixes, + bc.engine_info_by_queue, + bc.roleStateIndices, prog_build, + build.eod_event_literal_id, edge_list, + false); program.add_block(move(block)); } static -u32 writeEodProgram(RoseBuildImpl &build, build_context &bc, - u32 eodNfaIterOffset) { +RoseProgram makeEodProgram(const RoseBuildImpl &build, build_context &bc, + ProgramBuild 
&prog_build, u32 eodNfaIterOffset) { RoseProgram program; - addEodEventProgram(build, bc, program); + addEodEventProgram(build, bc, prog_build, program); addEnginesEodProgram(eodNfaIterOffset, program); - addEodAnchorProgram(build, bc, false, program); - addMatcherEodProgram(build, program); - addEodAnchorProgram(build, bc, true, program); - addSuffixesEodProgram(build, program); - - if (program.empty()) { - return 0; + addEodAnchorProgram(build, bc, prog_build, false, program); + if (hasEodMatcher(build)) { + addMatcherEodProgram(program); + } + addEodAnchorProgram(build, bc, prog_build, true, program); + if (hasEodAnchoredSuffix(build)) { + addSuffixesEodProgram(program); } - applyFinalSpecialisation(program); - return writeProgram(bc, move(program)); + return program; } static @@ -4834,7 +3168,7 @@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { assert(g[v].min_offset <= g[v].max_offset); for (u32 lit_id : g[v].literals) { - const rose_literal_id &key = build.literals.right.at(lit_id); + const rose_literal_id &key = build.literals.at(lit_id); u32 max_d = g[v].max_offset; u32 min_d = g[v].min_offset; @@ -4907,9 +3241,8 @@ void fillMatcherDistances(const RoseBuildImpl &build, RoseEngine *engine) { } static -u32 buildEagerQueueIter(const set &eager, u32 leftfixBeginQueue, - u32 queue_count, - build_context &bc) { +u32 writeEagerQueueIter(const set &eager, u32 leftfixBeginQueue, + u32 queue_count, RoseEngineBlob &engine_blob) { if (eager.empty()) { return 0; } @@ -4920,182 +3253,13 @@ u32 buildEagerQueueIter(const set &eager, u32 leftfixBeginQueue, vec.push_back(q - leftfixBeginQueue); } - vector iter; - mmbBuildSparseIterator(iter, vec, queue_count - leftfixBeginQueue); - return bc.engine_blob.add_iterator(iter); -} - -static -void allocateFinalIdToSet(RoseBuildImpl &build, const set &lits, - size_t longLitLengthThreshold, u32 *next_final_id) { - const auto &g = build.g; - auto &literal_info = build.literal_info; - auto &final_id_to_literal = build.final_id_to_literal; - - /* We can allocate the same final id to multiple literals of the same type - * if they share the same vertex set and trigger the same delayed literal - * ids and squash the same roles and have the same group squashing - * behaviour. Benefits literals cannot be merged. */ - - assert(longLitLengthThreshold > 0); - - for (u32 int_id : lits) { - rose_literal_info &curr_info = literal_info[int_id]; - const rose_literal_id &lit = build.literals.right.at(int_id); - const auto &verts = curr_info.vertices; - - // Literals with benefits cannot be merged. - if (curr_info.requires_benefits) { - DEBUG_PRINTF("id %u has benefits\n", int_id); - goto assign_new_id; - } - - // Long literals (that require CHECK_LITERAL instructions) cannot be - // merged. 
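// [Editor's aside: illustrative sketch, not part of the original patch.]
// writeEagerQueueIter() above rebases queue indices against
// leftfixBeginQueue so the sparse iterator only has to span the leftfix
// queue range rather than every queue in the engine:
#include <cstdint>
#include <set>
#include <vector>

std::vector<uint32_t> relativeQueues(const std::set<uint32_t> &eager,
                                     uint32_t leftfix_begin_queue) {
    std::vector<uint32_t> vec;
    for (uint32_t q : eager) {
        vec.push_back(q - leftfix_begin_queue); // rebased for the iterator
    }
    return vec;
}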
- if (lit.s.length() > longLitLengthThreshold) { - DEBUG_PRINTF("id %u is a long literal\n", int_id); - goto assign_new_id; - } - - if (!verts.empty() && curr_info.delayed_ids.empty()) { - vector cand; - insert(&cand, cand.end(), g[*verts.begin()].literals); - for (auto v : verts) { - vector temp; - set_intersection(cand.begin(), cand.end(), - g[v].literals.begin(), - g[v].literals.end(), - inserter(temp, temp.end())); - cand.swap(temp); - } - - for (u32 cand_id : cand) { - if (cand_id >= int_id) { - break; - } - - const auto &cand_info = literal_info[cand_id]; - const auto &cand_lit = build.literals.right.at(cand_id); - - if (cand_lit.s.length() > longLitLengthThreshold) { - continue; - } - - if (cand_info.requires_benefits) { - continue; - } - - if (!cand_info.delayed_ids.empty()) { - /* TODO: allow cases where delayed ids are equivalent. - * This is awkward currently as the have not had their - * final ids allocated yet */ - continue; - } - - if (lits.find(cand_id) == lits.end() - || cand_info.vertices.size() != verts.size() - || cand_info.squash_group != curr_info.squash_group) { - continue; - } - - /* if we are squashing groups we need to check if they are the - * same group */ - if (cand_info.squash_group - && cand_info.group_mask != curr_info.group_mask) { - continue; - } - - u32 final_id = cand_info.final_id; - assert(final_id != MO_INVALID_IDX); - assert(curr_info.final_id == MO_INVALID_IDX); - curr_info.final_id = final_id; - final_id_to_literal[final_id].insert(int_id); - goto next_lit; - } - } - - assign_new_id: - /* oh well, have to give it a fresh one, hang the expense */ - DEBUG_PRINTF("allocating final id %u to %u\n", *next_final_id, int_id); - assert(curr_info.final_id == MO_INVALID_IDX); - curr_info.final_id = *next_final_id; - final_id_to_literal[*next_final_id].insert(int_id); - (*next_final_id)++; - next_lit:; - } -} - -static -bool isUsedLiteral(const RoseBuildImpl &build, u32 lit_id) { - assert(lit_id < build.literal_info.size()); - const auto &info = build.literal_info[lit_id]; - if (!info.vertices.empty()) { - return true; - } - - for (const u32 &delayed_id : info.delayed_ids) { - assert(delayed_id < build.literal_info.size()); - const rose_literal_info &delayed_info = build.literal_info[delayed_id]; - if (!delayed_info.vertices.empty()) { - return true; - } - } - - DEBUG_PRINTF("literal %u has no refs\n", lit_id); - return false; -} - -/** \brief Allocate final literal IDs for all literals. */ -static -void allocateFinalLiteralId(RoseBuildImpl &build, - size_t longLitLengthThreshold) { - set anch; - set norm; - set delay; - - /* undelayed ids come first */ - assert(build.final_id_to_literal.empty()); - u32 next_final_id = 0; - for (u32 i = 0; i < build.literal_info.size(); i++) { - assert(!build.hasFinalId(i)); - - if (!isUsedLiteral(build, i)) { - /* what is this literal good for? absolutely nothing */ - continue; - } - - // The special EOD event literal has its own program and does not need - // a real literal ID. 
- if (i == build.eod_event_literal_id) { - assert(build.eod_event_literal_id != MO_INVALID_IDX); - continue; - } - - if (build.isDelayed(i)) { - assert(!build.literal_info[i].requires_benefits); - delay.insert(i); - } else if (build.literals.right.at(i).table == ROSE_ANCHORED) { - anch.insert(i); - } else { - norm.insert(i); - } - } - - /* normal lits */ - allocateFinalIdToSet(build, norm, longLitLengthThreshold, &next_final_id); - - /* next anchored stuff */ - build.anchored_base_id = next_final_id; - allocateFinalIdToSet(build, anch, longLitLengthThreshold, &next_final_id); - - /* delayed ids come last */ - build.delay_base_id = next_final_id; - allocateFinalIdToSet(build, delay, longLitLengthThreshold, &next_final_id); + auto iter = mmbBuildSparseIterator(vec, queue_count - leftfixBeginQueue); + return engine_blob.add_iterator(iter); } static -aligned_unique_ptr addSmallWriteEngine(RoseBuildImpl &build, - aligned_unique_ptr rose) { +bytecode_ptr addSmallWriteEngine(const RoseBuildImpl &build, + bytecode_ptr rose) { assert(rose); if (roseIsPureLiteral(rose.get())) { @@ -5110,14 +3274,14 @@ aligned_unique_ptr addSmallWriteEngine(RoseBuildImpl &build, return rose; } - const size_t mainSize = roseSize(rose.get()); - const size_t smallWriteSize = smwrSize(smwr_engine.get()); + const size_t mainSize = rose.size(); + const size_t smallWriteSize = smwr_engine.size(); DEBUG_PRINTF("adding smwr engine, size=%zu\n", smallWriteSize); const size_t smwrOffset = ROUNDUP_CL(mainSize); const size_t newSize = smwrOffset + smallWriteSize; - auto rose2 = aligned_zmalloc_unique(newSize); + auto rose2 = make_zeroed_bytecode_ptr(newSize, 64); char *ptr = (char *)rose2.get(); memcpy(ptr, rose.get(), mainSize); memcpy(ptr + smwrOffset, smwr_engine.get(), smallWriteSize); @@ -5137,9 +3301,8 @@ pair floatingCountAndMaxLen(const RoseBuildImpl &build) { size_t num = 0; size_t max_len = 0; - for (const auto &e : build.literals.right) { - const u32 id = e.first; - const rose_literal_id &lit = e.second; + for (u32 id = 0; id < build.literals.size(); id++) { + const rose_literal_id &lit = build.literals.at(id); if (lit.table != ROSE_FLOATING) { continue; @@ -5164,10 +3327,11 @@ size_t calcLongLitThreshold(const RoseBuildImpl &build, const size_t historyRequired) { const auto &cc = build.cc; - // In block mode, we should only use the long literal support for literals - // that cannot be handled by HWLM. + // In block mode, we don't have history, so we don't need long literal + // support and can just use "medium-length" literal confirm. TODO: we could + // specialize further and have a block mode literal confirm instruction. if (!cc.streaming) { - return HWLM_LITERAL_MAX_LEN; + return SIZE_MAX; } size_t longLitLengthThreshold = ROSE_LONG_LITERAL_THRESHOLD_MIN; @@ -5195,7 +3359,40 @@ size_t calcLongLitThreshold(const RoseBuildImpl &build, return longLitLengthThreshold; } -aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { +static +map makeLeftQueueMap(const RoseGraph &g, + const map &leftfix_info) { + map lqm; + for (const auto &e : leftfix_info) { + if (e.second.has_lookaround) { + continue; + } + DEBUG_PRINTF("%zu: using queue %u\n", g[e.first].index, e.second.queue); + assert(e.second.queue != INVALID_QUEUE); + left_id left(g[e.first].left); + assert(!contains(lqm, left) || lqm[left] == e.second.queue); + lqm[left] = e.second.queue; + } + + return lqm; +} + +bytecode_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { + // We keep all our offsets, counts etc. 
in a prototype RoseEngine which we + // will copy into the real one once it is allocated: we can't do this + // until we know how big it will be. + RoseEngine proto; + memset(&proto, 0, sizeof(proto)); + + // Set scanning mode. + if (!cc.streaming) { + proto.mode = HS_MODE_BLOCK; + } else if (cc.vectored) { + proto.mode = HS_MODE_VECTORED; + } else { + proto.mode = HS_MODE_STREAM; + } + DerivedBoundaryReports dboundary(boundary); size_t historyRequired = calcHistoryRequired(); // Updated by HWLM. @@ -5203,49 +3400,43 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { historyRequired); DEBUG_PRINTF("longLitLengthThreshold=%zu\n", longLitLengthThreshold); - allocateFinalLiteralId(*this, longLitLengthThreshold); + vector fragments = groupByFragment(*this); - auto anchored_dfas = buildAnchoredDfas(*this); + auto anchored_dfas = buildAnchoredDfas(*this, fragments); build_context bc; - bc.floatingMinLiteralMatchOffset = - findMinFloatingLiteralMatch(*this, anchored_dfas); - bc.longLitLengthThreshold = longLitLengthThreshold; - bc.needs_catchup = needsCatchup(*this, anchored_dfas); - recordResources(bc.resources, *this); + u32 floatingMinLiteralMatchOffset + = findMinFloatingLiteralMatch(*this, anchored_dfas); + recordResources(bc.resources, *this, fragments); if (!anchored_dfas.empty()) { bc.resources.has_anchored = true; } bc.needs_mpv_catchup = needsMpvCatchup(*this); - bc.vertex_group_map = getVertexGroupMap(*this); - bc.squashable_groups = getSquashableGroups(*this); - auto boundary_out = makeBoundaryPrograms(*this, bc, boundary, dboundary); + makeBoundaryPrograms(*this, bc, boundary, dboundary, proto.boundary); - u32 reportProgramOffset; - u32 reportProgramCount; - tie(reportProgramOffset, reportProgramCount) = + tie(proto.reportProgramOffset, proto.reportProgramCount) = buildReportPrograms(*this, bc); // Build NFAs - set no_retrigger_queues; bool mpv_as_outfix; prepMpv(*this, bc, &historyRequired, &mpv_as_outfix); - u32 outfixBeginQueue = qif.allocated_count(); + proto.outfixBeginQueue = qif.allocated_count(); if (!prepOutfixes(*this, bc, &historyRequired)) { return nullptr; } - u32 outfixEndQueue = qif.allocated_count(); - u32 leftfixBeginQueue = outfixEndQueue; + proto.outfixEndQueue = qif.allocated_count(); + proto.leftfixBeginQueue = proto.outfixEndQueue; + set no_retrigger_queues; set eager_queues; /* Note: buildNfas may reduce the lag for vertices that have prefixes */ if (!buildNfas(*this, bc, qif, &no_retrigger_queues, &eager_queues, - &leftfixBeginQueue)) { + &proto.leftfixBeginQueue)) { return nullptr; } - u32 eodNfaIterOffset = buildEodNfaIterator(bc, leftfixBeginQueue); + u32 eodNfaIterOffset = buildEodNfaIterator(bc, proto.leftfixBeginQueue); buildCountingMiracles(bc); u32 queue_count = qif.allocated_count(); /* excludes anchored matcher q; @@ -5254,127 +3445,88 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { throw ResourceLimitError(); } - vector suffixEkeyLists; - buildSuffixEkeyLists(*this, bc, qif, &suffixEkeyLists); + // Enforce role table resource limit. 
+ if (num_vertices(g) > cc.grey.limitRoseRoleCount) { + throw ResourceLimitError(); + } - assignStateIndices(*this, bc); + bc.roleStateIndices = assignStateIndices(*this); u32 laggedRoseCount = 0; vector leftInfoTable; - buildLeftInfoTable(*this, bc, eager_queues, leftfixBeginQueue, - queue_count - leftfixBeginQueue, leftInfoTable, + buildLeftInfoTable(*this, bc, eager_queues, proto.leftfixBeginQueue, + queue_count - proto.leftfixBeginQueue, leftInfoTable, &laggedRoseCount, &historyRequired); - u32 litProgramOffset; - u32 litDelayRebuildProgramOffset; - tie(litProgramOffset, litDelayRebuildProgramOffset) = - buildLiteralPrograms(*this, bc); + // Information only needed for program construction. + ProgramBuild prog_build(floatingMinLiteralMatchOffset, + longLitLengthThreshold, needsCatchup(*this)); + prog_build.vertex_group_map = getVertexGroupMap(*this); + prog_build.squashable_groups = getSquashableGroups(*this); - u32 eodProgramOffset = writeEodProgram(*this, bc, eodNfaIterOffset); + tie(proto.anchoredProgramOffset, proto.anchored_count) = + writeAnchoredPrograms(*this, fragments, bc, prog_build); - size_t longLitStreamStateRequired = 0; - u32 longLitTableOffset = buildLongLiteralTable(*this, bc.engine_blob, - bc.longLiterals, longLitLengthThreshold, &historyRequired, - &longLitStreamStateRequired); + tie(proto.delayProgramOffset, proto.delay_count) = + writeDelayPrograms(*this, fragments, bc, prog_build); - vector activeLeftIter; - buildActiveLeftIter(leftInfoTable, activeLeftIter); + buildLiteralPrograms(*this, fragments, bc, prog_build); - u32 lastByteOffset = buildLastByteIter(g, bc); - u32 eagerIterOffset = buildEagerQueueIter(eager_queues, leftfixBeginQueue, - queue_count, bc); + auto eod_prog = makeEodProgram(*this, bc, prog_build, eodNfaIterOffset); + proto.eodProgramOffset = writeProgram(bc, move(eod_prog)); - // Enforce role table resource limit. - if (num_vertices(g) > cc.grey.limitRoseRoleCount) { - throw ResourceLimitError(); - } + size_t longLitStreamStateRequired = 0; + proto.longLitTableOffset + = buildLongLiteralTable(*this, bc.engine_blob, bc.longLiterals, + longLitLengthThreshold, &historyRequired, + &longLitStreamStateRequired); - u32 currOffset; /* relative to base of RoseEngine */ - if (!bc.engine_blob.empty()) { - currOffset = bc.engine_blob.base_offset + bc.engine_blob.size(); - } else { - currOffset = sizeof(RoseEngine); - } + proto.lastByteHistoryIterOffset = buildLastByteIter(g, bc); + proto.eagerIterOffset = writeEagerQueueIter( + eager_queues, proto.leftfixBeginQueue, queue_count, bc.engine_blob); - UNUSED const size_t engineBlobSize = bc.engine_blob.size(); // test later + addSomRevNfas(bc, proto, ssm); - currOffset = ROUNDUP_CL(currOffset); - DEBUG_PRINTF("currOffset %u\n", currOffset); + writeDkeyInfo(rm, bc.engine_blob, proto); + writeLeftInfo(bc.engine_blob, proto, leftInfoTable); // Build anchored matcher. - size_t asize = 0; - u32 amatcherOffset = 0; - auto atable = buildAnchoredMatcher(*this, anchored_dfas, bc.litPrograms, - &asize); + auto atable = buildAnchoredMatcher(*this, fragments, anchored_dfas); if (atable) { - currOffset = ROUNDUP_CL(currOffset); - amatcherOffset = currOffset; - currOffset += verify_u32(asize); + proto.amatcherOffset = bc.engine_blob.add(atable); } // Build floating HWLM matcher. 
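A pattern worth noting in this hunk: rather than tracking currOffset by hand, each matcher table is handed to bc.engine_blob.add(), which returns the offset that is then recorded in the proto header (amatcherOffset and friends; the floating matcher built next is treated the same way). A minimal sketch of such an offset-returning blob, assuming power-of-two alignment (EngineBlob is illustrative, not the real RoseEngineBlob API):

    #include <cassert>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    // add() appends aligned bytes and returns the offset they will occupy in
    // the final engine image; write_bytes() copies them into place later.
    class EngineBlob {
    public:
        explicit EngineBlob(size_t base) : base_offset(base) {}

        size_t add(const void *data, size_t len, size_t align) {
            assert(align && (align & (align - 1)) == 0); // power of two
            size_t pos = base_offset + bytes.size();
            size_t padded = (pos + align - 1) & ~(align - 1);
            bytes.resize(bytes.size() + (padded - pos), 0); // zero padding
            bytes.insert(bytes.end(), (const char *)data,
                         (const char *)data + len);
            return padded; // offset relative to the engine base
        }

        // Copy the accumulated bytes into the allocated engine image.
        void write_bytes(char *engine_base) const {
            std::memcpy(engine_base + base_offset, bytes.data(), bytes.size());
        }

        size_t size() const { return bytes.size(); }
        const size_t base_offset;

    private:
        std::vector<char> bytes;
    };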
rose_group fgroups = 0; - size_t fsize = 0; - auto ftable = buildFloatingMatcher(*this, bc.longLitLengthThreshold, - &fgroups, &fsize, &historyRequired); - u32 fmatcherOffset = 0; + auto ftable = buildFloatingMatcher(*this, fragments, longLitLengthThreshold, + &fgroups, &historyRequired); if (ftable) { - currOffset = ROUNDUP_CL(currOffset); - fmatcherOffset = currOffset; - currOffset += verify_u32(fsize); + proto.fmatcherOffset = bc.engine_blob.add(ftable); + bc.resources.has_floating = true; + } + + // Build delay rebuild HWLM matcher. + auto drtable = buildDelayRebuildMatcher(*this, fragments, + longLitLengthThreshold); + if (drtable) { + proto.drmatcherOffset = bc.engine_blob.add(drtable); } // Build EOD-anchored HWLM matcher. - size_t esize = 0; - auto etable = buildEodAnchoredMatcher(*this, &esize); - u32 ematcherOffset = 0; + auto etable = buildEodAnchoredMatcher(*this, fragments); if (etable) { - currOffset = ROUNDUP_CL(currOffset); - ematcherOffset = currOffset; - currOffset += verify_u32(esize); + proto.ematcherOffset = bc.engine_blob.add(etable); } // Build small-block HWLM matcher. - size_t sbsize = 0; - auto sbtable = buildSmallBlockMatcher(*this, &sbsize); - u32 sbmatcherOffset = 0; + auto sbtable = buildSmallBlockMatcher(*this, fragments); if (sbtable) { - currOffset = ROUNDUP_CL(currOffset); - sbmatcherOffset = currOffset; - currOffset += verify_u32(sbsize); + proto.sbmatcherOffset = bc.engine_blob.add(sbtable); } - u32 leftOffset = ROUNDUP_N(currOffset, alignof(LeftNfaInfo)); - u32 roseLen = sizeof(LeftNfaInfo) * leftInfoTable.size(); - currOffset = leftOffset + roseLen; - - u32 lookaroundReachOffset = currOffset; - u32 lookaroundReachLen = REACH_BITVECTOR_LEN * bc.lookaround.size(); - currOffset = lookaroundReachOffset + lookaroundReachLen; - - u32 lookaroundTableOffset = currOffset; - u32 lookaroundTableLen = sizeof(s8) * bc.lookaround.size(); - currOffset = lookaroundTableOffset + lookaroundTableLen; - - u32 nfaInfoOffset = ROUNDUP_N(currOffset, sizeof(u32)); - u32 nfaInfoLen = sizeof(NfaInfo) * queue_count; - currOffset = nfaInfoOffset + nfaInfoLen; + proto.activeArrayCount = proto.leftfixBeginQueue; - currOffset = ROUNDUP_N(currOffset, alignof(mmbit_sparse_iter)); - u32 activeLeftIterOffset = currOffset; - currOffset += activeLeftIter.size() * sizeof(mmbit_sparse_iter); - - u32 activeArrayCount = leftfixBeginQueue; - u32 activeLeftCount = leftInfoTable.size(); - u32 rosePrefixCount = countRosePrefixes(leftInfoTable); - - u32 rev_nfa_table_offset; - vector rev_nfa_offsets; - prepSomRevNfas(ssm, &rev_nfa_table_offset, &rev_nfa_offsets, &currOffset); - - // Build engine header and copy tables into place. - - u32 anchorStateSize = atable ? anchoredStateSize(*atable) : 0; + proto.anchorStateSize = atable ? 
anchoredStateSize(*atable) : 0; DEBUG_PRINTF("rose history required %zu\n", historyRequired); assert(!cc.streaming || historyRequired <= cc.grey.maxHistoryAvailable); @@ -5385,192 +3537,112 @@ aligned_unique_ptr RoseBuildImpl::buildFinalEngine(u32 minWidth) { assert(!cc.streaming || historyRequired <= max(cc.grey.maxHistoryAvailable, cc.grey.somMaxRevNfaLength)); - RoseStateOffsets stateOffsets; - memset(&stateOffsets, 0, sizeof(stateOffsets)); - fillStateOffsets(*this, bc.numStates, anchorStateSize, - activeArrayCount, activeLeftCount, laggedRoseCount, - longLitStreamStateRequired, historyRequired, - &stateOffsets); - - scatter_plan_raw state_scatter; - buildStateScatterPlan(sizeof(u8), bc.numStates, - activeLeftCount, rosePrefixCount, stateOffsets, - cc.streaming, activeArrayCount, outfixBeginQueue, - outfixEndQueue, &state_scatter); - - currOffset = ROUNDUP_N(currOffset, alignof(scatter_unit_u64a)); - - u32 state_scatter_aux_offset = currOffset; - currOffset += aux_size(state_scatter); + fillStateOffsets(*this, bc.roleStateIndices.size(), proto.anchorStateSize, + proto.activeArrayCount, proto.activeLeftCount, + laggedRoseCount, longLitStreamStateRequired, + historyRequired, &proto.stateOffsets); - currOffset = ROUNDUP_N(currOffset, alignof(ReportID)); - u32 dkeyOffset = currOffset; - currOffset += rm.numDkeys() * sizeof(ReportID); + // Write in NfaInfo structures. This will also update state size + // information in proto. + writeNfaInfo(*this, bc, proto, no_retrigger_queues); - aligned_unique_ptr engine - = aligned_zmalloc_unique(currOffset); - assert(engine); // will have thrown bad_alloc otherwise. - char *ptr = (char *)engine.get(); - assert(ISALIGNED_CL(ptr)); + scatter_plan_raw state_scatter = buildStateScatterPlan( + sizeof(u8), bc.roleStateIndices.size(), proto.activeLeftCount, + proto.rosePrefixCount, proto.stateOffsets, cc.streaming, + proto.activeArrayCount, proto.outfixBeginQueue, proto.outfixEndQueue); - if (atable) { - assert(amatcherOffset); - memcpy(ptr + amatcherOffset, atable.get(), asize); - } - if (ftable) { - assert(fmatcherOffset); - memcpy(ptr + fmatcherOffset, ftable.get(), fsize); - } - if (etable) { - assert(ematcherOffset); - memcpy(ptr + ematcherOffset, etable.get(), esize); - } - if (sbtable) { - assert(sbmatcherOffset); - memcpy(ptr + sbmatcherOffset, sbtable.get(), sbsize); + u32 currOffset; /* relative to base of RoseEngine */ + if (!bc.engine_blob.empty()) { + currOffset = bc.engine_blob.base_offset + bc.engine_blob.size(); + } else { + currOffset = sizeof(RoseEngine); } - memcpy(&engine->stateOffsets, &stateOffsets, sizeof(stateOffsets)); - - engine->historyRequired = verify_u32(historyRequired); - - engine->ekeyCount = rm.numEkeys(); - engine->dkeyCount = rm.numDkeys(); - engine->dkeyLogSize = fatbit_size(engine->dkeyCount); - engine->invDkeyOffset = dkeyOffset; - copy_bytes(ptr + dkeyOffset, rm.getDkeyToReportTable()); - - engine->somHorizon = ssm.somPrecision(); - engine->somLocationCount = ssm.numSomSlots(); - engine->somLocationFatbitSize = fatbit_size(engine->somLocationCount); - - engine->needsCatchup = bc.needs_catchup ? 
1 : 0; - - engine->literalCount = verify_u32(final_id_to_literal.size()); - engine->litProgramOffset = litProgramOffset; - engine->litDelayRebuildProgramOffset = litDelayRebuildProgramOffset; - engine->reportProgramOffset = reportProgramOffset; - engine->reportProgramCount = reportProgramCount; - engine->runtimeImpl = pickRuntimeImpl(*this, bc, outfixEndQueue); - engine->mpvTriggeredByLeaf = anyEndfixMpvTriggers(*this); - - engine->activeArrayCount = activeArrayCount; - engine->activeLeftCount = activeLeftCount; - engine->queueCount = queue_count; - engine->activeQueueArraySize = fatbit_size(queue_count); - engine->eagerIterOffset = eagerIterOffset; - engine->handledKeyCount = bc.handledKeys.size(); - engine->handledKeyFatbitSize = fatbit_size(engine->handledKeyCount); + currOffset = ROUNDUP_CL(currOffset); + DEBUG_PRINTF("currOffset %u\n", currOffset); - engine->rolesWithStateCount = bc.numStates; + currOffset = ROUNDUP_N(currOffset, alignof(scatter_unit_u64a)); + u32 state_scatter_aux_offset = currOffset; + currOffset += aux_size(state_scatter); - engine->leftOffset = leftOffset; - engine->roseCount = verify_u32(leftInfoTable.size()); - engine->lookaroundTableOffset = lookaroundTableOffset; - engine->lookaroundReachOffset = lookaroundReachOffset; - engine->outfixBeginQueue = outfixBeginQueue; - engine->outfixEndQueue = outfixEndQueue; - engine->leftfixBeginQueue = leftfixBeginQueue; - engine->initMpvNfa = mpv_as_outfix ? 0 : MO_INVALID_IDX; - engine->stateSize = mmbit_size(bc.numStates); - engine->anchorStateSize = anchorStateSize; - engine->nfaInfoOffset = nfaInfoOffset; + proto.historyRequired = verify_u32(historyRequired); + proto.ekeyCount = rm.numEkeys(); - engine->eodProgramOffset = eodProgramOffset; + proto.somHorizon = ssm.somPrecision(); + proto.somLocationCount = ssm.numSomSlots(); + proto.somLocationFatbitSize = fatbit_size(proto.somLocationCount); - engine->lastByteHistoryIterOffset = lastByteOffset; + proto.runtimeImpl = pickRuntimeImpl(*this, bc.resources, + proto.outfixEndQueue); + proto.mpvTriggeredByLeaf = anyEndfixMpvTriggers(*this); - engine->delay_count = - verify_u32(final_id_to_literal.size() - delay_base_id); - engine->delay_fatbit_size = fatbit_size(engine->delay_count); - engine->delay_base_id = delay_base_id; - engine->anchored_base_id = anchored_base_id; - engine->anchored_count = delay_base_id - anchored_base_id; - engine->anchored_fatbit_size = fatbit_size(engine->anchored_count); + proto.queueCount = queue_count; + proto.activeQueueArraySize = fatbit_size(queue_count); + proto.handledKeyCount = prog_build.handledKeys.size(); + proto.handledKeyFatbitSize = fatbit_size(proto.handledKeyCount); - engine->rosePrefixCount = rosePrefixCount; + proto.rolesWithStateCount = bc.roleStateIndices.size(); - engine->activeLeftIterOffset - = activeLeftIter.empty() ? 0 : activeLeftIterOffset; + proto.initMpvNfa = mpv_as_outfix ? 0 : MO_INVALID_IDX; + proto.stateSize = mmbit_size(bc.roleStateIndices.size()); - // Set scanning mode. - if (!cc.streaming) { - engine->mode = HS_MODE_BLOCK; - } else if (cc.vectored) { - engine->mode = HS_MODE_VECTORED; - } else { - engine->mode = HS_MODE_STREAM; - } + proto.delay_fatbit_size = fatbit_size(proto.delay_count); + proto.anchored_fatbit_size = fatbit_size(proto.anchored_count); // The Small Write matcher is (conditionally) added to the RoseEngine in // another pass by the caller. Set to zero (meaning no SMWR engine) for // now. 
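As the comment above says, the small-write engine is glued on by the caller after the main bytecode is complete; addSmallWriteEngine (earlier in this diff) does it by reallocating at a cacheline-rounded offset. A hedged sketch of that append step, with a plain zeroed char buffer standing in for make_zeroed_bytecode_ptr (which additionally guarantees 64-byte alignment):

    #include <cstddef>
    #include <cstring>
    #include <memory>

    // Round up to a cacheline (64-byte) boundary, as ROUNDUP_CL does.
    static size_t roundUpCL(size_t x) { return (x + 63) & ~size_t{63}; }

    // Append a sub-engine after the main bytecode at a cacheline-aligned
    // offset, returning the combined image; the caller records subOffset in
    // the engine header (e.g. smallWriteOffset).
    std::unique_ptr<char[]> appendSubEngine(const char *main, size_t mainSize,
                                            const char *sub, size_t subSize,
                                            size_t *subOffsetOut) {
        size_t subOffset = roundUpCL(mainSize);
        size_t newSize = subOffset + subSize;

        std::unique_ptr<char[]> out(new char[newSize]()); // zero-initialized
        std::memcpy(out.get(), main, mainSize);
        std::memcpy(out.get() + subOffset, sub, subSize);

        *subOffsetOut = subOffset;
        return out;
    }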
- engine->smallWriteOffset = 0; - - engine->amatcherOffset = amatcherOffset; - engine->ematcherOffset = ematcherOffset; - engine->sbmatcherOffset = sbmatcherOffset; - engine->fmatcherOffset = fmatcherOffset; - engine->longLitTableOffset = longLitTableOffset; - engine->amatcherMinWidth = findMinWidth(*this, ROSE_ANCHORED); - engine->fmatcherMinWidth = findMinWidth(*this, ROSE_FLOATING); - engine->eodmatcherMinWidth = findMinWidth(*this, ROSE_EOD_ANCHORED); - engine->amatcherMaxBiAnchoredWidth = findMaxBAWidth(*this, ROSE_ANCHORED); - engine->fmatcherMaxBiAnchoredWidth = findMaxBAWidth(*this, ROSE_FLOATING); - engine->size = currOffset; - engine->minWidth = hasBoundaryReports(boundary) ? 0 : minWidth; - engine->minWidthExcludingBoundaries = minWidth; - engine->floatingMinLiteralMatchOffset = bc.floatingMinLiteralMatchOffset; - - engine->maxBiAnchoredWidth = findMaxBAWidth(*this); - engine->noFloatingRoots = hasNoFloatingRoots(); - engine->requiresEodCheck = hasEodAnchors(*this, bc, outfixEndQueue); - engine->hasOutfixesInSmallBlock = hasNonSmallBlockOutfix(outfixes); - engine->canExhaust = rm.patternSetCanExhaust(); - engine->hasSom = hasSom; + proto.smallWriteOffset = 0; + + proto.amatcherMinWidth = findMinWidth(*this, ROSE_ANCHORED); + proto.fmatcherMinWidth = findMinWidth(*this, ROSE_FLOATING); + proto.eodmatcherMinWidth = findMinWidth(*this, ROSE_EOD_ANCHORED); + proto.amatcherMaxBiAnchoredWidth = findMaxBAWidth(*this, ROSE_ANCHORED); + proto.fmatcherMaxBiAnchoredWidth = findMaxBAWidth(*this, ROSE_FLOATING); + proto.minWidth = hasBoundaryReports(boundary) ? 0 : minWidth; + proto.minWidthExcludingBoundaries = minWidth; + proto.floatingMinLiteralMatchOffset = floatingMinLiteralMatchOffset; + + proto.maxBiAnchoredWidth = findMaxBAWidth(*this); + proto.noFloatingRoots = hasNoFloatingRoots(); + proto.requiresEodCheck = hasEodAnchors(*this, bc, proto.outfixEndQueue); + proto.hasOutfixesInSmallBlock = hasNonSmallBlockOutfix(outfixes); + proto.canExhaust = rm.patternSetCanExhaust(); + proto.hasSom = hasSom; /* populate anchoredDistance, floatingDistance, floatingMinDistance, etc */ - fillMatcherDistances(*this, engine.get()); + fillMatcherDistances(*this, &proto); + + proto.initialGroups = getInitialGroups(); + proto.floating_group_mask = fgroups; + proto.totalNumLiterals = verify_u32(literal_info.size()); + proto.asize = verify_u32(atable.size()); + proto.ematcherRegionSize = ematcher_region_size; + proto.longLitStreamState = verify_u32(longLitStreamStateRequired); - engine->initialGroups = getInitialGroups(); - engine->floating_group_mask = fgroups; - engine->totalNumLiterals = verify_u32(literal_info.size()); - engine->asize = verify_u32(asize); - engine->ematcherRegionSize = ematcher_region_size; - engine->longLitStreamState = verify_u32(longLitStreamStateRequired); + proto.size = currOffset; - engine->boundary.reportEodOffset = boundary_out.reportEodOffset; - engine->boundary.reportZeroOffset = boundary_out.reportZeroOffset; - engine->boundary.reportZeroEodOffset = boundary_out.reportZeroEodOffset; + // Time to allocate the real RoseEngine structure, at cacheline alignment. + auto engine = make_zeroed_bytecode_ptr(currOffset, 64); + assert(engine); // will have thrown bad_alloc otherwise. + + // Copy in our prototype engine data. 
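The prototype trick underpinning this rewrite: every header field is staged in a local RoseEngine proto, the total size is computed once the blob is final, and the header is installed with the single memcpy below. That is safe only because the header is trivially copyable. A compressed sketch under that assumption (EngineHeader is a hypothetical stand-in for RoseEngine):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <type_traits>
    #include <vector>

    // Hypothetical stand-in: a trivially-copyable header whose fields are
    // offsets and counts describing the bytes that follow it.
    struct EngineHeader {
        uint32_t size;       // total image size in bytes
        uint32_t blobOffset; // where the variable-length tables begin
        // ... many more offsets and counts ...
    };
    static_assert(std::is_trivially_copyable<EngineHeader>::value,
                  "header must be memcpy-safe");

    std::vector<char> finalizeEngine(EngineHeader proto,
                                     const std::vector<char> &blob) {
        size_t blobOffset = (sizeof(EngineHeader) + 63) & ~size_t{63};
        proto.blobOffset = (uint32_t)blobOffset;
        proto.size = (uint32_t)(blobOffset + blob.size());

        std::vector<char> image(proto.size, 0); // zeroed, like the real code
        std::memcpy(image.data(), &proto, sizeof(proto));  // header first
        std::memcpy(image.data() + blobOffset, blob.data(), blob.size());
        return image;
    }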
+ memcpy(engine.get(), &proto, sizeof(proto)); write_out(&engine->state_init, (char *)engine.get(), state_scatter, state_scatter_aux_offset); - NfaInfo *nfa_infos = (NfaInfo *)(ptr + nfaInfoOffset); - populateNfaInfoBasics(*this, bc, outfixes, suffixEkeyLists, - no_retrigger_queues, nfa_infos); - updateNfaState(bc, &engine->stateOffsets, nfa_infos, - &engine->scratchStateSize, &engine->nfaStateSize, - &engine->tStateSize); - - // Copy in other tables + // Copy in the engine blob. bc.engine_blob.write_bytes(engine.get()); - copy_bytes(ptr + engine->leftOffset, leftInfoTable); - - fillLookaroundTables(ptr + lookaroundTableOffset, - ptr + lookaroundReachOffset, bc.lookaround); - - fillInSomRevNfas(engine.get(), ssm, rev_nfa_table_offset, rev_nfa_offsets); - copy_bytes(ptr + engine->activeLeftIterOffset, activeLeftIter); - - // Safety check: we shouldn't have written anything to the engine blob - // after we copied it into the engine bytecode. - assert(bc.engine_blob.size() == engineBlobSize); // Add a small write engine if appropriate. engine = addSmallWriteEngine(*this, move(engine)); DEBUG_PRINTF("rose done %p\n", engine.get()); + + dumpRose(*this, fragments, makeLeftQueueMap(g, bc.leftfix_info), + bc.suffixes, engine.get()); + return engine; } diff --git a/src/rose/rose_build_castle.cpp b/src/rose/rose_build_castle.cpp index 7987b0f61..a85a784fc 100644 --- a/src/rose/rose_build_castle.cpp +++ b/src/rose/rose_build_castle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -131,7 +131,7 @@ vector literals_for_vertex(const RoseBuildImpl &tbi, vector rv; for (const u32 id : tbi.g[v].literals) { - rv.push_back(tbi.literals.right.at(id)); + rv.push_back(tbi.literals.at(id)); } return rv; @@ -366,7 +366,7 @@ bool triggerKillsRoseCastle(const RoseBuildImpl &tbi, const left_id &left, /* check each pred literal to see if they all kill previous castle * state */ for (u32 lit_id : tbi.g[source(e, tbi.g)].literals) { - const rose_literal_id &pred_lit = tbi.literals.right.at(lit_id); + const rose_literal_id &pred_lit = tbi.literals.at(lit_id); const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s); const CharReach &cr = c.reach(); diff --git a/src/rose/rose_build_compile.cpp b/src/rose/rose_build_compile.cpp index e13d7c5c7..96241e39d 100644 --- a/src/rose/rose_build_compile.cpp +++ b/src/rose/rose_build_compile.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,7 @@ #include "rose_build_role_aliasing.h" #include "rose_build_util.h" #include "ue2common.h" +#include "hwlm/hwlm_literal.h" #include "nfa/nfa_internal.h" #include "nfa/rdfa.h" #include "nfagraph/ng_holder.h" @@ -102,10 +103,75 @@ bool limited_explosion(const ue2_literal &s) { return nc_count <= MAX_EXPLOSION_NC; } +static +void removeLiteralFromGraph(RoseBuildImpl &build, u32 id) { + assert(id < build.literal_info.size()); + auto &info = build.literal_info.at(id); + for (const auto &v : info.vertices) { + build.g[v].literals.erase(id); + } + info.vertices.clear(); +} + +/** + * \brief Replace the given mixed-case literal with the set of its caseless + * variants. 
+ */ +static +void explodeLiteral(RoseBuildImpl &build, u32 id) { + const auto &lit = build.literals.at(id); + auto &info = build.literal_info[id]; + + assert(!info.group_mask); // not set yet + assert(info.undelayed_id == id); // we do not explode delayed literals + + for (auto it = caseIterateBegin(lit.s); it != caseIterateEnd(); ++it) { + ue2_literal new_str(*it, false); + + if (!maskIsConsistent(new_str.get_string(), false, lit.msk, lit.cmp)) { + DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); + continue; + } + + u32 new_id = + build.getLiteralId(new_str, lit.msk, lit.cmp, lit.delay, lit.table); + + DEBUG_PRINTF("adding exploded lit %u: '%s'\n", new_id, + dumpString(new_str).c_str()); + + const auto &new_lit = build.literals.at(new_id); + auto &new_info = build.literal_info.at(new_id); + insert(&new_info.vertices, info.vertices); + for (const auto &v : info.vertices) { + build.g[v].literals.insert(new_id); + } + + build.literal_info[new_id].undelayed_id = new_id; + if (!info.delayed_ids.empty()) { + flat_set &del_ids = new_info.delayed_ids; + for (u32 delay_id : info.delayed_ids) { + const auto &dlit = build.literals.at(delay_id); + u32 new_delay_id = + build.getLiteralId(new_lit.s, new_lit.msk, new_lit.cmp, + dlit.delay, dlit.table); + del_ids.insert(new_delay_id); + build.literal_info[new_delay_id].undelayed_id = new_id; + } + } + } + + // Remove the old literal and any old delay variants. + removeLiteralFromGraph(build, id); + for (u32 delay_id : info.delayed_ids) { + removeLiteralFromGraph(build, delay_id); + } + info.delayed_ids.clear(); +} + void RoseBuildImpl::handleMixedSensitivity(void) { - for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; + vector explode; + for (u32 id = 0; id < literals.size(); id++) { + const rose_literal_id &lit = literals.at(id); if (lit.delay) { continue; /* delay id's are virtual-ish */ @@ -120,18 +186,23 @@ void RoseBuildImpl::handleMixedSensitivity(void) { } // We don't want to explode long literals, as they require confirmation - // with a CHECK_LITERAL instruction and need unique final_ids. + // with a CHECK_LONG_LIT instruction and need unique final_ids. // TODO: we could allow explosion for literals where the prefixes - // covered by CHECK_LITERAL are identical. + // covered by CHECK_LONG_LIT are identical. + if (lit.s.length() <= ROSE_LONG_LITERAL_THRESHOLD_MIN && - limited_explosion(lit.s)) { + limited_explosion(lit.s) && literal_info[id].delayed_ids.empty()) { DEBUG_PRINTF("need to explode existing string '%s'\n", dumpString(lit.s).c_str()); - literal_info[id].requires_explode = true; + explode.push_back(id); } else { literal_info[id].requires_benefits = true; } } + + for (u32 id : explode) { + explodeLiteral(*this, id); + } } // Returns the length of the longest prefix of s that is (a) also a suffix of s @@ -348,7 +419,7 @@ bool RoseBuildImpl::isDirectReport(u32 id) const { } } - if (literals.right.at(id).table == ROSE_ANCHORED) { + if (literals.at(id).table == ROSE_ANCHORED) { /* in-edges are irrelevant for anchored region. */ continue; } @@ -367,7 +438,7 @@ bool RoseBuildImpl::isDirectReport(u32 id) const { } DEBUG_PRINTF("literal %u ('%s') is a %s report\n", id, - dumpString(literals.right.at(id).s).c_str(), + dumpString(literals.at(id).s).c_str(), info.vertices.size() > 1 ? 
"multi-direct" : "direct"); return true; } @@ -412,7 +483,7 @@ bool checkFloatingKillableByPrefixes(const RoseBuildImpl &tbi) { } static -bool checkEodStealFloating(const RoseBuildImpl &tbi, +bool checkEodStealFloating(const RoseBuildImpl &build, const vector &eodLiteralsForFloating, u32 numFloatingLiterals, size_t shortestFloatingLen) { @@ -426,27 +497,35 @@ bool checkEodStealFloating(const RoseBuildImpl &tbi, return false; } - if (tbi.hasNoFloatingRoots()) { + if (build.hasNoFloatingRoots()) { DEBUG_PRINTF("skipping as floating table is conditional\n"); /* TODO: investigate putting stuff in atable */ return false; } - if (checkFloatingKillableByPrefixes(tbi)) { + if (checkFloatingKillableByPrefixes(build)) { DEBUG_PRINTF("skipping as prefixes may make ftable conditional\n"); return false; } + // Collect a set of all floating literals. + unordered_set floating_lits; + for (auto &lit : build.literals) { + if (lit.table == ROSE_FLOATING) { + floating_lits.insert(lit.s); + } + } + DEBUG_PRINTF("%zu are eod literals, %u floating; floating len=%zu\n", eodLiteralsForFloating.size(), numFloatingLiterals, shortestFloatingLen); u32 new_floating_lits = 0; for (u32 eod_id : eodLiteralsForFloating) { - const rose_literal_id &lit = tbi.literals.right.at(eod_id); + const rose_literal_id &lit = build.literals.at(eod_id); DEBUG_PRINTF("checking '%s'\n", dumpString(lit.s).c_str()); - if (tbi.hasLiteral(lit.s, ROSE_FLOATING)) { + if (contains(floating_lits, lit.s)) { DEBUG_PRINTF("skip; there is already a floating version\n"); continue; } @@ -477,12 +556,16 @@ bool checkEodStealFloating(const RoseBuildImpl &tbi, static void promoteEodToFloating(RoseBuildImpl &tbi, const vector &eodLiterals) { - DEBUG_PRINTF("promoting eod literals to floating table\n"); + DEBUG_PRINTF("promoting %zu eod literals to floating table\n", + eodLiterals.size()); for (u32 eod_id : eodLiterals) { - const rose_literal_id &lit = tbi.literals.right.at(eod_id); + const rose_literal_id &lit = tbi.literals.at(eod_id); + DEBUG_PRINTF("eod_id=%u, lit=%s\n", eod_id, dumpString(lit.s).c_str()); u32 floating_id = tbi.getLiteralId(lit.s, lit.msk, lit.cmp, lit.delay, ROSE_FLOATING); + DEBUG_PRINTF("floating_id=%u, lit=%s\n", floating_id, + dumpString(tbi.literals.at(floating_id).s).c_str()); auto &float_verts = tbi.literal_info[floating_id].vertices; auto &eod_verts = tbi.literal_info[eod_id].vertices; @@ -496,8 +579,6 @@ void promoteEodToFloating(RoseBuildImpl &tbi, const vector &eodLiterals) { tbi.g[v].literals.insert(floating_id); } - tbi.literal_info[floating_id].requires_explode - = tbi.literal_info[eod_id].requires_explode; tbi.literal_info[floating_id].requires_benefits = tbi.literal_info[eod_id].requires_benefits; } @@ -509,7 +590,7 @@ bool promoteEodToAnchored(RoseBuildImpl &tbi, const vector &eodLiterals) { bool rv = true; for (u32 eod_id : eodLiterals) { - const rose_literal_id &lit = tbi.literals.right.at(eod_id); + const rose_literal_id &lit = tbi.literals.at(eod_id); NGHolder h; add_edge(h.start, h.accept, h); @@ -649,7 +730,7 @@ void stealEodVertices(RoseBuildImpl &tbi) { continue; // skip unused literals } - const rose_literal_id &lit = tbi.literals.right.at(i); + const rose_literal_id &lit = tbi.literals.at(i); if (lit.table == ROSE_EOD_ANCHORED) { if (suitableForAnchored(tbi, lit, info)) { @@ -689,13 +770,9 @@ bool RoseBuildImpl::isDelayed(u32 id) const { return literal_info.at(id).undelayed_id != id; } -bool RoseBuildImpl::hasFinalId(u32 id) const { - return literal_info.at(id).final_id != MO_INVALID_IDX; -} - bool 
RoseBuildImpl::hasDelayedLiteral(RoseVertex v) const { for (u32 lit_id : g[v].literals) { - if (literals.right.at(lit_id).delay) { + if (literals.at(lit_id).delay) { return true; } } @@ -966,7 +1043,7 @@ void packInfixTops(NGHolder &h, RoseGraph &g, updated_tops.insert(top_mapping.at(t)); } } - h[e].tops = move(updated_tops); + h[e].tops = std::move(updated_tops); if (h[e].tops.empty()) { DEBUG_PRINTF("edge (start,%zu) has only unused tops\n", h[v].index); dead.push_back(e); @@ -1021,7 +1098,7 @@ bool triggerKillsRoseGraph(const RoseBuildImpl &build, const left_id &left, /* check each pred literal to see if they all kill previous graph * state */ for (u32 lit_id : build.g[source(e, build.g)].literals) { - const rose_literal_id &pred_lit = build.literals.right.at(lit_id); + const rose_literal_id &pred_lit = build.literals.at(lit_id); const ue2_literal s = findNonOverlappingTail(all_lits, pred_lit.s); DEBUG_PRINTF("running graph %zu\n", states.size()); @@ -1095,7 +1172,7 @@ void findTopTriggerCancels(RoseBuildImpl &build) { } for (u32 lit_id : pred_lit_ids) { - const rose_literal_id &p_lit = build.literals.right.at(lit_id); + const rose_literal_id &p_lit = build.literals.at(lit_id); if (p_lit.delay || p_lit.table == ROSE_ANCHORED) { goto next_rose; } @@ -1166,11 +1243,15 @@ void buildRoseSquashMasks(RoseBuildImpl &tbi) { } } - rose_group unsquashable = 0; + rose_group unsquashable = tbi.boundary_group_mask; for (u32 lit_id : lit_ids) { const rose_literal_info &info = tbi.literal_info[lit_id]; - if (info.vertices.size() > 1 || !info.delayed_ids.empty()) { + if (!info.delayed_ids.empty() + || !all_of_in(info.vertices, + [&](RoseVertex v) { + return left == tbi.g[v].left; })) { + DEBUG_PRINTF("group %llu is unsquashable\n", info.group_mask); unsquashable |= info.group_mask; } } @@ -1192,7 +1273,7 @@ void countFloatingLiterals(const RoseBuildImpl &tbi, u32 *total_count, u32 *short_count) { *total_count = 0; *short_count = 0; - for (const rose_literal_id &lit : tbi.literals.right | map_values) { + for (const rose_literal_id &lit : tbi.literals) { if (lit.delay) { continue; /* delay id's are virtual-ish */ } @@ -1598,8 +1679,8 @@ bool roleOffsetsAreValid(const RoseGraph &g) { } #endif // NDEBUG -aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { - dumpRoseGraph(*this, nullptr, "rose_early.dot"); +bytecode_ptr RoseBuildImpl::buildRose(u32 minWidth) { + dumpRoseGraph(*this, "rose_early.dot"); // Early check for Rose implementability. assert(canImplementGraphs(*this)); @@ -1644,8 +1725,6 @@ aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { dedupeLeftfixes(*this); aliasRoles(*this, false); // Don't merge leftfixes. 
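Stepping back to the buildRoseSquashMasks hunk above: the unsquashable set now starts from the boundary group mask, and a literal's groups also become unsquashable if it has delayed variants or if any of its vertices hangs off a different leftfix. A simplified sketch of that rule (SquashLitInfo and the per-vertex leftfix array are hypothetical stand-ins):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    using rose_group = uint64_t;

    // Hypothetical, simplified stand-in for rose_literal_info.
    struct SquashLitInfo {
        rose_group group_mask = 0;
        std::vector<uint32_t> delayed_ids; // delayed variants of this literal
        std::vector<uint32_t> vertices;    // vertices that use this literal
    };

    // Groups that may never be squashed when leftfix `left` dies: boundary
    // report groups, plus the groups of any literal with delayed variants or
    // with vertices not all tied to this leftfix.
    rose_group unsquashableGroups(const std::vector<SquashLitInfo> &lits,
                                  const std::vector<uint32_t> &leftfix_of,
                                  uint32_t left, rose_group boundary_groups) {
        rose_group unsquashable = boundary_groups;
        for (const auto &info : lits) {
            bool same_left = std::all_of(
                info.vertices.begin(), info.vertices.end(),
                [&](uint32_t v) { return leftfix_of[v] == left; });
            if (!info.delayed_ids.empty() || !same_left) {
                unsquashable |= info.group_mask;
            }
        }
        return unsquashable;
    }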
dedupeLeftfixes(*this); - - convertBadLeaves(*this); uncalcLeaves(*this); /* note the leftfixes which do not need to keep state across stream @@ -1712,7 +1791,7 @@ aligned_unique_ptr RoseBuildImpl::buildRose(u32 minWidth) { assert(roleOffsetsAreValid(g)); assert(historiesAreValid(g)); - dumpRoseGraph(*this, nullptr, "rose_pre_norm.dot"); + dumpRoseGraph(*this, "rose_pre_norm.dot"); return buildFinalEngine(minWidth); } diff --git a/src/rose/rose_build_convert.cpp b/src/rose/rose_build_convert.cpp index b151c0c91..0c1f43386 100644 --- a/src/rose/rose_build_convert.cpp +++ b/src/rose/rose_build_convert.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -76,301 +76,6 @@ NFAVertex addHolderVertex(const CharReach &cr, NGHolder &out) { return v; } -// Returns the first and last vertices. -static -pair addLiteralVertices(const RoseGraph &g, - const RoseLiteralMap &literals, - const RoseVertex &t_v, - NGHolder &out) { - // We have limited cases that we support: one literal of arbitrary length, - // or a bunch of literals of length one that just become a vertex with - // their reach unioned together. - - // TODO: generalise this and handle more cases. - - const auto &litids = g[t_v].literals; - if (litids.size() > 1) { - // Multiple literals of len 1. - CharReach v_cr; - for (const auto &lit_id : litids) { - const rose_literal_id &litv = literals.right.at(lit_id); - assert(litv.s.length() == 1); - v_cr |= *litv.s.begin(); - } - - NFAVertex v = addHolderVertex(v_cr, out); - return make_pair(v, v); - } - - // Otherwise, we have a single literal, could be of arbitrary length. - assert(litids.size() == 1); - u32 lit_id = *(litids.begin()); - const rose_literal_id &litv = literals.right.at(lit_id); - assert(!litv.s.empty()); - - ue2_literal::const_iterator it = litv.s.begin(), ite = litv.s.end(); - NFAVertex first = addHolderVertex(*it, out), last = first; - for (++it; it != ite; ++it) { - NFAVertex v = addHolderVertex(*it, out); - add_edge(last, v, out); - last = v; - } - - return make_pair(first, last); -} - -static -unique_ptr convertLeafToHolder(const RoseGraph &g, - const RoseEdge &t_e, - const RoseLiteralMap &literals) { - RoseVertex t_v = target(t_e, g); // leaf vertex for demolition. - u32 minBound = g[t_e].minBound; - u32 maxBound = g[t_e].maxBound; - - const CharReach dot = CharReach::dot(); - - assert(!g[t_v].left); - - auto out = ue2::make_unique(NFA_SUFFIX); - - // Repeats wired to the start of the graph. - DEBUG_PRINTF("bounds [%u, %u]\n", minBound, maxBound); - u32 i = 1; - NFAVertex last = out->start; - for (; i <= minBound; i++) { - NFAVertex v = addHolderVertex(dot, *out); - add_edge(last, v, *out); - last = v; - } - NFAVertex last_mand = last; - if (maxBound != ROSE_BOUND_INF) { - for (; i <= maxBound; i++) { - NFAVertex v = addHolderVertex(dot, *out); - add_edge(last_mand, v, *out); - if (last != last_mand) { - add_edge(last, v, *out); - } - last = v; - } - } else { - if (minBound) { - add_edge(last_mand, last_mand, *out); - } else { - NFAVertex v = addHolderVertex(dot, *out); - add_edge(last_mand, v, *out); - add_edge(v, v, *out); - last = v; - } - } - - setTops(*out); - - // Literal vertices wired to accept. 
- NFAVertex litfirst, litlast; - tie(litfirst, litlast) = addLiteralVertices(g, literals, t_v, *out); - add_edge(last, litfirst, *out); - if (last != last_mand) { - add_edge(last_mand, litfirst, *out); - } - add_edge(litlast, out->accept, *out); - insert(&(*out)[litlast].reports, g[t_v].reports); - return out; -} - -static -bool areLiteralsConvertible(const RoseLiteralMap &literals, - const flat_set &ids) { - // Every literal in v must have the same length. - - // TODO: at the moment, we only handle two cases in construction: (a) one - // literal of arbitrary length, and (b) many literals, but all with length - // 1. - - if (ids.empty()) { - return false; - } - - auto it = ids.begin(), ite = ids.end(); - const size_t len = literals.right.at(*it).elength(); - - // Note: len may be 0 for cases with special literals, like EOD prefixes. - - if (len != 1 && ids.size() != 1) { - DEBUG_PRINTF("more than one literal of len > 1\n"); - return false; - } - - // Check the others all have the same length. - while (++it != ite) { - if (literals.right.at(*it).elength() != len) { - DEBUG_PRINTF("literals have different lengths\n"); - return false; - } - } - - return true; -} - -// Returns true if the given vertex doesn't qualify as a bad leaf to be eaten -// by an NFA. -static -bool isUnconvertibleLeaf(const RoseBuildImpl &tbi, const RoseVertex v) { - const RoseGraph &g = tbi.g; - - if (in_degree(v, g) != 1) { - DEBUG_PRINTF("more than one in-edge\n"); - return true; - } - - const RoseEdge &e = *(in_edges(v, g).first); - RoseVertex u = source(e, g); - - if (!g[u].reports.empty()) { - DEBUG_PRINTF("pred has accept\n"); - return true; - } - - if (g[u].suffix) { - // TODO: this could be handled by adding new vertices to the existing - // suffix. - DEBUG_PRINTF("pred already has suffix\n"); - return true; - } - - if (tbi.isAnyStart(u)) { - DEBUG_PRINTF("fail start\n"); - return true; - } - - if (tbi.isAnchored(u)) { - /* TODO need to check for possible anchored queue overflow? maybe? */ - DEBUG_PRINTF("fail anchored\n"); - return true; - } - - if (g[v].reports.empty() || g[v].eod_accept) { - DEBUG_PRINTF("bad accept\n"); - return true; - } - - if (g[v].suffix) { - DEBUG_PRINTF("suffix\n"); - return true; - } - - if (g[v].left) { - /* TODO: we really should handle this case as we would be checking - * an nfa each time. However it requires completely different graph - * fiddling logic */ - DEBUG_PRINTF("rose prefix action\n"); - return true; - } - - if (!areLiteralsConvertible(tbi.literals, g[v].literals)) { - DEBUG_PRINTF("fail length\n"); - return true; - } - - u32 max_lit_len = tbi.maxLiteralLen(v); - - u32 maxbound = max_lit_len == 1 ? 124 : 32; // arbitrary magic numbers - if (g[e].maxBound > maxbound && g[e].maxBound != ROSE_BOUND_INF) { - DEBUG_PRINTF("fail maxbound (%u)\n", maxbound); - return true; - } - - if (g[e].maxBound == ROSE_BOUND_INF) { - /* slightly risky as nfa won't die */ - DEBUG_PRINTF("fail: .*\n"); - return true; - } - - return false; -} - -// Find all of the leaves with literals whose length is <= len. 
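For readers tracking what the deleted convertBadLeaves pass did: convertLeafToHolder, removed above, expanded an edge's {minBound, maxBound} dot repeat into an NFA vertex chain, with skip edges for the optional tail and a self-loop for an unbounded maximum. A toy re-creation over a bare adjacency list (Graph is illustrative, not NGHolder):

    #include <cstdint>
    #include <utility>
    #include <vector>

    constexpr uint32_t BOUND_INF = ~0u; // stand-in for ROSE_BOUND_INF

    struct Graph {
        uint32_t start = 0;
        uint32_t next_vertex = 1;
        std::vector<std::pair<uint32_t, uint32_t>> edges;
        uint32_t addVertex() { return next_vertex++; }
        void addEdge(uint32_t u, uint32_t v) { edges.emplace_back(u, v); }
    };

    // Expand a {minBound, maxBound} dot repeat, following the shape of the
    // removed code: a mandatory chain, then optional vertices reachable by
    // skipping ahead from the mandatory part, or a self-loop when unbounded.
    // Returns the vertex the literal chain hangs off (the original also
    // wired the literal from last_mand to permit the zero-extra-dots path).
    uint32_t buildRepeat(Graph &g, uint32_t minBound, uint32_t maxBound) {
        uint32_t last = g.start;
        for (uint32_t i = 1; i <= minBound; i++) {
            uint32_t v = g.addVertex();
            g.addEdge(last, v);
            last = v;
        }
        uint32_t last_mand = last;
        if (maxBound != BOUND_INF) {
            for (uint32_t i = minBound + 1; i <= maxBound; i++) {
                uint32_t v = g.addVertex();
                g.addEdge(last_mand, v); // skip the earlier optional dots
                if (last != last_mand) {
                    g.addEdge(last, v);
                }
                last = v;
            }
        } else if (minBound) {
            g.addEdge(last_mand, last_mand); // unbounded tail: self-loop
        } else {
            uint32_t v = g.addVertex(); // pure .*
            g.addEdge(last_mand, v);
            g.addEdge(v, v);
            last = v;
        }
        return last;
    }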
-static -void findBadLeaves(RoseBuildImpl &tbi, set &bad) { - RoseGraph &g = tbi.g; - u32 len = tbi.cc.grey.roseMaxBadLeafLength; - - for (const auto &m : tbi.literals.right) { - if (m.second.s.length() > len) { - continue; - } - u32 lid = m.first; - DEBUG_PRINTF("%u is a short lit (length %zu)\n", lid, - m.second.s.length()); - - if (tbi.isDelayed(lid)) { - DEBUG_PRINTF("delayed, skipping!\n"); - continue; - } - - const rose_literal_info &info = tbi.literal_info[lid]; - - for (auto v : info.vertices) { - if (!isLeafNode(v, g)) { - continue; - } - if (isUnconvertibleLeaf(tbi, v)) { - continue; // we don't want to touch it - } - - // This leaf may have a predecessor with more than one successor, - // in which case we want to clone the pred just to support this - // leaf. - const RoseEdge &e = *in_edges(v, g).first; - RoseVertex u = source(e, g); - if (out_degree(u, g) != 1) { - DEBUG_PRINTF("re-homing %zu to cloned pred\n", g[v].index); - RoseVertex u2 = tbi.cloneVertex(u); - for (const auto &e_in : in_edges_range(u, g)) { - add_edge(source(e_in, g), u2, g[e_in], g); - } - add_edge(u2, v, g[e], g); - remove_edge(e, g); - } - - DEBUG_PRINTF("%zu is a bad leaf vertex\n", g[v].index); - bad.insert(v); - } - } -} - -void convertBadLeaves(RoseBuildImpl &tbi) { - RoseGraph &g = tbi.g; - set bad; - findBadLeaves(tbi, bad); - DEBUG_PRINTF("found %zu bad leaves\n", bad.size()); - - if (bad.empty()) { - return; - } - - vector dead; - for (auto v : bad) { - assert(in_degree(v, g)); - - const RoseEdge &e = *(in_edges(v, g).first); - - shared_ptr h = convertLeafToHolder(g, e, tbi.literals); - if (num_vertices(*h) >= NFA_MAX_STATES) { - assert(0); // too big! - continue; - } - - RoseVertex u = source(e, g); - assert(!g[u].suffix); - g[u].suffix.graph = h; - DEBUG_PRINTF("%zu's nfa holder %p\n", g[u].index, h.get()); - - dead.push_back(v); - } - - tbi.removeVertices(dead); -} - static size_t suffixFloodLen(const ue2_literal &s) { if (s.empty()) { @@ -461,7 +166,7 @@ bool delayLiteralWithPrefix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, shared_ptr h = makeRosePrefix(lit.s); ReportID prefix_report = 0; - setReportId(*h, prefix_report); + set_report(*h, prefix_report); if (!isImplementableNFA(*h, &tbi.rm, tbi.cc)) { DEBUG_PRINTF("prefix not implementable\n"); @@ -530,7 +235,7 @@ void convertFloodProneSuffix(RoseBuildImpl &tbi, RoseVertex v, u32 lit_id, static size_t findFloodProneSuffixLen(const RoseBuildImpl &tbi) { size_t numLiterals = 0; - for (const rose_literal_id &lit : tbi.literals.right | map_values) { + for (const rose_literal_id &lit : tbi.literals) { if (lit.delay) { continue; // delay ids are virtual-ish } @@ -588,7 +293,7 @@ void convertFloodProneSuffixes(RoseBuildImpl &tbi) { } u32 lit_id = *g[v].literals.begin(); - const rose_literal_id &lit = tbi.literals.right.at(lit_id); + const rose_literal_id &lit = tbi.literals.at(lit_id); // anchored or delayed literals need thought. 
if (lit.table != ROSE_FLOATING || lit.delay) { @@ -846,7 +551,7 @@ bool handleMixedPrefixCliche(const NGHolder &h, RoseGraph &g, RoseVertex v, && is_subset_of(exits, base_succ) && is_subset_of(base_succ, exits_and_repeat_verts)) { /* we have a jump edge */ - ri.repeatMin = 0; + ri.repeatMin = depth(0); } else { return false; } @@ -1097,7 +802,7 @@ void convertAnchPrefixToBounds(RoseBuildImpl &tbi) { DepthMinMax bounds(pr.bounds); // copy if (delay_adj > bounds.min) { - bounds.min = 0; + bounds.min = depth(0); } else { bounds.min -= delay_adj; } diff --git a/src/rose/rose_build_convert.h b/src/rose/rose_build_convert.h index fd7c6d3ea..7307c213c 100644 --- a/src/rose/rose_build_convert.h +++ b/src/rose/rose_build_convert.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,7 +34,6 @@ namespace ue2 { class RoseBuildImpl; void convertFloodProneSuffixes(RoseBuildImpl &tbi); -void convertBadLeaves(RoseBuildImpl &tbi); void convertPrefixToBounds(RoseBuildImpl &tbi); void convertAnchPrefixToBounds(RoseBuildImpl &tbi); diff --git a/src/rose/rose_build_dedupe.cpp b/src/rose/rose_build_dedupe.cpp new file mode 100644 index 000000000..d3e723133 --- /dev/null +++ b/src/rose/rose_build_dedupe.cpp @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "rose_build_impl.h" +#include "nfa/castlecompile.h" +#include "nfagraph/ng_repeat.h" +#include "util/compile_context.h" +#include "util/boundary_reports.h" +#include "util/make_unique.h" +#include "util/report_manager.h" + +using namespace std; + +namespace ue2 { + +static +bool requiresDedupe(const NGHolder &h, const ue2::flat_set &reports, + const Grey &grey) { + /* TODO: tighten */ + NFAVertex seen_vert = NGHolder::null_vertex(); + + for (auto v : inv_adjacent_vertices_range(h.accept, h)) { + if (has_intersection(h[v].reports, reports)) { + if (seen_vert != NGHolder::null_vertex()) { + return true; + } + seen_vert = v; + } + } + + for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { + if (has_intersection(h[v].reports, reports)) { + if (seen_vert != NGHolder::null_vertex()) { + return true; + } + seen_vert = v; + } + } + + if (seen_vert) { + /* if the reporting vertex is part of of a terminal repeat, the + * construction process may reform the graph splitting it into two + * vertices (pos, cyclic) and hence require dedupe */ + vector repeats; + findRepeats(h, grey.minExtBoundedRepeatSize, &repeats); + for (const auto &repeat : repeats) { + if (find(repeat.vertices.begin(), repeat.vertices.end(), + seen_vert) != repeat.vertices.end()) { + return true; + } + } + } + + return false; +} + +class RoseDedupeAuxImpl : public RoseDedupeAux { +public: + explicit RoseDedupeAuxImpl(const RoseBuildImpl &build_in); + bool requiresDedupeSupport( + const ue2::flat_set &reports) const override; + +private: + bool hasSafeMultiReports(const ue2::flat_set &reports) const; + + const RoseBuildImpl &build; + map> vert_map; //!< ordinary literals + map> sb_vert_map; //!< small block literals + map> suffix_map; + map> outfix_map; + map> puff_map; + + unordered_set live_reports; //!< all live internal reports. +}; + +unique_ptr RoseBuildImpl::generateDedupeAux() const { + return ue2::make_unique(*this); +} + +RoseDedupeAux::~RoseDedupeAux() = default; + +RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &build_in) + : build(build_in) { + const RoseGraph &g = build.g; + + set suffixes; + + for (auto v : vertices_range(g)) { + insert(&live_reports, g[v].reports); + + // Literals in the small block table are "shadow" copies of literals in + // the other tables that do not run in the same runtime invocation. + // Dedupe key assignment will be taken care of by the real literals. + if (build.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK)) { + for (const auto &report_id : g[v].reports) { + sb_vert_map[report_id].insert(v); + } + } else { + for (const auto &report_id : g[v].reports) { + vert_map[report_id].insert(v); + } + } + + // Several vertices may share a suffix, so we collect the set of + // suffixes first to avoid repeating work. 
+ if (g[v].suffix) { + suffixes.insert(g[v].suffix); + } + } + + for (const auto &suffix : suffixes) { + for (const auto &report_id : all_reports(suffix)) { + suffix_map[report_id].insert(suffix); + live_reports.insert(report_id); + } + } + + for (const auto &outfix : build.outfixes) { + for (const auto &report_id : all_reports(outfix)) { + outfix_map[report_id].insert(&outfix); + live_reports.insert(report_id); + } + } + + if (build.mpv_outfix) { + auto *mpv = build.mpv_outfix->mpv(); + for (const auto &puff : mpv->puffettes) { + puff_map[puff.report].insert(&puff); + live_reports.insert(puff.report); + } + for (const auto &puff : mpv->triggered_puffettes) { + puff_map[puff.report].insert(&puff); + live_reports.insert(puff.report); + } + } + + // Collect live reports from boundary reports. + insert(&live_reports, build.boundary.report_at_0); + insert(&live_reports, build.boundary.report_at_0_eod); + insert(&live_reports, build.boundary.report_at_eod); + + DEBUG_PRINTF("%zu of %zu reports are live\n", live_reports.size(), + build.rm.numReports()); +} + +static +vector makePath(const rose_literal_id &lit) { + vector path(begin(lit.s), end(lit.s)); + for (u32 i = 0; i < lit.delay; i++) { + path.push_back(CharReach::dot()); + } + return path; +} + +/** + * \brief True if one of the given literals overlaps with the suffix of + * another, meaning that they could arrive at the same offset. + */ +static +bool literalsCouldRace(const rose_literal_id &lit1, + const rose_literal_id &lit2) { + DEBUG_PRINTF("compare %s (delay %u) and %s (delay %u)\n", + dumpString(lit1.s).c_str(), lit1.delay, + dumpString(lit2.s).c_str(), lit2.delay); + + // Add dots on the end of each literal for delay. + const auto v1 = makePath(lit1); + const auto v2 = makePath(lit2); + + // See if the smaller path is a suffix of the larger path. + const auto *smaller = v1.size() < v2.size() ? &v1 : &v2; + const auto *bigger = v1.size() < v2.size() ? &v2 : &v1; + auto r = mismatch(smaller->rbegin(), smaller->rend(), bigger->rbegin(), + overlaps); + return r.first == smaller->rend(); +} + +bool RoseDedupeAuxImpl::hasSafeMultiReports( + const flat_set &reports) const { + if (reports.size() <= 1) { + return true; + } + + /* We have more than one ReportID corresponding to the external ID that is + * presented to the user. These may differ in offset adjustment, bounds + * checks, etc. */ + + /* TODO: work out if these differences will actually cause problems */ + + /* One common case where we know we don't have a problem is if there are + * precisely two reports, one for the main Rose path and one for the + * "small block matcher" path. */ + if (reports.size() == 2) { + ReportID id1 = *reports.begin(); + ReportID id2 = *reports.rbegin(); + + bool has_verts_1 = contains(vert_map, id1); + bool has_verts_2 = contains(vert_map, id2); + bool has_sb_verts_1 = contains(sb_vert_map, id1); + bool has_sb_verts_2 = contains(sb_vert_map, id2); + + if (has_verts_1 != has_verts_2 && has_sb_verts_1 != has_sb_verts_2) { + DEBUG_PRINTF("two reports, one full and one small block: ok\n"); + return true; + } + } + + DEBUG_PRINTF("more than one report\n"); + return false; +} + +bool RoseDedupeAuxImpl::requiresDedupeSupport( + const flat_set &reports_in) const { + /* TODO: this could be expanded to check for offset or character + constraints */ + + // We don't want to consider dead reports (tracked by ReportManager but no + // longer used) for the purposes of assigning dupe keys. 
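makePath and literalsCouldRace above model a delayed literal as its string followed by delay wildcard positions, and declare a race when the shorter path is a suffix of the longer under per-position overlap. A self-contained sketch with plain strings, using '.' as a stand-in for CharReach::dot():

    #include <algorithm>
    #include <cassert>
    #include <string>

    // Two positions overlap if either is a wildcard or the chars match.
    static bool overlaps(char a, char b) {
        return a == '.' || b == '.' || a == b;
    }

    // Append `delay` wildcard positions, as makePath does with dots.
    static std::string makePath(const std::string &lit, unsigned delay) {
        return lit + std::string(delay, '.');
    }

    // True if the two literals (with their delays) could report at the same
    // offset: the shorter path is a suffix of the longer under overlaps.
    static bool literalsCouldRace(const std::string &lit1, unsigned d1,
                                  const std::string &lit2, unsigned d2) {
        std::string v1 = makePath(lit1, d1), v2 = makePath(lit2, d2);
        const std::string &smaller = v1.size() < v2.size() ? v1 : v2;
        const std::string &bigger = v1.size() < v2.size() ? v2 : v1;
        auto r = std::mismatch(smaller.rbegin(), smaller.rend(),
                               bigger.rbegin(), overlaps);
        return r.first == smaller.rend();
    }

    int main() {
        // "ab" delayed by 2 ("ab..") can end where "abcd" ends: race.
        assert(literalsCouldRace("abcd", 0, "ab", 2));
        // "xy" with no delay can never end where "abcd" ends.
        assert(!literalsCouldRace("abcd", 0, "xy", 0));
        return 0;
    }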
+ flat_set reports; + for (auto id : reports_in) { + if (contains(live_reports, id)) { + reports.insert(id); + } + } + + DEBUG_PRINTF("live reports: %s\n", as_string_list(reports).c_str()); + + const RoseGraph &g = build.g; + + bool has_suffix = false; + bool has_outfix = false; + + if (!hasSafeMultiReports(reports)) { + DEBUG_PRINTF("multiple reports not safe\n"); + return true; + } + + set roles; + set suffixes; + set outfixes; + set puffettes; + for (ReportID r : reports) { + if (contains(vert_map, r)) { + insert(&roles, vert_map.at(r)); + } + if (contains(suffix_map, r)) { + insert(&suffixes, suffix_map.at(r)); + } + + if (contains(outfix_map, r)) { + insert(&outfixes, outfix_map.at(r)); + } + + if (contains(puff_map, r)) { + insert(&puffettes, puff_map.at(r)); + } + } + + /* roles */ + + map lits; // Literal ID -> count of occurrences. + + const bool has_role = !roles.empty(); + for (auto v : roles) { + for (const auto &lit : g[v].literals) { + lits[lit]++; + } + if (g[v].eod_accept) { + // Literals plugged into this EOD accept must be taken into account + // as well. + for (auto u : inv_adjacent_vertices_range(v, g)) { + for (const auto &lit : g[u].literals) { + lits[lit]++; + } + } + } + } + + /* literals */ + + for (const auto &m : lits) { + if (m.second > 1) { + DEBUG_PRINTF("lit %u used by >1 reporting roles\n", m.first); + return true; + } + } + + for (auto it = begin(lits); it != end(lits); ++it) { + const auto &lit1 = build.literals.at(it->first); + for (auto jt = next(it); jt != end(lits); ++jt) { + const auto &lit2 = build.literals.at(jt->first); + if (literalsCouldRace(lit1, lit2)) { + DEBUG_PRINTF("literals could race\n"); + return true; + } + } + } + + /* suffixes */ + + for (const auto &suffix : suffixes) { + if (has_suffix || has_role) { + return true; /* scope for badness */ + } + + has_suffix = true; + + /* some lesser suffix engines (nfas, haig, castle) can raise multiple + * matches for a report id at the same offset if there are multiple + * report states live. 
+         */
+        if (suffix.haig()) {
+            return true;
+        }
+        if (suffix.graph() &&
+            requiresDedupe(*suffix.graph(), reports, build.cc.grey)) {
+            return true;
+        }
+        if (suffix.castle() && requiresDedupe(*suffix.castle(), reports)) {
+            return true;
+        }
+    }
+
+    /* outfixes */
+
+    for (const auto &outfix_ptr : outfixes) {
+        assert(outfix_ptr);
+        const OutfixInfo &out = *outfix_ptr;
+
+        if (has_outfix || has_role || has_suffix) {
+            return true;
+        }
+        has_outfix = true;
+
+        if (out.haig()) {
+            return true; /* haig may report matches with different SOM at the
+                            same offset */
+        }
+
+        if (out.holder() &&
+            requiresDedupe(*out.holder(), reports, build.cc.grey)) {
+            return true;
+        }
+    }
+
+    /* mpv */
+    for (UNUSED const auto &puff : puffettes) {
+        if (has_outfix || has_role || has_suffix) {
+            return true;
+        }
+        has_outfix = true;
+    }
+
+    /* boundary */
+    if (has_intersection(build.boundary.report_at_eod, reports)) {
+        if (has_outfix || has_role || has_suffix) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+} // namespace ue2
diff --git a/src/rose/rose_build_dump.cpp b/src/rose/rose_build_dump.cpp
index 105ee338d..b527db6c8 100644
--- a/src/rose/rose_build_dump.cpp
+++ b/src/rose/rose_build_dump.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -32,11 +32,16 @@
 #include "rose_build_impl.h"
 #include "rose_build_matchers.h"
-#include "rose/rose_dump.h"
 #include "rose_internal.h"
+#include "rose_program.h"
 #include "ue2common.h"
+#include "hs_compile.h"
 #include "hwlm/hwlm_build.h"
+#include "hwlm/hwlm_dump.h"
+#include "hwlm/hwlm_literal.h"
 #include "nfa/castlecompile.h"
+#include "nfa/nfa_build_util.h"
+#include "nfa/nfa_dump_api.h"
 #include "nfa/nfa_internal.h"
 #include "nfagraph/ng_dump.h"
 #include "som/slot_manager_dump.h"
@@ -44,9 +49,12 @@
 #include "util/container.h"
 #include "util/dump_charclass.h"
 #include "util/graph_range.h"
+#include "util/multibit.h"
+#include "util/multibit_build.h"
 #include "util/ue2string.h"
 
 #include
+#include
 #include
 #include
 #include
@@ -81,10 +89,34 @@ string render_kind(const Graph &g) {
 
 namespace {
 
+struct rose_off {
+    explicit rose_off(u32 j) : i(j) {}
+    string str(void) const;
+    u32 i;
+};
+
+ostream &operator<<(ostream &o, const rose_off &to) {
+    if (to.i == ROSE_BOUND_INF) {
+        o << "inf";
+    } else {
+        o << to.i;
+    }
+    return o;
+}
+
+string rose_off::str(void) const {
+    ostringstream out;
+    out << *this;
+    return out.str();
+}
+
 class RoseGraphWriter {
 public:
-    RoseGraphWriter(const RoseBuildImpl &b_in, const RoseEngine *t_in) :
-        build(b_in), t(t_in) {
+    RoseGraphWriter(const RoseBuildImpl &b_in, const map<u32, u32> &frag_map_in,
+                    const map<left_id, u32> &lqm_in,
+                    const map<suffix_id, u32> &sqm_in, const RoseEngine *t_in)
+        : frag_map(frag_map_in), leftfix_queue_map(lqm_in),
+          suffix_queue_map(sqm_in), build(b_in), t(t_in) {
         for (const auto &m : build.ghost) {
             ghost.insert(m.second);
         }
@@ -131,8 +163,8 @@ class RoseGraphWriter {
         if (g[v].suffix) {
             suffix_id suff(g[v].suffix);
             os << "\\n" << render_kind(suff) << " (top " << g[v].suffix.top;
-            auto it = build.suffix_queue_map.find(suff);
-            if (it != end(build.suffix_queue_map)) {
+            auto it = suffix_queue_map.find(suff);
+            if (it != end(suffix_queue_map)) {
                 os << ", queue " << it->second;
             }
             os << ")";
@@ -145,8 +177,8 @@ class RoseGraphWriter {
         if (g[v].left) {
             left_id left(g[v].left);
             os << "\\n" << render_kind(left) << " (queue ";
-            auto it = build.leftfix_queue_map.find(left);
-            if (it != end(build.leftfix_queue_map)) {
+            auto it = leftfix_queue_map.find(left);
+            if (it != end(leftfix_queue_map)) {
                 os << it->second;
             } else {
                 os << "??";
             }
@@ -219,37 +251,50 @@ class RoseGraphWriter {
     // Render the literal associated with a vertex.
     void writeLiteral(ostream &os, u32 id) const {
         os << "lit=" << id;
-        if (id < build.literal_info.size()) {
-            os << "/" << build.literal_info[id].final_id << " ";
+        if (contains(frag_map, id)) {
+            os << "/" << frag_map.at(id) << " ";
         } else {
-            os << "/nofinal ";
+            os << "/nofrag ";
        }
 
-        if (contains(build.literals.right, id)) {
-            const auto &lit = build.literals.right.at(id);
-            os << '\'' << dotEscapeString(lit.s.get_string()) << '\'';
-            if (lit.s.any_nocase()) {
-                os << " (nocase)";
-            }
-            if (lit.delay) {
-                os << " +" << lit.delay;
-            }
-        } else {
-            os << "";
+        const auto &lit = build.literals.at(id);
+        os << '\'' << dotEscapeString(lit.s.get_string()) << '\'';
+        if (lit.s.any_nocase()) {
+            os << " (nocase)";
+        }
+        if (lit.delay) {
+            os << " +" << lit.delay;
         }
     }
 
     set<RoseVertex> ghost;
+    const map<u32, u32> &frag_map;
+    const map<left_id, u32> &leftfix_queue_map;
+    const map<suffix_id, u32> &suffix_queue_map;
     const RoseBuildImpl &build;
     const RoseEngine *t;
 };
 
 } // namespace
 
-void dumpRoseGraph(const RoseBuild &build_base, const RoseEngine *t,
-                   const char *filename) {
-    const RoseBuildImpl &build = dynamic_cast<const RoseBuildImpl &>(build_base);
+static
+map<u32, u32> makeFragMap(const vector<LitFragment> &fragments) {
+    map<u32, u32> fm;
+    for (const auto &f : fragments) {
+        for (u32 id : f.lit_ids) {
+            fm[id] = f.fragment_id;
+        }
+    }
+    return fm;
+}
+
+static
+void dumpRoseGraph(const RoseBuildImpl &build, const RoseEngine *t,
+                   const vector<LitFragment> &fragments,
+                   const map<left_id, u32> &leftfix_queue_map,
+                   const map<suffix_id, u32> &suffix_queue_map,
+                   const char *filename) {
     const Grey &grey = build.cc.grey;
 
     /* "early" rose graphs should only be dumped if we are dumping intermediate
@@ -266,10 +311,16 @@ void dumpRoseGraph(const RoseBuild &build_base, const RoseEngine *t,
     DEBUG_PRINTF("dumping graph to %s\n", ss.str().c_str());
     ofstream os(ss.str());
 
-    RoseGraphWriter writer(build, t);
+    auto frag_map = makeFragMap(fragments);
+    RoseGraphWriter writer(build, frag_map, leftfix_queue_map, suffix_queue_map,
+                           t);
     writeGraphviz(os, build.g, writer, get(boost::vertex_index, build.g));
 }
 
+void dumpRoseGraph(const RoseBuildImpl &build, const char *filename) {
+    dumpRoseGraph(build, nullptr, {}, {}, {}, filename);
+}
+
 namespace {
 struct CompareVertexRole {
     explicit CompareVertexRole(const RoseGraph &g_in) : g(g_in) {}
@@ -294,21 +345,25 @@ void lit_graph_info(const RoseBuildImpl &build, const rose_literal_info &li,
 }
 
 static
-void dumpRoseLiterals(const RoseBuildImpl &build, const char *filename) {
+void dumpRoseLiterals(const RoseBuildImpl &build,
+                      const vector<LitFragment> &fragments,
+                      const Grey &grey) {
     const RoseGraph &g = build.g;
+    map<u32, u32> frag_map = makeFragMap(fragments);
 
     DEBUG_PRINTF("dumping literals\n");
-    ofstream os(filename);
+    ofstream os(grey.dumpPath + "rose_literals.txt");
 
-    os << "ROSE LITERALS: a total of " << build.literals.right.size()
-       << " literals and " << num_vertices(g) << " roles." << endl << endl;
+    os << "ROSE LITERALS: a total of " << build.literals.size()
+       << " literals and " << num_vertices(g) << " roles."
<< endl + << endl; - for (const auto &e : build.literals.right) { - u32 id = e.first; - const ue2_literal &s = e.second.s; + for (u32 id = 0; id < build.literals.size(); id++) { + const auto &lit = build.literals.at(id); + const ue2_literal &s = lit.s; const rose_literal_info &lit_info = build.literal_info[id]; - switch (e.second.table) { + switch (lit.table) { case ROSE_ANCHORED: os << "ANCHORED"; break; @@ -326,8 +381,11 @@ void dumpRoseLiterals(const RoseBuildImpl &build, const char *filename) { break; } - os << " ID " << id << "/" << lit_info.final_id << ": \"" - << escapeString(s.get_string()) << "\"" + os << " ID " << id; + if (contains(frag_map, id)) { + os << "/" << frag_map.at(id); + } + os << ": \"" << escapeString(s.get_string()) << "\"" << " (len " << s.length() << ","; if (s.any_nocase()) { os << " nocase,"; @@ -336,8 +394,8 @@ void dumpRoseLiterals(const RoseBuildImpl &build, const char *filename) { os << " benefits,"; } - if (e.second.delay) { - os << " delayed "<< e.second.delay << ","; + if (lit.delay) { + os << " delayed "<< lit.delay << ","; } os << " groups 0x" << hex << setw(16) << setfill('0') @@ -420,60 +478,1751 @@ string toHex(Iter i, const Iter &end) { } static -void dumpTestLiterals(const string &filename, const vector &lits) { - ofstream of(filename.c_str()); +bool isMetaChar(char c) { + switch (c) { + case '#': + case '$': + case '(': + case ')': + case '*': + case '+': + case '.': + case '/': + case '?': + case '[': + case '\\': + case ']': + case '^': + case '{': + case '|': + case '}': + return true; + default: + return false; + } +} - for (const hwlmLiteral &lit : lits) { - of << lit.id << "="; - if (lit.nocase) { - of << "!"; +static +string toRegex(const string &lit) { + ostringstream os; + for (char c : lit) { + if (0x20 <= c && c <= 0x7e) { + if (isMetaChar(c)) { + os << "\\" << c; + } else { + os << c; + } + } else if (c == '\n') { + os << "\\n"; + } else if (c == '\r') { + os << "\\r"; + } else if (c == '\t') { + os << "\\t"; + } else { + os << "\\x" << hex << setw(2) << setfill('0') + << (unsigned)(c & 0xff) << dec; } - of << toHex(lit.s.begin(), lit.s.end()); + } + return os.str(); +} + +void dumpMatcherLiterals(const vector &lits, const string &name, + const Grey &grey) { + if (!grey.dumpFlags) { + return; + } + + ofstream of(grey.dumpPath + "rose_" + name + "_test_literals.txt"); + + // Unique regex index, as literals may share an ID. + u32 i = 0; + + for (const hwlmLiteral &lit : lits) { + // First, detail in a comment. + of << "# id=" << lit.id; if (!lit.msk.empty()) { - of << " " << toHex(lit.msk.begin(), lit.msk.end()); - of << " " << toHex(lit.cmp.begin(), lit.cmp.end()); + of << " msk=0x" << toHex(lit.msk.begin(), lit.msk.end()); + of << " cmp=0x" << toHex(lit.cmp.begin(), lit.cmp.end()); + } + of << " groups=0x" << hex << setfill('0') << lit.groups << dec; + if (lit.noruns) { + of << " noruns"; } + of << endl; + + // Second, literal rendered as a regex. + of << i << ":/" << toRegex(lit.s) << (lit.nocase ? 
"/i" : "/"); of << endl; + + i++; } of.close(); } static -void dumpRoseTestLiterals(const RoseBuildImpl &build, const string &base) { - size_t historyRequired = build.calcHistoryRequired(); - size_t longLitLengthThreshold = - calcLongLitThreshold(build, historyRequired); +const void *loadFromByteCodeOffset(const RoseEngine *t, u32 offset) { + if (!offset) { + return nullptr; + } + + const char *lt = (const char *)t + offset; + return lt; +} + +static +const void *getAnchoredMatcher(const RoseEngine *t) { + return loadFromByteCodeOffset(t, t->amatcherOffset); +} + +static +const HWLM *getFloatingMatcher(const RoseEngine *t) { + return (const HWLM *)loadFromByteCodeOffset(t, t->fmatcherOffset); +} + +static +const HWLM *getDelayRebuildMatcher(const RoseEngine *t) { + return (const HWLM *)loadFromByteCodeOffset(t, t->drmatcherOffset); +} + +static +const HWLM *getEodMatcher(const RoseEngine *t) { + return (const HWLM *)loadFromByteCodeOffset(t, t->ematcherOffset); +} - auto lits = fillHamsterLiteralList(build, ROSE_ANCHORED, - longLitLengthThreshold); - dumpTestLiterals(base + "rose_anchored_test_literals.txt", lits); +static +const HWLM *getSmallBlockMatcher(const RoseEngine *t) { + return (const HWLM *)loadFromByteCodeOffset(t, t->sbmatcherOffset); +} - lits = fillHamsterLiteralList(build, ROSE_FLOATING, longLitLengthThreshold); - dumpTestLiterals(base + "rose_float_test_literals.txt", lits); +static +CharReach bitvectorToReach(const u8 *reach) { + CharReach cr; - lits = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED, - build.ematcher_region_size); - dumpTestLiterals(base + "rose_eod_test_literals.txt", lits); + for (size_t i = 0; i < N_CHARS; i++) { + if (reach[i / 8] & (1U << (i % 8))) { + cr.set(i); + } + } + return cr; +} - if (!build.cc.streaming) { - lits = fillHamsterLiteralList(build, ROSE_FLOATING, - ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); - auto lits2 = fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, - ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); - lits.insert(end(lits), begin(lits2), end(lits2)); - dumpTestLiterals(base + "rose_smallblock_test_literals.txt", lits); +static +CharReach multiBitvectorToReach(const u8 *reach, u8 path_mask) { + CharReach cr; + for (size_t i = 0; i < N_CHARS; i++) { + if (reach[i] & path_mask) { + cr.set(i); + } } + return cr; } -void dumpRose(const RoseBuild &build_base, const RoseEngine *t, - const Grey &grey) { - if (!grey.dumpFlags) { +static +void dumpLookaround(ofstream &os, const RoseEngine *t, + const ROSE_STRUCT_CHECK_LOOKAROUND *ri) { + assert(ri); + + const u8 *base = (const u8 *)t; + + const s8 *look = (const s8 *)base + ri->look_index; + const s8 *look_end = look + ri->count; + const u8 *reach = base + ri->reach_index; + + os << " contents:" << endl; + + for (; look < look_end; look++, reach += REACH_BITVECTOR_LEN) { + os << " " << std::setw(4) << std::setfill(' ') << int{*look} + << ": "; + describeClass(os, bitvectorToReach(reach), 1000, CC_OUT_TEXT); + os << endl; + } +} + +static +void dumpMultipathLookaround(ofstream &os, const RoseEngine *t, + const ROSE_STRUCT_MULTIPATH_LOOKAROUND *ri) { + assert(ri); + + const u8 *base = (const u8 *)t; + + const s8 *look_begin = (const s8 *)base + ri->look_index; + const s8 *look_end = look_begin + ri->count; + const u8 *reach_begin = base + ri->reach_index; + + os << " contents:" << endl; + + u32 path_mask = ri->start_mask[0]; + while (path_mask) { + u32 path = findAndClearLSB_32(&path_mask); + os << " Path #" << path << ":" << endl; + os << " "; + + const s8 *look = look_begin; + 
const u8 *reach = reach_begin; + for (; look < look_end; look++, reach += MULTI_REACH_BITVECTOR_LEN) { + CharReach cr = multiBitvectorToReach(reach, 1U << path); + if (cr.any() && !cr.all()) { + os << "<" << int(*look) << ": "; + describeClass(os, cr, 1000, CC_OUT_TEXT); + os << "> "; + } + } + os << endl; + } +} + +static +vector sparseIterValues(const mmbit_sparse_iter *it, u32 num_bits) { + vector keys; + + if (num_bits == 0) { + return keys; + } + + vector bits(mmbit_size(num_bits), u8{0xff}); // All bits on. + vector state(MAX_SPARSE_ITER_STATES); + + const u8 *b = bits.data(); + mmbit_sparse_state *s = state.data(); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(b, num_bits, &idx, it, s); + while (i != MMB_INVALID) { + keys.push_back(i); + i = mmbit_sparse_iter_next(b, num_bits, i, &idx, it, s); + } + + return keys; +} + +static +void dumpJumpTable(ofstream &os, const RoseEngine *t, + const ROSE_STRUCT_SPARSE_ITER_BEGIN *ri) { + auto *it = + (const mmbit_sparse_iter *)loadFromByteCodeOffset(t, ri->iter_offset); + auto *jumps = (const u32 *)loadFromByteCodeOffset(t, ri->jump_table); + + for (const auto &key : sparseIterValues(it, t->rolesWithStateCount)) { + os << " " << std::setw(4) << std::setfill(' ') << key << " : +" + << *jumps << endl; + ++jumps; + } +} + +static +void dumpSomOperation(ofstream &os, const som_operation &op) { + os << " som (type=" << u32{op.type} << ", onmatch=" << op.onmatch; + switch (op.type) { + case SOM_EXTERNAL_CALLBACK_REV_NFA: + case SOM_INTERNAL_LOC_SET_REV_NFA: + case SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET: + case SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE: + os << ", revNfaIndex=" << op.aux.revNfaIndex; + break; + default: + os << ", somDistance=" << op.aux.somDistance; + break; + } + os << ")" << endl; +} + +static +string dumpStrMask(const u8 *mask, size_t len) { + ostringstream oss; + for (size_t i = 0; i < len; i++) { + oss << std::hex << std::setw(2) << std::setfill('0') << u32{mask[i]} + << " "; + } + return oss.str(); +} + +static +CharReach shufti2cr(const u8 *lo, const u8 *hi, u8 bucket_mask) { + CharReach cr; + for (u32 i = 0; i < N_CHARS; i++) { + if(lo[i & 0xf] & hi[i >> 4] & bucket_mask) { + cr.set(i); + } + } + return cr; +} + +static +void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, + const u8 *bucket_mask, u32 neg_mask, s32 offset) { + assert(len == 16 || len == 32); + os << " contents:" << endl; + for (u32 idx = 0; idx < len; idx++) { + CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); + + if (neg_mask & (1U << idx)) { + cr.flip(); + } + + if (cr.any() && !cr.all()) { + os << " " << std::setw(4) << std::setfill(' ') + << int(offset + idx) << ": "; + describeClass(os, cr, 1000, CC_OUT_TEXT); + os << endl; + } + } +} + +static +void dumpLookaroundShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, + const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask, + const u8 *bucket_mask_2, u32 neg_mask, s32 offset) { + assert(len == 16 || len == 32); + os << " contents:" << endl; + for (u32 idx = 0; idx < len; idx++) { + CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); + cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]); + + if (neg_mask & (1U << idx)) { + cr.flip(); + } + + if (cr.any() && !cr.all()) { + os << " " << std::setw(4) << std::setfill(' ') + << int(offset + idx) << ": "; + describeClass(os, cr, 1000, CC_OUT_TEXT); + os << endl; + } + } +} + +static +void dumpMultipathShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, + const u8 *bucket_mask, const u8 *data_offset, + u64a neg_mask, s32 
base_offset) { + assert(len == 16 || len == 32 || len == 64); + os << " contents:" << endl; + u32 path = 0; + for (u32 idx = 0; idx < len; idx++) { + CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); + + if (neg_mask & (1ULL << idx)) { + cr.flip(); + } + + if (cr.any() && !cr.all()) { + if (idx == 0 || data_offset[idx - 1] > data_offset[idx]) { + path++; + if (idx) { + os << endl; + } + os << " Path #" << path << ":" << endl; + os << " "; + } + + os << "<" << int(base_offset + data_offset[idx]) << ": "; + describeClass(os, cr, 1000, CC_OUT_TEXT); + os << "> "; + } + } + os << endl; +} + +static +void dumpMultipathShufti(ofstream &os, u32 len, const u8 *lo, const u8 *hi, + const u8 *lo_2, const u8 *hi_2, const u8 *bucket_mask, + const u8 *bucket_mask_2, const u8 *data_offset, + u32 neg_mask, s32 base_offset) { + assert(len == 16 || len == 32 || len == 64); + os << " contents:"; + u32 path = 0; + for (u32 idx = 0; idx < len; idx++) { + CharReach cr = shufti2cr(lo, hi, bucket_mask[idx]); + cr |= shufti2cr(lo_2, hi_2, bucket_mask_2[idx]); + + if (neg_mask & (1ULL << idx)) { + cr.flip(); + } + + if (cr.any() && !cr.all()) { + if (idx == 0 || data_offset[idx - 1] > data_offset[idx]) { + path++; + os << endl; + os << " Path #" << path << ":" << endl; + os << " "; + } + + os << "<" << int(base_offset + data_offset[idx]) << ": "; + describeClass(os, cr, 1000, CC_OUT_TEXT); + os << "> "; + } + } + os << endl; +} + + #define PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + os << " " << std::setw(4) << std::setfill('0') << (pc - pc_base) \ + << ": " #name "\n"; \ + const auto *ri = (const struct ROSE_STRUCT_##name *)pc; + +#define PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + break; \ + } + + +static +void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) { + const char *pc_base = pc; + for (;;) { + u8 code = *(const u8 *)pc; + assert(code <= LAST_ROSE_INSTRUCTION); + const size_t offset = pc - pc_base; + switch (code) { + PROGRAM_CASE(END) { return; } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ANCHORED_DELAY) { + os << " groups 0x" << std::hex << ri->groups << std::dec + << endl; + os << " anch_id " << ri->anch_id << "\n"; + os << " done_jump " << offset + ri->done_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LIT_EARLY) { + os << " min_offset " << ri->min_offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_GROUPS) { + os << " groups 0x" << std::hex << ri->groups << std::dec + << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_ONLY_EOD) { + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BOUNDS) { + os << " min_bound " << ri->min_bound << endl; + os << " max_bound " << ri->max_bound << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_NOT_HANDLED) { + os << " key " << ri->key << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SINGLE_LOOKAROUND) { + os << " offset " << int{ri->offset} << endl; + os << " reach_index " << ri->reach_index << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + const u8 *reach = (const u8 *)t + ri->reach_index; + os << " contents "; + describeClass(os, bitvectorToReach(reach), 1000, CC_OUT_TEXT); + os << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LOOKAROUND) { + os << " look_index " << 
ri->look_index << endl; + os << " reach_index " << ri->reach_index << endl; + os << " count " << ri->count << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaround(os, t, ri); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MASK) { + os << " and_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->and_mask << std::dec << endl; + os << " cmp_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->cmp_mask << std::dec << endl; + os << " neg_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MASK_32) { + os << " and_mask " + << dumpStrMask(ri->and_mask, sizeof(ri->and_mask)) + << endl; + os << " cmp_mask " + << dumpStrMask(ri->cmp_mask, sizeof(ri->cmp_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BYTE) { + os << " and_mask 0x" << std::hex << std::setw(2) + << std::setfill('0') << u32{ri->and_mask} << std::dec + << endl; + os << " cmp_mask 0x" << std::hex << std::setw(2) + << std::setfill('0') << u32{ri->cmp_mask} << std::dec + << endl; + os << " negation " << u32{ri->negation} << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x8) { + os << " nib_mask " + << dumpStrMask(ri->nib_mask, sizeof(ri->nib_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 16, ri->nib_mask, ri->nib_mask + 16, + ri->bucket_select_mask, ri->neg_mask, + ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x8) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 32, ri->lo_mask, ri->hi_mask, + ri->bucket_select_mask, ri->neg_mask, + ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x16) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 16, ri->lo_mask, ri->hi_mask, + ri->lo_mask + 16, ri->hi_mask + 16, + ri->bucket_select_mask, + 
ri->bucket_select_mask + 16, + ri->neg_mask, ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x16) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask_hi " + << dumpStrMask(ri->bucket_select_mask_hi, + sizeof(ri->bucket_select_mask_hi)) + << endl; + os << " bucket_select_mask_lo " + << dumpStrMask(ri->bucket_select_mask_lo, + sizeof(ri->bucket_select_mask_lo)) + << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " offset " << ri->offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpLookaroundShufti(os, 32, ri->lo_mask, ri->hi_mask, + ri->lo_mask + 16, ri->hi_mask + 16, + ri->bucket_select_mask_lo, + ri->bucket_select_mask_hi, + ri->neg_mask, ri->offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_INFIX) { + os << " queue " << ri->queue << endl; + os << " lag " << ri->lag << endl; + os << " report " << ri->report << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_PREFIX) { + os << " queue " << ri->queue << endl; + os << " lag " << ri->lag << endl; + os << " report " << ri->report << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(PUSH_DELAYED) { + os << " delay " << u32{ri->delay} << endl; + os << " index " << ri->index << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DUMMY_NOP) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP_MPV) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ADJUST) { + os << " distance " << ri->distance << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_LEFTFIX) { + os << " queue " << ri->queue << endl; + os << " lag " << ri->lag << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_FROM_REPORT) { + dumpSomOperation(os, ri->som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ZERO) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_INFIX) { + os << " queue " << ri->queue << endl; + os << " event " << ri->event << endl; + os << " cancel " << u32{ri->cancel} << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_SUFFIX) { + os << " queue " << ri->queue << endl; + os << " event " << ri->event << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE) { + os << " quash_som " << u32{ri->quash_som} << endl; + os << " dkey " << ri->dkey << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_SOM) { + os << " quash_som " << u32{ri->quash_som} << endl; + os << " dkey " << ri->dkey << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_CHAIN) { + os << " event " << ri->event << endl; + os << " top_squash_distance " << ri->top_squash_distance + << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_INT) { + dumpSomOperation(os, ri->som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_AWARE) { + dumpSomOperation(os, ri->som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT) { + os << " onmatch " << ri->onmatch << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + } + 
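+            // Every case ends in PROGRAM_NEXT_INSTRUCTION, which advances pc
+            // by ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); the jump
+            // targets printed above are byte offsets from the program start
+            // (pc - pc_base), matching the "NNNN:" label on each opcode.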
PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_EXHAUST) { + os << " onmatch " << ri->onmatch << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + os << " ekey " << ri->ekey << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM) { + os << " onmatch " << ri->onmatch << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_EXHAUST) { + os << " onmatch " << ri->onmatch << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + os << " ekey " << ri->ekey << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_AND_REPORT) { + os << " quash_som " << u32{ri->quash_som} << endl; + os << " dkey " << ri->dkey << endl; + os << " onmatch " << ri->onmatch << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FINAL_REPORT) { + os << " onmatch " << ri->onmatch << endl; + os << " offset_adjust " << ri->offset_adjust << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_EXHAUSTED) { + os << " ekey " << ri->ekey << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MIN_LENGTH) { + os << " end_adj " << ri->end_adj << endl; + os << " min_length " << ri->min_length << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_STATE) { + os << " index " << ri->index << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_GROUPS) { + os << " groups 0x" << std::hex << ri->groups << std::dec + << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SQUASH_GROUPS) { + os << " groups 0x" << std::hex << ri->groups << std::dec + << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_STATE) { + os << " index " << ri->index << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_BEGIN) { + os << " iter_offset " << ri->iter_offset << endl; + os << " jump_table " << ri->jump_table << endl; + dumpJumpTable(os, t, ri); + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_NEXT) { + os << " iter_offset " << ri->iter_offset << endl; + os << " jump_table " << ri->jump_table << endl; + os << " state " << ri->state << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_ANY) { + os << " iter_offset " << ri->iter_offset << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ENGINES_EOD) { + os << " iter_offset " << ri->iter_offset << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SUFFIXES_EOD) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MATCHER_EOD) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT) { + os << " lit_offset " << ri->lit_offset << endl; + os << " lit_length " << ri->lit_length << endl; + const char *lit = (const char *)t + ri->lit_offset; + os << " literal: \"" + << escapeString(string(lit, ri->lit_length)) << "\"" << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { + os << " lit_offset " << ri->lit_offset << endl; + os << " lit_length " << ri->lit_length << endl; + const char *lit = (const char *)t + ri->lit_offset; + os << " literal: \"" + << escapeString(string(lit, ri->lit_length)) 
<< "\"" << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT) { + os << " lit_offset " << ri->lit_offset << endl; + os << " lit_length " << ri->lit_length << endl; + const char *lit = (const char *)t + ri->lit_offset; + os << " literal: \"" + << escapeString(string(lit, ri->lit_length)) << "\"" << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT_NOCASE) { + os << " lit_offset " << ri->lit_offset << endl; + os << " lit_length " << ri->lit_length << endl; + const char *lit = (const char *)t + ri->lit_offset; + os << " literal: \"" + << escapeString(string(lit, ri->lit_length)) << "\"" << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CLEAR_WORK_DONE) {} + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MULTIPATH_LOOKAROUND) { + os << " look_index " << ri->look_index << endl; + os << " reach_index " << ri->reach_index << endl; + os << " count " << ri->count << endl; + os << " last_start " << ri->last_start << endl; + os << " start_mask " + << dumpStrMask(ri->start_mask, sizeof(ri->start_mask)) + << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpMultipathLookaround(os, t, ri); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_16x8) { + os << " nib_mask " + << dumpStrMask(ri->nib_mask, sizeof(ri->nib_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " data_select_mask " + << dumpStrMask(ri->data_select_mask, + sizeof(ri->data_select_mask)) + << endl; + os << " hi_bits_mask 0x" << std::hex << std::setw(4) + << std::setfill('0') << ri->hi_bits_mask << std::dec << endl; + os << " lo_bits_mask 0x" << std::hex << std::setw(4) + << std::setfill('0') << ri->lo_bits_mask << std::dec << endl; + os << " neg_mask 0x" << std::hex << std::setw(4) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " base_offset " << ri->base_offset << endl; + os << " last_start " << ri->last_start << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpMultipathShufti(os, 16, ri->nib_mask, ri->nib_mask + 16, + ri->bucket_select_mask, + ri->data_select_mask, + ri->neg_mask, ri->base_offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x8) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " data_select_mask " + << dumpStrMask(ri->data_select_mask, + sizeof(ri->data_select_mask)) + << endl; + os << " hi_bits_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->hi_bits_mask << std::dec << endl; + os << " lo_bits_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->lo_bits_mask << std::dec << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " base_offset " << ri->base_offset << endl; + os << " last_start " << ri->last_start << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpMultipathShufti(os, 32, ri->lo_mask, ri->hi_mask, + ri->bucket_select_mask, + ri->data_select_mask, + ri->neg_mask, ri->base_offset); + } + PROGRAM_NEXT_INSTRUCTION + + 
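+            // The x16 shufti checks below carry two tables (the two halves
+            // of hi_mask/lo_mask plus a second bucket select mask), doubling
+            // the number of representable buckets; the dump helpers OR the
+            // two per-bucket reaches together when reconstructing each
+            // position's CharReach.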
PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x16) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask_hi " + << dumpStrMask(ri->bucket_select_mask_hi, + sizeof(ri->bucket_select_mask_hi)) + << endl; + os << " bucket_select_mask_lo " + << dumpStrMask(ri->bucket_select_mask_lo, + sizeof(ri->bucket_select_mask_lo)) + << endl; + os << " data_select_mask " + << dumpStrMask(ri->data_select_mask, + sizeof(ri->data_select_mask)) + << endl; + os << " hi_bits_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->hi_bits_mask << std::dec << endl; + os << " lo_bits_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->lo_bits_mask << std::dec << endl; + os << " neg_mask 0x" << std::hex << std::setw(8) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " base_offset " << ri->base_offset << endl; + os << " last_start " << ri->last_start << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpMultipathShufti(os, 32, ri->lo_mask, ri->hi_mask, + ri->lo_mask + 16, ri->hi_mask + 16, + ri->bucket_select_mask_lo, + ri->bucket_select_mask_hi, + ri->data_select_mask, + ri->neg_mask, ri->base_offset); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_64) { + os << " hi_mask " + << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask)) + << endl; + os << " lo_mask " + << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask)) + << endl; + os << " bucket_select_mask " + << dumpStrMask(ri->bucket_select_mask, + sizeof(ri->bucket_select_mask)) + << endl; + os << " data_select_mask " + << dumpStrMask(ri->data_select_mask, + sizeof(ri->data_select_mask)) + << endl; + os << " hi_bits_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->hi_bits_mask << std::dec << endl; + os << " lo_bits_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->lo_bits_mask << std::dec << endl; + os << " neg_mask 0x" << std::hex << std::setw(16) + << std::setfill('0') << ri->neg_mask << std::dec << endl; + os << " base_offset " << ri->base_offset << endl; + os << " last_start " << ri->last_start << endl; + os << " fail_jump " << offset + ri->fail_jump << endl; + dumpMultipathShufti(os, 64, ri->lo_mask, ri->hi_mask, + ri->bucket_select_mask, + ri->data_select_mask, + ri->neg_mask, ri->base_offset); + } + PROGRAM_NEXT_INSTRUCTION + + default: + os << " UNKNOWN (code " << int{code} << ")" << endl; + os << " " << endl; + return; + } + } +} + +#undef PROGRAM_CASE +#undef PROGRAM_NEXT_INSTRUCTION + +static +void dumpRoseLitPrograms(const vector &fragments, + const RoseEngine *t, const string &filename) { + ofstream os(filename); + + // Collect all programs referenced by a literal fragment. 
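+    // Both the literal match program and the delay rebuild program are
+    // gathered; fragments may share a program, so the offsets are sorted
+    // and deduplicated before each program is dumped exactly once.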
+ vector programs; + for (const auto &frag : fragments) { + if (frag.lit_program_offset) { + programs.push_back(frag.lit_program_offset); + } + if (frag.delay_program_offset) { + programs.push_back(frag.delay_program_offset); + } + } + sort_and_unique(programs); + + for (u32 prog_offset : programs) { + os << "Program @ " << prog_offset << ":" << endl; + const char *prog = (const char *)loadFromByteCodeOffset(t, prog_offset); + dumpProgram(os, t, prog); + os << endl; + } + + os.close(); +} + +static +void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) { + ofstream os(filename); + const char *base = (const char *)t; + + if (t->eodProgramOffset) { + os << "EOD Program @ " << t->eodProgramOffset << ":" << endl; + dumpProgram(os, t, base + t->eodProgramOffset); + os << endl; + } else { + os << "" << endl; + } + + os.close(); +} + +static +void dumpRoseReportPrograms(const RoseEngine *t, const string &filename) { + ofstream os(filename); + + const u32 *programs = + (const u32 *)loadFromByteCodeOffset(t, t->reportProgramOffset); + + for (u32 i = 0; i < t->reportProgramCount; i++) { + os << "Report " << i << endl; + os << "---------------" << endl; + + if (programs[i]) { + os << "Program @ " << programs[i] << ":" << endl; + const char *prog = + (const char *)loadFromByteCodeOffset(t, programs[i]); + dumpProgram(os, t, prog); + } else { + os << "" << endl; + } + } + + os.close(); +} + +static +void dumpRoseAnchoredPrograms(const RoseEngine *t, const string &filename) { + ofstream os(filename); + + const u32 *programs = + (const u32 *)loadFromByteCodeOffset(t, t->anchoredProgramOffset); + + for (u32 i = 0; i < t->anchored_count; i++) { + os << "Anchored entry " << i << endl; + os << "---------------" << endl; + + if (programs[i]) { + os << "Program @ " << programs[i] << ":" << endl; + const char *prog = + (const char *)loadFromByteCodeOffset(t, programs[i]); + dumpProgram(os, t, prog); + } else { + os << "" << endl; + } + os << endl; + } + + os.close(); +} + +static +void dumpRoseDelayPrograms(const RoseEngine *t, const string &filename) { + ofstream os(filename); + + const u32 *programs = + (const u32 *)loadFromByteCodeOffset(t, t->delayProgramOffset); + + for (u32 i = 0; i < t->delay_count; i++) { + os << "Delay entry " << i << endl; + os << "---------------" << endl; + + if (programs[i]) { + os << "Program @ " << programs[i] << ":" << endl; + const char *prog = + (const char *)loadFromByteCodeOffset(t, programs[i]); + dumpProgram(os, t, prog); + } else { + os << "" << endl; + } + os << endl; + } + + os.close(); +} + +static +void dumpNfaNotes(ofstream &fout, const RoseEngine *t, const NFA *n) { + const u32 qindex = n->queueIndex; + + if (qindex < t->outfixBeginQueue) { + fout << "chained"; + return; + } + + if (qindex < t->outfixEndQueue) { + fout << "outfix"; + return; + } + + const NfaInfo *nfa_info = getNfaInfoByQueue(t, qindex); + const NFA *nfa = getNfaByInfo(t, nfa_info); + + if (nfa_info->eod) { + fout << "eod "; + } + + if (qindex < t->leftfixBeginQueue) { + fout << "suffix"; + return; + } + + const LeftNfaInfo *left = getLeftInfoByQueue(t, qindex); + if (left->eager) { + fout << "eager "; + } + if (left->transient) { + fout << "transient " << (u32)left->transient << " "; + } + if (left->infix) { + fout << "infix"; + u32 maxQueueLen = left->maxQueueLen; + if (maxQueueLen != (u32)(-1)) { + fout << " maxqlen=" << maxQueueLen; + } + } else { + fout << "prefix"; + } + fout << " maxlag=" << left->maxLag; + if (left->stopTable) { + fout << " miracles"; + } + if 
(left->countingMiracleOffset) { + const RoseCountingMiracle *cm + = (const RoseCountingMiracle *)((const char *)t + + left->countingMiracleOffset); + fout << " counting_miracle:" << (int)cm->count + << (cm->shufti ? "s" : "v"); + } + if (nfaSupportsZombie(nfa)) { + fout << " zombie"; + } + if (left->eod_check) { + fout << " eod"; + } +} + +static +void dumpComponentInfo(const RoseEngine *t, const string &base) { + stringstream ss; + ss << base << "rose_components.txt"; + ofstream fout(ss.str().c_str()); + + fout << "Index Offset\tEngine \tStates S.State Bytes Notes\n"; + + for (u32 i = 0; i < t->queueCount; i++) { + const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); + const NFA *n = getNfaByInfo(t, nfa_info); + + fout << left << setw(6) << i << " "; + + fout << left << ((const char *)n - (const char *)t) << "\t"; /* offset */ + + fout << left << setw(16) << describe(*n) << "\t"; + + fout << left << setw(6) << n->nPositions << " "; + fout << left << setw(7) << n->streamStateSize << " "; + fout << left << setw(7) << n->length << " "; + + dumpNfaNotes(fout, t, n); + + fout << endl; + } +} + + +static +void dumpComponentInfoCsv(const RoseEngine *t, const string &base) { + FILE *f = fopen((base +"rose_components.csv").c_str(), "w"); + + fprintf(f, "Index, Offset,Engine Type,States,Stream State,Bytecode Size," + "Kind,Notes\n"); + + for (u32 i = 0; i < t->queueCount; i++) { + const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); + const NFA *n = getNfaByInfo(t, nfa_info); + nfa_kind kind; + stringstream notes; + + if (i < t->outfixBeginQueue) { + notes << "chained;"; + } + + if (nfa_info->eod) { + notes << "eod;"; + } + + if (i < t->outfixEndQueue) { + kind = NFA_OUTFIX; + } else if (i < t->leftfixBeginQueue) { + kind = NFA_SUFFIX; + } else { + const LeftNfaInfo *left = getLeftInfoByQueue(t, i); + if (left->eager) { + notes << "eager;"; + } + if (left->transient) { + notes << "transient " << (u32)left->transient << ";"; + } + if (left->infix) { + kind = NFA_INFIX; + u32 maxQueueLen = left->maxQueueLen; + if (maxQueueLen != (u32)(-1)) { + notes << "maxqlen=" << maxQueueLen << ";"; + } + } else { + kind = NFA_PREFIX; + } + notes << "maxlag=" << left->maxLag << ";"; + if (left->stopTable) { + notes << "miracles;"; + } + if (left->countingMiracleOffset) { + auto cm = (const RoseCountingMiracle *) + ((const char *)t + left->countingMiracleOffset); + notes << "counting_miracle:" << (int)cm->count + << (cm->shufti ? 
"s" : "v") << ";"; + } + if (nfaSupportsZombie(n)) { + notes << " zombie;"; + } + if (left->eod_check) { + notes << "left_eod;"; + } + } + + fprintf(f, "%u,%zd,\"%s\",%u,%u,%u,%s,%s\n", i, + (const char *)n - (const char *)t, describe(*n).c_str(), + n->nPositions, n->streamStateSize, n->length, + to_string(kind).c_str(), notes.str().c_str()); + } + fclose(f); +} + +static +void dumpExhaust(const RoseEngine *t, const string &base) { + stringstream sstxt; + sstxt << base << "rose_exhaust.txt"; + FILE *f = fopen(sstxt.str().c_str(), "w"); + + const NfaInfo *infos + = (const NfaInfo *)((const char *)t + t->nfaInfoOffset); + + u32 queue_count = t->activeArrayCount; + + for (u32 i = 0; i < queue_count; ++i) { + u32 ekey_offset = infos[i].ekeyListOffset; + + fprintf(f, "%u (%u):", i, ekey_offset); + + if (ekey_offset) { + const u32 *ekeys = (const u32 *)((const char *)t + ekey_offset); + while (1) { + u32 e = *ekeys; + ++ekeys; + if (e == ~0U) { + break; + } + fprintf(f, " %u", e); + } + } + + fprintf(f, "\n"); + } + + fclose(f); +} + +static +void dumpNfas(const RoseEngine *t, bool dump_raw, const string &base) { + dumpExhaust(t, base); + + for (u32 i = 0; i < t->queueCount; i++) { + const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); + const NFA *n = getNfaByInfo(t, nfa_info); + + stringstream ssbase; + ssbase << base << "rose_nfa_" << i; + nfaGenerateDumpFiles(n, ssbase.str()); + + if (dump_raw) { + stringstream ssraw; + ssraw << base << "rose_nfa_" << i << ".raw"; + FILE *f = fopen(ssraw.str().c_str(), "w"); + fwrite(n, 1, n->length, f); + fclose(f); + } + } +} + +static +void dumpRevComponentInfo(const RoseEngine *t, const string &base) { + stringstream ss; + ss << base << "som_rev_components.txt"; + ofstream fout(ss.str().c_str()); + + fout << "Index Offset\tEngine \tStates S.State Bytes\n"; + + const char *tp = (const char *)t; + const u32 *rev_offsets = (const u32 *)(tp + t->somRevOffsetOffset); + + for (u32 i = 0; i < t->somRevCount; i++) { + u32 offset = rev_offsets[i]; + const NFA *n = (const NFA *)(tp + offset); + + fout << left << setw(6) << i << " "; + + fout << left << offset << "\t"; /* offset */ + + fout << left << setw(16) << describe(*n) << "\t"; + + fout << left << setw(6) << n->nPositions << " "; + fout << left << setw(7) << n->streamStateSize << " "; + fout << left << setw(7) << n->length; + fout << endl; + } +} + +static +void dumpRevNfas(const RoseEngine *t, bool dump_raw, const string &base) { + const char *tp = (const char *)t; + const u32 *rev_offsets = (const u32 *)(tp + t->somRevOffsetOffset); + + for (u32 i = 0; i < t->somRevCount; i++) { + const NFA *n = (const NFA *)(tp + rev_offsets[i]); + + stringstream ssbase; + ssbase << base << "som_rev_nfa_" << i; + nfaGenerateDumpFiles(n, ssbase.str()); + + if (dump_raw) { + stringstream ssraw; + ssraw << base << "som_rev_nfa_" << i << ".raw"; + FILE *f = fopen(ssraw.str().c_str(), "w"); + fwrite(n, 1, n->length, f); + fclose(f); + } + } +} + +static +void dumpAnchored(const RoseEngine *t, const string &base) { + u32 i = 0; + const anchored_matcher_info *curr + = (const anchored_matcher_info *)getALiteralMatcher(t); + + while (curr) { + const NFA *n = (const NFA *)((const char *)curr + sizeof(*curr)); + + stringstream ssbase; + ssbase << base << "anchored_" << i; + nfaGenerateDumpFiles(n, ssbase.str()); + + curr = curr->next_offset ? 
(const anchored_matcher_info *) + ((const char *)curr + curr->next_offset) : nullptr; + i++; + }; +} + +static +void dumpAnchoredStats(const void *atable, FILE *f) { + assert(atable); + + u32 i = 0; + const anchored_matcher_info *curr = (const anchored_matcher_info *)atable; + + while (curr) { + const NFA *n = (const NFA *)((const char *)curr + sizeof(*curr)); + + fprintf(f, " NFA %u: %s, %u states (%u bytes)\n", i, + describe(*n).c_str(), n->nPositions, n->length); + + curr = curr->next_offset ? (const anchored_matcher_info *) + ((const char *)curr + curr->next_offset) : nullptr; + i++; + }; + +} + +static +void dumpLongLiteralSubtable(const RoseLongLitTable *ll_table, + const RoseLongLitSubtable *ll_sub, FILE *f) { + if (!ll_sub->hashBits) { + fprintf(f, " \n"); + return; + } + + const char *base = (const char *)ll_table; + + u32 nbits = ll_sub->hashBits; + u32 num_entries = 1U << nbits; + const auto *tab = (const RoseLongLitHashEntry *)(base + ll_sub->hashOffset); + u32 hash_occ = + count_if(tab, tab + num_entries, [](const RoseLongLitHashEntry &ent) { + return ent.str_offset != 0; + }); + float hash_occ_percent = ((float)hash_occ / (float)num_entries) * 100; + + fprintf(f, " hash table : %u bits, occupancy %u/%u (%0.1f%%)\n", + nbits, hash_occ, num_entries, hash_occ_percent); + + u32 bloom_bits = ll_sub->bloomBits; + u32 bloom_size = 1U << bloom_bits; + const u8 *bloom = (const u8 *)base + ll_sub->bloomOffset; + u32 bloom_occ = accumulate(bloom, bloom + bloom_size / 8, 0, + [](const u32 &sum, const u8 &elem) { return sum + popcount32(elem); }); + float bloom_occ_percent = ((float)bloom_occ / (float)(bloom_size)) * 100; + + fprintf(f, " bloom filter : %u bits, occupancy %u/%u (%0.1f%%)\n", + bloom_bits, bloom_occ, bloom_size, bloom_occ_percent); +} + +static +void dumpLongLiteralTable(const RoseEngine *t, FILE *f) { + if (!t->longLitTableOffset) { + return; + } + + fprintf(f, "\n"); + fprintf(f, "Long literal table (streaming):\n"); + + const auto *ll_table = + (const struct RoseLongLitTable *)loadFromByteCodeOffset( + t, t->longLitTableOffset); + + fprintf(f, " total size : %u bytes\n", ll_table->size); + fprintf(f, " longest len : %u\n", ll_table->maxLen); + fprintf(f, " stream state : %u bytes\n", ll_table->streamStateBytes); + + fprintf(f, " caseful:\n"); + dumpLongLiteralSubtable(ll_table, &ll_table->caseful, f); + + fprintf(f, " nocase:\n"); + dumpLongLiteralSubtable(ll_table, &ll_table->nocase, f); +} + +static +void roseDumpText(const RoseEngine *t, FILE *f) { + if (!t) { + fprintf(f, "<< no rose >>\n"); return; } - const RoseBuildImpl &build = dynamic_cast(build_base); + const void *atable = getAnchoredMatcher(t); + const HWLM *ftable = getFloatingMatcher(t); + const HWLM *drtable = getDelayRebuildMatcher(t); + const HWLM *etable = getEodMatcher(t); + const HWLM *sbtable = getSmallBlockMatcher(t); + + fprintf(f, "Rose:\n\n"); + + fprintf(f, "mode: : "); + switch(t->mode) { + case HS_MODE_BLOCK: + fprintf(f, "block"); + break; + case HS_MODE_STREAM: + fprintf(f, "streaming"); + break; + case HS_MODE_VECTORED: + fprintf(f, "vectored"); + break; + } + fprintf(f, "\n"); + + fprintf(f, "properties :"); + if (t->canExhaust) { + fprintf(f, " canExhaust"); + } + if (t->hasSom) { + fprintf(f, " hasSom"); + } + if (t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL) { + fprintf(f, " pureLiteral"); + } + if (t->runtimeImpl == ROSE_RUNTIME_SINGLE_OUTFIX) { + fprintf(f, " soleOutfix"); + } + fprintf(f, "\n"); + + fprintf(f, "dkey count : %u\n", t->dkeyCount); + fprintf(f, "som slot count : %u\n", 
t->somLocationCount); + fprintf(f, "som width : %u bytes\n", t->somHorizon); + fprintf(f, "rose count : %u\n", t->roseCount); + fprintf(f, "\n"); + + fprintf(f, "total engine size : %u bytes\n", t->size); + fprintf(f, " - anchored matcher : %u bytes over %u bytes\n", t->asize, + t->anchoredDistance); + fprintf(f, " - floating matcher : %zu bytes%s", + ftable ? hwlmSize(ftable) : 0, t->noFloatingRoots ? " (cond)":""); + if (t->floatingMinDistance) { + fprintf(f, " from %s bytes\n", + rose_off(t->floatingMinDistance).str().c_str()); + } + if (t->floatingDistance != ROSE_BOUND_INF && ftable) { + fprintf(f, " over %u bytes\n", t->floatingDistance); + } else { + fprintf(f, "\n"); + } + fprintf(f, " - delay-rb matcher : %zu bytes\n", + drtable ? hwlmSize(drtable) : 0); + fprintf(f, " - eod-anch matcher : %zu bytes over last %u bytes\n", + etable ? hwlmSize(etable) : 0, t->ematcherRegionSize); + fprintf(f, " - small-blk matcher : %zu bytes over %u bytes\n", + sbtable ? hwlmSize(sbtable) : 0, t->smallBlockDistance); + fprintf(f, " - role state table : %zu bytes\n", + t->rolesWithStateCount * sizeof(u32)); + fprintf(f, " - nfa info table : %zu bytes\n", + t->queueCount * sizeof(NfaInfo)); + + fprintf(f, "state space required : %u bytes\n", t->stateOffsets.end); + fprintf(f, " - history buffer : %u bytes\n", t->historyRequired); + fprintf(f, " - exhaustion vector : %u bytes\n", (t->ekeyCount + 7) / 8); + fprintf(f, " - role state mmbit : %u bytes\n", t->stateSize); + fprintf(f, " - long lit matcher : %u bytes\n", t->longLitStreamState); + fprintf(f, " - active array : %u bytes\n", + mmbit_size(t->activeArrayCount)); + fprintf(f, " - active rose : %u bytes\n", + mmbit_size(t->activeLeftCount)); + fprintf(f, " - anchored state : %u bytes\n", t->anchorStateSize); + fprintf(f, " - nfa state : %u bytes\n", t->nfaStateSize); + fprintf(f, " - (trans. 
nfa state): %u bytes\n", t->tStateSize); + fprintf(f, " - one whole bytes : %u bytes\n", + t->stateOffsets.anchorState - t->stateOffsets.leftfixLagTable); + fprintf(f, " - groups : %u bytes\n", + t->stateOffsets.groups_size); + fprintf(f, "\n"); + + fprintf(f, "initial groups : 0x%016llx\n", t->initialGroups); + fprintf(f, "floating groups : 0x%016llx\n", t->floating_group_mask); + fprintf(f, "handled key count : %u\n", t->handledKeyCount); + fprintf(f, "\n"); + + fprintf(f, "total literal count : %u\n", t->totalNumLiterals); + fprintf(f, " delayed literals : %u\n", t->delay_count); + + fprintf(f, "\n"); + fprintf(f, " minWidth : %u\n", t->minWidth); + fprintf(f, " minWidthExcludingBoundaries : %u\n", + t->minWidthExcludingBoundaries); + fprintf(f, " maxBiAnchoredWidth : %s\n", + rose_off(t->maxBiAnchoredWidth).str().c_str()); + fprintf(f, " minFloatLitMatchOffset : %s\n", + rose_off(t->floatingMinLiteralMatchOffset).str().c_str()); + fprintf(f, " maxFloatingDelayedMatch : %s\n", + rose_off(t->maxFloatingDelayedMatch).str().c_str()); + + if (atable) { + fprintf(f, "\nAnchored literal matcher stats:\n\n"); + dumpAnchoredStats(atable, f); + } + + if (ftable) { + fprintf(f, "\nFloating literal matcher stats:\n\n"); + hwlmPrintStats(ftable, f); + } + + if (drtable) { + fprintf(f, "\nDelay Rebuild literal matcher stats:\n\n"); + hwlmPrintStats(drtable, f); + } + + if (etable) { + fprintf(f, "\nEOD-anchored literal matcher stats:\n\n"); + hwlmPrintStats(etable, f); + } + + if (sbtable) { + fprintf(f, "\nSmall-block literal matcher stats:\n\n"); + hwlmPrintStats(sbtable, f); + } + + dumpLongLiteralTable(t, f); +} + +#define DUMP_U8(o, member) \ + fprintf(f, " %-32s: %hhu/%hhx\n", #member, o->member, o->member) +#define DUMP_U32(o, member) \ + fprintf(f, " %-32s: %u/%08x\n", #member, o->member, o->member) +#define DUMP_U64(o, member) \ + fprintf(f, " %-32s: %llu/%016llx\n", #member, o->member, o->member) + +static +void roseDumpStructRaw(const RoseEngine *t, FILE *f) { + fprintf(f, "struct RoseEngine {\n"); + DUMP_U8(t, noFloatingRoots); + DUMP_U8(t, requiresEodCheck); + DUMP_U8(t, hasOutfixesInSmallBlock); + DUMP_U8(t, runtimeImpl); + DUMP_U8(t, mpvTriggeredByLeaf); + DUMP_U8(t, canExhaust); + DUMP_U8(t, hasSom); + DUMP_U8(t, somHorizon); + DUMP_U32(t, mode); + DUMP_U32(t, historyRequired); + DUMP_U32(t, ekeyCount); + DUMP_U32(t, dkeyCount); + DUMP_U32(t, dkeyLogSize); + DUMP_U32(t, invDkeyOffset); + DUMP_U32(t, somLocationCount); + DUMP_U32(t, somLocationFatbitSize); + DUMP_U32(t, rolesWithStateCount); + DUMP_U32(t, stateSize); + DUMP_U32(t, anchorStateSize); + DUMP_U32(t, nfaStateSize); + DUMP_U32(t, tStateSize); + DUMP_U32(t, smallWriteOffset); + DUMP_U32(t, amatcherOffset); + DUMP_U32(t, ematcherOffset); + DUMP_U32(t, fmatcherOffset); + DUMP_U32(t, drmatcherOffset); + DUMP_U32(t, sbmatcherOffset); + DUMP_U32(t, longLitTableOffset); + DUMP_U32(t, amatcherMinWidth); + DUMP_U32(t, fmatcherMinWidth); + DUMP_U32(t, eodmatcherMinWidth); + DUMP_U32(t, amatcherMaxBiAnchoredWidth); + DUMP_U32(t, fmatcherMaxBiAnchoredWidth); + DUMP_U32(t, reportProgramOffset); + DUMP_U32(t, reportProgramCount); + DUMP_U32(t, delayProgramOffset); + DUMP_U32(t, anchoredProgramOffset); + DUMP_U32(t, activeArrayCount); + DUMP_U32(t, activeLeftCount); + DUMP_U32(t, queueCount); + DUMP_U32(t, activeQueueArraySize); + DUMP_U32(t, eagerIterOffset); + DUMP_U32(t, handledKeyCount); + DUMP_U32(t, handledKeyFatbitSize); + DUMP_U32(t, leftOffset); + DUMP_U32(t, roseCount); + DUMP_U32(t, eodProgramOffset); + DUMP_U32(t, 
lastByteHistoryIterOffset); + DUMP_U32(t, minWidth); + DUMP_U32(t, minWidthExcludingBoundaries); + DUMP_U32(t, maxBiAnchoredWidth); + DUMP_U32(t, anchoredDistance); + DUMP_U32(t, anchoredMinDistance); + DUMP_U32(t, floatingDistance); + DUMP_U32(t, floatingMinDistance); + DUMP_U32(t, smallBlockDistance); + DUMP_U32(t, floatingMinLiteralMatchOffset); + DUMP_U32(t, nfaInfoOffset); + DUMP_U64(t, initialGroups); + DUMP_U64(t, floating_group_mask); + DUMP_U32(t, size); + DUMP_U32(t, delay_count); + DUMP_U32(t, delay_fatbit_size); + DUMP_U32(t, anchored_count); + DUMP_U32(t, anchored_fatbit_size); + DUMP_U32(t, maxFloatingDelayedMatch); + DUMP_U32(t, delayRebuildLength); + DUMP_U32(t, stateOffsets.history); + DUMP_U32(t, stateOffsets.exhausted); + DUMP_U32(t, stateOffsets.activeLeafArray); + DUMP_U32(t, stateOffsets.activeLeftArray); + DUMP_U32(t, stateOffsets.activeLeftArray_size); + DUMP_U32(t, stateOffsets.leftfixLagTable); + DUMP_U32(t, stateOffsets.anchorState); + DUMP_U32(t, stateOffsets.groups); + DUMP_U32(t, stateOffsets.groups_size); + DUMP_U32(t, stateOffsets.longLitState); + DUMP_U32(t, stateOffsets.somLocation); + DUMP_U32(t, stateOffsets.somValid); + DUMP_U32(t, stateOffsets.somWritable); + DUMP_U32(t, stateOffsets.end); + DUMP_U32(t, boundary.reportEodOffset); + DUMP_U32(t, boundary.reportZeroOffset); + DUMP_U32(t, boundary.reportZeroEodOffset); + DUMP_U32(t, totalNumLiterals); + DUMP_U32(t, asize); + DUMP_U32(t, outfixBeginQueue); + DUMP_U32(t, outfixEndQueue); + DUMP_U32(t, leftfixBeginQueue); + DUMP_U32(t, initMpvNfa); + DUMP_U32(t, rosePrefixCount); + DUMP_U32(t, activeLeftIterOffset); + DUMP_U32(t, ematcherRegionSize); + DUMP_U32(t, somRevCount); + DUMP_U32(t, somRevOffsetOffset); + DUMP_U32(t, longLitStreamState); + fprintf(f, "}\n"); + fprintf(f, "sizeof(RoseEngine) = %zu\n", sizeof(RoseEngine)); +} + +static +void roseDumpComponents(const RoseEngine *t, bool dump_raw, + const string &base) { + dumpComponentInfo(t, base); + dumpComponentInfoCsv(t, base); + dumpNfas(t, dump_raw, base); + dumpAnchored(t, base); + dumpRevComponentInfo(t, base); + dumpRevNfas(t, dump_raw, base); +} + +static +void roseDumpPrograms(const vector &fragments, const RoseEngine *t, + const string &base) { + dumpRoseLitPrograms(fragments, t, base + "/rose_lit_programs.txt"); + dumpRoseEodPrograms(t, base + "/rose_eod_programs.txt"); + dumpRoseReportPrograms(t, base + "/rose_report_programs.txt"); + dumpRoseAnchoredPrograms(t, base + "/rose_anchored_programs.txt"); + dumpRoseDelayPrograms(t, base + "/rose_delay_programs.txt"); +} + +void dumpRose(const RoseBuildImpl &build, const vector &fragments, + const map &leftfix_queue_map, + const map &suffix_queue_map, + const RoseEngine *t) { + const Grey &grey = build.cc.grey; + + if (!grey.dumpFlags) { + return; + } stringstream ss; ss << grey.dumpPath << "rose.txt"; @@ -492,16 +2241,14 @@ void dumpRose(const RoseBuild &build_base, const RoseEngine *t, fclose(f); roseDumpComponents(t, false, grey.dumpPath); + roseDumpPrograms(fragments, t, grey.dumpPath); // Graph. - dumpRoseGraph(build, t, "rose.dot"); - - // Literals. 
- ss.str(""); - ss.clear(); - ss << grey.dumpPath << "rose_literals.txt"; - dumpRoseLiterals(build, ss.str().c_str()); - dumpRoseTestLiterals(build, grey.dumpPath); + dumpRoseGraph(build, t, fragments, leftfix_queue_map, suffix_queue_map, + "rose.dot"); + + // Literals + dumpRoseLiterals(build, fragments, grey); f = fopen((grey.dumpPath + "/rose_struct.txt").c_str(), "w"); roseDumpStructRaw(t, f); diff --git a/src/rose/rose_build_dump.h b/src/rose/rose_build_dump.h index 28e9f53ab..d4c620a3e 100644 --- a/src/rose/rose_build_dump.h +++ b/src/rose/rose_build_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,28 +29,51 @@ #ifndef ROSE_BUILD_DUMP_H #define ROSE_BUILD_DUMP_H +#include "ue2common.h" + +#include +#include +#include + struct RoseEngine; namespace ue2 { -class RoseBuild; +class RoseBuildImpl; struct Grey; +struct hwlmLiteral; +struct LitFragment; +struct left_id; +struct suffix_id; #ifdef DUMP_SUPPORT // Dump the Rose graph in graphviz representation. -void dumpRoseGraph(const RoseBuild &build, const RoseEngine *t, - const char *filename); +void dumpRoseGraph(const RoseBuildImpl &build, const char *filename); + +void dumpRose(const RoseBuildImpl &build, + const std::vector &fragments, + const std::map &leftfix_queue_map, + const std::map &suffix_queue_map, + const RoseEngine *t); + +void dumpMatcherLiterals(const std::vector &lits, + const std::string &name, const Grey &grey); -void dumpRose(const RoseBuild &build_base, const RoseEngine *t, - const Grey &grey); #else static UNUSED -void dumpRoseGraph(const RoseBuild &, const RoseEngine *, const char *) { +void dumpRoseGraph(const RoseBuildImpl &, const char *) { +} + +static UNUSED +void dumpRose(const RoseBuildImpl &, const std::vector &, + const std::map &, const std::map &, + const RoseEngine *) { } static UNUSED -void dumpRose(const RoseBuild &, const RoseEngine *, const Grey &) { +void dumpMatcherLiterals(const std::vector &, const std::string &, + const Grey &) { } #endif diff --git a/src/rose/rose_build_engine_blob.cpp b/src/rose/rose_build_engine_blob.cpp new file mode 100644 index 000000000..d39572070 --- /dev/null +++ b/src/rose/rose_build_engine_blob.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "rose_build_engine_blob.h"
+
+#include "rose_build_lookaround.h"
+#include "util/charreach_util.h"
+
+using namespace std;
+
+namespace ue2 {
+
+u32 lookaround_info::get_offset_of(const vector<vector<CharReach>> &reaches,
+                                   RoseEngineBlob &blob) {
+    assert(reaches.size() != 1);
+
+    // Check the cache.
+    auto it = multi_cache.find(reaches);
+    if (it != multi_cache.end()) {
+        DEBUG_PRINTF("reusing reach at idx %u\n", it->second);
+        return it->second;
+    }
+
+    vector<u8> raw_reach(reaches.size() * MULTI_REACH_BITVECTOR_LEN);
+    size_t off = 0;
+    for (const auto &m : reaches) {
+        u8 u = 0;
+        assert(m.size() == MAX_LOOKAROUND_PATHS);
+        for (size_t i = 0; i < m.size(); i++) {
+            if (m[i].none()) {
+                u |= (u8)1U << i;
+            }
+        }
+        fill_n(raw_reach.data() + off, MULTI_REACH_BITVECTOR_LEN, u);
+
+        for (size_t i = 0; i < m.size(); i++) {
+            const CharReach &cr = m[i];
+            if (cr.none()) {
+                continue;
+            }
+
+            for (size_t c = cr.find_first(); c != cr.npos;
+                 c = cr.find_next(c)) {
+                raw_reach[c + off] |= (u8)1U << i;
+            }
+        }
+
+        off += MULTI_REACH_BITVECTOR_LEN;
+    }
+
+    u32 reach_idx = blob.add_range(raw_reach);
+    DEBUG_PRINTF("adding reach at idx %u\n", reach_idx);
+    multi_cache.emplace(reaches, reach_idx);
+
+    return reach_idx;
+}
+
+u32 lookaround_info::get_offset_of(const vector<CharReach> &reach,
+                                   RoseEngineBlob &blob) {
+    if (contains(rcache, reach)) {
+        u32 offset = rcache[reach];
+        DEBUG_PRINTF("reusing reach at idx %u\n", offset);
+        return offset;
+    }
+
+    vector<u8> raw_reach(reach.size() * REACH_BITVECTOR_LEN);
+    size_t off = 0;
+    for (const auto &cr : reach) {
+        assert(cr.any()); // Should be at least one character!
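/*
 * A sketch (an assumption, not part of the patch) of how the multipath
 * reach bitvector built above would be consulted at scan time: every
 * lookaround position owns a MULTI_REACH_BITVECTOR_LEN-byte block, and
 * byte c of that block carries one bit per lookaround path, set when path
 * i either accepts character c at this position or checks nothing there:
 */
static bool anyPathAlive(const u8 *reach, size_t pos, u8 c, u8 live_paths) {
    const u8 path_bits = reach[pos * MULTI_REACH_BITVECTOR_LEN + c];
    return (path_bits & live_paths) != 0; // at least one path survives
}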
+        fill_bitvector(cr, raw_reach.data() + off);
+        off += REACH_BITVECTOR_LEN;
+    }
+
+    u32 offset = blob.add_range(raw_reach);
+    rcache.emplace(reach, offset);
+    return offset;
+}
+
+u32 lookaround_info::get_offset_of(const vector<s8> &look,
+                                   RoseEngineBlob &blob) {
+    if (contains(lcache, look)) {
+        u32 offset = lcache[look];
+        DEBUG_PRINTF("reusing look at idx %u\n", offset);
+        return offset;
+    }
+
+    u32 offset = blob.add_range(look);
+    lcache.emplace(look, offset);
+    return offset;
+}
+
+} // namespace ue2
diff --git a/src/rose/rose_build_engine_blob.h b/src/rose/rose_build_engine_blob.h
index 8542b87bc..3aa501b47 100644
--- a/src/rose/rose_build_engine_blob.h
+++ b/src/rose/rose_build_engine_blob.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -33,19 +33,35 @@
 #include "ue2common.h"
 
 #include "util/alloc.h"
+#include "util/bytecode_ptr.h"
+#include "util/charreach.h"
 #include "util/container.h"
 #include "util/multibit_build.h"
+#include "util/noncopyable.h"
 #include "util/ue2_containers.h"
 #include "util/verify_types.h"
 
 #include <type_traits>
 #include <vector>
 
-#include <boost/core/noncopyable.hpp>
-
 namespace ue2 {
 
-class RoseEngineBlob : boost::noncopyable {
+class RoseEngineBlob;
+
+struct lookaround_info : noncopyable {
+    u32 get_offset_of(const std::vector<std::vector<CharReach>> &look,
+                      RoseEngineBlob &blob);
+    u32 get_offset_of(const std::vector<CharReach> &reach,
+                      RoseEngineBlob &blob);
+    u32 get_offset_of(const std::vector<s8> &look, RoseEngineBlob &blob);
+
+private:
+    unordered_map<std::vector<std::vector<CharReach>>, u32> multi_cache;
+    unordered_map<std::vector<s8>, u32> lcache;
+    unordered_map<std::vector<CharReach>, u32> rcache;
+};
+
+class RoseEngineBlob : noncopyable {
 public:
     /** \brief Base offset of engine_blob in the Rose engine bytecode.
     */
    static constexpr u32 base_offset = ROUNDUP_CL(sizeof(RoseEngine));
@@ -58,10 +74,6 @@ class RoseEngineBlob : boost::noncopyable {
         return blob.size();
     }
 
-    const char *data() const {
-        return blob.data();
-    }
-
     u32 add(const void *a, const size_t len, const size_t align) {
         pad(align);
 
@@ -77,6 +89,11 @@
         return verify_u32(rv);
     }
 
+    template<typename T>
+    u32 add(const bytecode_ptr<T> &a) {
+        return add(a.get(), a.size(), a.align());
+    }
+
     template<typename T>
     u32 add(const T &a) {
         static_assert(std::is_pod<T>::value, "should be pod");
@@ -106,6 +123,11 @@
         return offset;
     }
 
+    template<typename Range>
+    u32 add_range(const Range &range) {
+        return add(begin(range), end(range));
+    }
+
     u32 add_iterator(const std::vector<mmbit_sparse_iter> &iter) {
         auto cache_it = cached_iters.find(iter);
         if (cache_it != cached_iters.end()) {
@@ -123,6 +145,8 @@
         copy_bytes((char *)engine + base_offset, blob);
     }
 
+    lookaround_info lookaround_cache;
+
 private:
     void pad(size_t align) {
         assert(ISALIGNED_N(base_offset, align));
diff --git a/src/rose/rose_build_exclusive.h b/src/rose/rose_build_exclusive.h
index 9cabb1d28..3269dce61 100644
--- a/src/rose/rose_build_exclusive.h
+++ b/src/rose/rose_build_exclusive.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -49,23 +49,6 @@
 
 namespace ue2 {
 
-/** brief subengine info including built engine and
- * corresponding triggering rose vertices */
-struct ExclusiveSubengine {
-    aligned_unique_ptr<NFA> nfa;
-    std::vector<RoseVertex> vertices;
-};
-
-/** \brief exclusive info to build tamarama */
-struct ExclusiveInfo {
-    // subengine info
-    std::vector<ExclusiveSubengine> subengines;
-    // all the report in tamarama
-    std::set<ReportID> reports;
-    // assigned queue id
-    u32 queue;
-};
-
 /** \brief role info structure for exclusive analysis */
 template<typename role_id>
 struct RoleInfo {
diff --git a/src/rose/rose_build_groups.cpp b/src/rose/rose_build_groups.cpp
index 0a1c501f2..c670e6033 100644
--- a/src/rose/rose_build_groups.cpp
+++ b/src/rose/rose_build_groups.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -33,6 +33,10 @@
 
 #include "rose_build_groups.h"
 
+#include "util/boundary_reports.h"
+#include "util/compile_context.h"
+#include "util/report_manager.h"
+
 #include <queue>
 #include <vector>
 
@@ -71,24 +75,18 @@
 static
 bool eligibleForAlwaysOnGroup(const RoseBuildImpl &build, u32 id) {
-    /* returns true if it or any of its delay versions have root role */
-    for (auto v : build.literal_info[id].vertices) {
-        if (build.isRootSuccessor(v)) {
-            NGHolder *h = build.g[v].left.graph.get();
-            if (!h || proper_out_degree(h->startDs, *h)) {
-                return true;
-            }
-        }
-    }
+    auto eligble = [&](RoseVertex v) {
+        return build.isRootSuccessor(v)
+               && (!build.g[v].left || !isAnchored(build.g[v].left));
+    };
+
+    if (any_of_in(build.literal_info[id].vertices, eligble)) {
+        return true;
+    }
 
     for (u32 delayed_id : build.literal_info[id].delayed_ids) {
-        for (auto v : build.literal_info[delayed_id].vertices) {
-            if (build.isRootSuccessor(v)) {
-                NGHolder *h = build.g[v].left.graph.get();
-                if (!h || proper_out_degree(h->startDs, *h)) {
-                    return true;
-                }
-            }
-        }
+        if (any_of_in(build.literal_info[delayed_id].vertices, eligble)) {
+            return true;
+        }
     }
 
@@ -170,6 +168,64 @@ u32 next_available_group(u32 counter, u32 min_start_group) {
     return counter;
 }
 
+static
+void allocateGroupForBoundary(RoseBuildImpl &build, u32 group_always_on,
+                              map<u8, u32> &groupCount) {
+    /* Boundary reports at zero will always fire and be forgotten; no need
+     * to worry about preventing the stream being marked as exhausted */
+    if (build.boundary.report_at_eod.empty()) {
+        return;
+    }
+
+    /* Group based stream exhaustion is only done at stream boundaries */
+    if (!build.cc.streaming) {
+        return;
+    }
+
+    DEBUG_PRINTF("allocating %u as boundary group id\n", group_always_on);
+
+    build.boundary_group_mask = 1ULL << group_always_on;
+    groupCount[group_always_on]++;
+}
+
+static
+void allocateGroupForEvent(RoseBuildImpl &build, u32 group_always_on,
+                           map<u8, u32> &groupCount, u32 *counter) {
+    if (build.eod_event_literal_id == MO_INVALID_IDX) {
+        return;
+    }
+
+    /* Group based stream exhaustion is only done at stream boundaries */
+    if (!build.cc.streaming) {
+        return;
+    }
+
+    rose_literal_info &info = build.literal_info[build.eod_event_literal_id];
+
+    if (info.vertices.empty()) {
+        return;
+    }
+
+    bool new_group = !groupCount[group_always_on];
+    for (RoseVertex v : info.vertices) {
+        if (build.g[v].left && !isAnchored(build.g[v].left)) {
+            new_group = false;
+        }
+    }
+
+    u32 group;
+    if (!new_group) {
+        group = group_always_on;
+    } else {
+        group = *counter;
+        *counter += 1;
+    }
+
+    DEBUG_PRINTF("allocating %u as eod event group id\n", *counter);
+    info.group_mask = 1ULL << group;
+    groupCount[group]++;
+}
+
 void assignGroupsToLiterals(RoseBuildImpl &build) {
     auto &literals = build.literals;
     auto &literal_info = build.literal_info;
@@ -182,9 +238,8 @@
     u32 group_always_on = 0;
 
     // First pass: handle always on literals.
-    for (const auto &e : literals.right) {
-        u32 id = e.first;
-        const rose_literal_id &lit = e.second;
+    for (u32 id = 0; id < literals.size(); id++) {
+        const rose_literal_id &lit = literals.at(id);
         rose_literal_info &info = literal_info[id];
 
         if (!requires_group_assignment(lit, info)) {
@@ -211,13 +266,15 @@
         counter++;
     }
 
+    allocateGroupForBoundary(build, group_always_on, groupCount);
+    allocateGroupForEvent(build, group_always_on, groupCount, &counter);
+
     u32 min_start_group = counter;
     priority_queue<tuple<s32, s32, u32>> pq;
 
     // Second pass: the other literals.
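/*
 * Context sketch (an assumption, not part of the patch): rose_group is a
 * 64-bit bitmask with one bit per literal group, and a literal stays live
 * only while some group in its group_mask is switched on. The boundary and
 * EOD-event helpers above reserve bits in that same mask space so that
 * group-based stream exhaustion cannot retire them prematurely. The
 * membership test this implies reduces to:
 */
static inline bool literalMayMatch(rose_group current_groups,
                                   rose_group lit_group_mask) {
    return (current_groups & lit_group_mask) != 0;
}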
- for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; + for (u32 id = 0; id < literals.size(); id++) { + const rose_literal_id &lit = literals.at(id); rose_literal_info &info = literal_info[id]; if (!requires_group_assignment(lit, info)) { @@ -231,7 +288,7 @@ void assignGroupsToLiterals(RoseBuildImpl &build) { while (!pq.empty()) { u32 id = get<2>(pq.top()); pq.pop(); - UNUSED const rose_literal_id &lit = literals.right.at(id); + UNUSED const rose_literal_id &lit = literals.at(id); DEBUG_PRINTF("assigning groups to lit %u (v %zu l %zu)\n", id, literal_info[id].vertices.size(), lit.s.length()); @@ -302,9 +359,8 @@ void assignGroupsToLiterals(RoseBuildImpl &build) { } } /* assign delayed literals to the same group as their parent */ - for (const auto &e : literals.right) { - u32 id = e.first; - const rose_literal_id &lit = e.second; + for (u32 id = 0; id < literals.size(); id++) { + const rose_literal_id &lit = literals.at(id); if (!lit.delay) { continue; @@ -319,7 +375,7 @@ void assignGroupsToLiterals(RoseBuildImpl &build) { } DEBUG_PRINTF("populate group to literal mapping\n"); - for (const u32 id : literals.right | map_keys) { + for (u32 id = 0; id < literals.size(); id++) { rose_group groups = literal_info[id].group_mask; while (groups) { u32 group_id = findAndClearLSB_64(&groups); @@ -453,6 +509,7 @@ rose_group getSquashableGroups(const RoseBuildImpl &build) { } DEBUG_PRINTF("squashable groups=0x%llx\n", squashable_groups); + assert(!(squashable_groups & build.boundary_group_mask)); return squashable_groups; } @@ -501,11 +558,11 @@ bool isGroupSquasher(const RoseBuildImpl &build, const u32 id /* literal id */, const rose_literal_info &lit_info = build.literal_info.at(id); DEBUG_PRINTF("checking if %u '%s' is a group squasher %016llx\n", id, - dumpString(build.literals.right.at(id).s).c_str(), - lit_info.group_mask); + dumpString(build.literals.at(id).s).c_str(), + lit_info.group_mask); - if (build.literals.right.at(id).table == ROSE_EVENT) { - DEBUG_PRINTF("event literal, has no groups to squash\n"); + if (build.literals.at(id).table == ROSE_EVENT) { + DEBUG_PRINTF("event literal\n"); return false; } @@ -538,8 +595,12 @@ bool isGroupSquasher(const RoseBuildImpl &build, const u32 id /* literal id */, /* Case 1 */ - // Can't squash cases with accepts - if (!g[v].reports.empty()) { + // Can't squash cases with accepts unless they are all + // simple-exhaustible. 
+ if (any_of_in(g[v].reports, [&](ReportID report) { + return !isSimpleExhaustible(build.rm.getReport(report)); + })) { + DEBUG_PRINTF("can't squash reporter\n"); return false; } @@ -628,10 +689,11 @@ bool isGroupSquasher(const RoseBuildImpl &build, const u32 id /* literal id */, } void findGroupSquashers(RoseBuildImpl &build) { - rose_group forbidden_squash_group = 0; - for (const auto &e : build.literals.right) { - if (e.second.delay) { - forbidden_squash_group |= build.literal_info[e.first].group_mask; + rose_group forbidden_squash_group = build.boundary_group_mask; + for (u32 id = 0; id < build.literals.size(); id++) { + const auto &lit = build.literals.at(id); + if (lit.delay) { + forbidden_squash_group |= build.literal_info[id].group_mask; } } diff --git a/src/rose/rose_build_impl.h b/src/rose/rose_build_impl.h index 6b326d34b..13f1cfc9c 100644 --- a/src/rose/rose_build_impl.h +++ b/src/rose/rose_build_impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,28 +26,30 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef ROSE_BUILD_IMPL_H_17E20A3C6935D6 -#define ROSE_BUILD_IMPL_H_17E20A3C6935D6 +#ifndef ROSE_BUILD_IMPL_H +#define ROSE_BUILD_IMPL_H #include "rose_build.h" #include "rose_build_util.h" +#include "rose_common.h" #include "rose_graph.h" #include "nfa/mpvcompile.h" #include "nfa/goughcompile.h" #include "nfa/nfa_internal.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_revacc.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" +#include "util/hash.h" #include "util/order_check.h" #include "util/queue_index_factory.h" #include "util/ue2_containers.h" +#include "util/ue2string.h" +#include "util/verify_types.h" #include #include #include #include -#include -#include #include struct RoseEngine; @@ -58,6 +60,17 @@ namespace ue2 { #define ROSE_LONG_LITERAL_THRESHOLD_MIN 33 +/** + * \brief The largest allowable "short" literal fragment which can be given to + * a literal matcher directly. + * + * Literals longer than this will be truncated to their suffix and confirmed in + * the Rose interpreter, either as "medium length" literals which can be + * confirmed from history, or "long literals" which make use of the streaming + * table support. + */ +#define ROSE_SHORT_LITERAL_LEN_MAX 8 + struct BoundaryReports; struct CastleProto; struct CompileContext; @@ -252,9 +265,7 @@ struct rose_literal_info { ue2::flat_set vertices; rose_group group_mask = 0; u32 undelayed_id = MO_INVALID_IDX; - u32 final_id = MO_INVALID_IDX; /* id reported by fdr */ bool squash_group = false; - bool requires_explode = false; bool requires_benefits = false; }; @@ -290,6 +301,11 @@ struct rose_literal_id { } return MAX(mask_len, s.length()) + delay; } + + bool operator==(const rose_literal_id &b) const { + return s == b.s && msk == b.msk && cmp == b.cmp && table == b.table && + delay == b.delay && distinctiveness == b.distinctiveness; + } }; static inline @@ -303,8 +319,60 @@ bool operator<(const rose_literal_id &a, const rose_literal_id &b) { return 0; } -// Literals are stored in a map from (string, nocase) -> ID -typedef boost::bimap RoseLiteralMap; +inline +size_t hash_value(const rose_literal_id &lit) { + return hash_all(lit.s, lit.msk, lit.cmp, lit.table, lit.delay, + lit.distinctiveness); +} + +class RoseLiteralMap { + /** + * \brief Main storage for literals. 
+     *
+     * Note that this cannot be a vector, as the present code relies on
+     * iterator stability when iterating over this list and adding to it
+     * inside the loop.
+     */
+    std::deque<rose_literal_id> lits;
+
+    /** \brief Quick-lookup index from literal -> index in lits. */
+    unordered_map<rose_literal_id, u32> lits_index;
+
+public:
+    std::pair<u32, bool> insert(const rose_literal_id &lit) {
+        auto it = lits_index.find(lit);
+        if (it != lits_index.end()) {
+            return {it->second, false};
+        }
+        u32 id = verify_u32(lits.size());
+        lits.push_back(lit);
+        lits_index.emplace(lit, id);
+        return {id, true};
+    }
+
+    // Erase the last num elements.
+    void erase_back(size_t num) {
+        assert(num <= lits.size());
+        for (size_t i = 0; i < num; i++) {
+            lits_index.erase(lits.back());
+            lits.pop_back();
+        }
+        assert(lits.size() == lits_index.size());
+    }
+
+    const rose_literal_id &at(u32 id) const {
+        assert(id < lits.size());
+        return lits.at(id);
+    }
+
+    using const_iterator = decltype(lits)::const_iterator;
+    const_iterator begin() const { return lits.begin(); }
+    const_iterator end() const { return lits.end(); }
+
+    size_t size() const {
+        return lits.size();
+    }
+};
 
 struct simple_anchored_info {
     simple_anchored_info(u32 min_b, u32 max_b, const ue2_literal &lit)
@@ -415,8 +483,8 @@ struct OutfixInfo {
     RevAccInfo rev_info;
     u32 maxBAWidth = 0; //!< max bi-anchored width
 
-    depth minWidth = depth::infinity();
-    depth maxWidth = 0;
+    depth minWidth{depth::infinity()};
+    depth maxWidth{0};
 
     u64a maxOffset = 0;
     bool in_sbmatcher = false; //!< handled by small-block matcher.
@@ -438,8 +506,7 @@ class RoseBuildImpl : public RoseBuild {
     void add(bool anchored, bool eod, const ue2_literal &lit,
              const ue2::flat_set<ReportID> &ids) override;
 
-    bool addRose(const RoseInGraph &ig, bool prefilter,
-                 bool finalChance = false) override;
+    bool addRose(const RoseInGraph &ig, bool prefilter) override;
     bool addSombeRose(const RoseInGraph &ig) override;
 
     bool addOutfix(const NGHolder &h) override;
@@ -462,8 +529,8 @@
                  bool eod) override;
 
     // Construct a runtime implementation.
-    aligned_unique_ptr<RoseEngine> buildRose(u32 minWidth) override;
-    aligned_unique_ptr<RoseEngine> buildFinalEngine(u32 minWidth);
+    bytecode_ptr<RoseEngine> buildRose(u32 minWidth) override;
+    bytecode_ptr<RoseEngine> buildFinalEngine(u32 minWidth);
 
     void setSom() override { hasSom = true; }
 
@@ -481,8 +548,6 @@
                       const std::vector<u8> &cmp, u32 delay,
                       rose_literal_table table);
 
-    bool hasLiteral(const ue2_literal &s, rose_literal_table table) const;
-
     u32 getNewLiteralId(void);
 
     void removeVertices(const std::vector<RoseVertex> &dead);
@@ -490,8 +555,6 @@
     // Is the Rose anchored?
     bool hasNoFloatingRoots() const;
 
-    RoseVertex cloneVertex(RoseVertex v);
-
     u32 calcHistoryRequired() const;
 
     rose_group getInitialGroups() const;
@@ -512,8 +575,6 @@
     bool isDirectReport(u32 id) const;
     bool isDelayed(u32 id) const;
 
-    bool hasFinalId(u32 id) const;
-
     bool isAnchored(RoseVertex v) const; /* true iff has literal in anchored
                                           * table */
     bool isFloating(RoseVertex v) const; /* true iff has literal in floating
@@ -553,31 +614,19 @@
         return next_nfa_report++;
     }
 
     std::deque<rose_literal_info> literal_info;
 
-    u32 delay_base_id;
-
     bool hasSom; //!< at least one pattern requires SOM.
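/*
 * Usage sketch for the RoseLiteralMap above (hypothetical helper, not part
 * of the patch): insert() deduplicates, handing back the existing id and
 * false when the same rose_literal_id is added twice, so literal ids stay
 * dense and stable:
 */
static void roseLiteralMapExample(RoseLiteralMap &literals,
                                  const rose_literal_id &lit) {
    auto rv1 = literals.insert(lit); // {new id, true} on first insert
    auto rv2 = literals.insert(lit); // {same id, false} on re-insert
    assert(rv1.first == rv2.first && rv1.second && !rv2.second);
}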
std::map>> anchored_nfas; std::map> anchored_simple; std::map > group_to_literal; u32 group_end; - u32 anchored_base_id; - u32 ematcher_region_size; /**< number of bytes the eod table runs over */ - /** \brief Mapping from leftfix to queue ID (used in dump code). */ - unordered_map leftfix_queue_map; - - /** \brief Mapping from suffix to queue ID (used in dump code). */ - unordered_map suffix_queue_map; - /** \brief Mapping from anchored literal ID to the original literal suffix * present when the literal was added to the literal matcher. Used for * overlap calculation in history assignment. */ std::map anchoredLitSuffix; - std::map > final_id_to_literal; /* final literal id to - * literal id */ - unordered_set transient; unordered_map rose_squash_masks; @@ -592,6 +641,8 @@ class RoseBuildImpl : public RoseBuild { u32 max_rose_anchored_floating_overlap; + rose_group boundary_group_mask = 0; + QueueIndexFactory qif; ReportManager &rm; SomSlotManager &ssm; @@ -614,8 +665,6 @@ size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b); ue2_literal findNonOverlappingTail(const std::set &lits, const ue2_literal &s); -void setReportId(NGHolder &g, ReportID id); - #ifndef NDEBUG bool roseHasTops(const RoseBuildImpl &build, RoseVertex v); bool hasOrphanedTops(const RoseBuildImpl &build); @@ -629,10 +678,15 @@ u64a findMaxOffset(const std::set &reports, const ReportManager &rm); void normaliseLiteralMask(const ue2_literal &s, std::vector &msk, std::vector &cmp); +u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id); +u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id); + +bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e); + #ifndef NDEBUG bool canImplementGraphs(const RoseBuildImpl &tbi); #endif } // namespace ue2 -#endif /* ROSE_BUILD_IMPL_H_17E20A3C6935D6 */ +#endif /* ROSE_BUILD_IMPL_H */ diff --git a/src/rose/rose_build_instructions.cpp b/src/rose/rose_build_instructions.cpp new file mode 100644 index 000000000..b00c36be6 --- /dev/null +++ b/src/rose/rose_build_instructions.cpp @@ -0,0 +1,639 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rose_build_instructions.h" + +#include "rose_build_engine_blob.h" +#include "util/multibit_build.h" +#include "util/verify_types.h" + +#include + +using namespace std; + +namespace ue2 { +/* Destructors to avoid weak vtables. */ + +RoseInstruction::~RoseInstruction() = default; +RoseInstrCatchUp::~RoseInstrCatchUp() = default; +RoseInstrCatchUpMpv::~RoseInstrCatchUpMpv() = default; +RoseInstrSomZero::~RoseInstrSomZero() = default; +RoseInstrSuffixesEod::~RoseInstrSuffixesEod() = default; +RoseInstrMatcherEod::~RoseInstrMatcherEod() = default; +RoseInstrEnd::~RoseInstrEnd() = default; +RoseInstrClearWorkDone::~RoseInstrClearWorkDone() = default; + +using OffsetMap = RoseInstruction::OffsetMap; + +static +u32 calc_jump(const OffsetMap &offset_map, const RoseInstruction *from, + const RoseInstruction *to) { + DEBUG_PRINTF("computing relative jump from %p to %p\n", from, to); + assert(from && contains(offset_map, from)); + assert(to && contains(offset_map, to)); + + u32 from_offset = offset_map.at(from); + u32 to_offset = offset_map.at(to); + DEBUG_PRINTF("offsets: %u -> %u\n", from_offset, to_offset); + assert(from_offset <= to_offset); + + return to_offset - from_offset; +} + +void RoseInstrAnchoredDelay::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; + inst->anch_id = anch_id; + inst->done_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckLitEarly::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->min_offset = min_offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckGroups::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; +} + +void RoseInstrCheckOnlyEod::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckBounds::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->min_bound = min_bound; + inst->max_bound = max_bound; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckNotHandled::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->key = key; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckSingleLookaround::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, 
offset_map); + auto *inst = static_cast(dest); + inst->offset = offset; + inst->reach_index = blob.lookaround_cache.get_offset_of({reach}, blob); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckLookaround::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + vector look_offsets; + vector reaches; + for (const auto &le : look) { + look_offsets.push_back(le.offset); + reaches.push_back(le.reach); + } + inst->look_index = blob.lookaround_cache.get_offset_of(look_offsets, blob); + inst->reach_index = blob.lookaround_cache.get_offset_of(reaches, blob); + inst->count = verify_u32(look.size()); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMask::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->and_mask = and_mask; + inst->cmp_mask = cmp_mask; + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMask32::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(and_mask), end(and_mask), inst->and_mask); + copy(begin(cmp_mask), end(cmp_mask), inst->cmp_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->and_mask = and_mask; + inst->cmp_mask = cmp_mask; + inst->negation = negation; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti16x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(nib_mask), end(nib_mask), inst->nib_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti32x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti16x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, 
offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi), + inst->bucket_select_mask_hi); + copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo), + inst->bucket_select_mask_lo); + inst->neg_mask = neg_mask; + inst->offset = offset; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->queue = queue; + inst->lag = lag; + inst->report = report; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckPrefix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->queue = queue; + inst->lag = lag; + inst->report = report; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrPushDelayed::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->delay = delay; + inst->index = index; +} + +void RoseInstrSomAdjust::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->distance = distance; +} + +void RoseInstrSomLeftfix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->queue = queue; + inst->lag = lag; +} + +void RoseInstrSomFromReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->som = som; +} + +void RoseInstrTriggerInfix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->cancel = cancel; + inst->queue = queue; + inst->event = event; +} + +void RoseInstrTriggerSuffix::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->queue = queue; + inst->event = event; +} + +void RoseInstrDedupe::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->quash_som = quash_som; + inst->dkey = dkey; + inst->offset_adjust = offset_adjust; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrDedupeSom::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->quash_som = quash_som; + inst->dkey = dkey; + inst->offset_adjust = offset_adjust; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrReportChain::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->event = event; + inst->top_squash_distance = top_squash_distance; +} + +void RoseInstrReportSomInt::write(void *dest, RoseEngineBlob &blob, + const OffsetMap 
&offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->som = som; +} + +void RoseInstrReportSomAware::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->som = som; +} + +void RoseInstrReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; +} + +void RoseInstrReportExhaust::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; + inst->ekey = ekey; +} + +void RoseInstrReportSom::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; +} + +void RoseInstrReportSomExhaust::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; + inst->ekey = ekey; +} + +void RoseInstrDedupeAndReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->quash_som = quash_som; + inst->dkey = dkey; + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrFinalReport::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->onmatch = onmatch; + inst->offset_adjust = offset_adjust; +} + +void RoseInstrCheckExhausted::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->ekey = ekey; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMinLength::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->end_adj = end_adj; + inst->min_length = min_length; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrSetState::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->index = index; +} + +void RoseInstrSetGroups::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; +} + +void RoseInstrSquashGroups::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->groups = groups; +} + +void RoseInstrCheckState::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->index = index; + inst->fail_jump = calc_jump(offset_map, 
this, target); +} + +void RoseInstrSparseIterBegin::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->fail_jump = calc_jump(offset_map, this, target); + + // Resolve and write the multibit sparse iterator and the jump table. + vector keys; + vector jump_offsets; + for (const auto &jump : jump_table) { + keys.push_back(jump.first); + assert(contains(offset_map, jump.second)); + jump_offsets.push_back(offset_map.at(jump.second)); + } + + auto iter = mmbBuildSparseIterator(keys, num_keys); + assert(!iter.empty()); + inst->iter_offset = blob.add_iterator(iter); + inst->jump_table = blob.add(jump_offsets.begin(), jump_offsets.end()); + + // Store offsets for corresponding SPARSE_ITER_NEXT operations. + is_written = true; + iter_offset = inst->iter_offset; + jump_table_offset = inst->jump_table; +} + +void RoseInstrSparseIterNext::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->state = state; + inst->fail_jump = calc_jump(offset_map, this, target); + + // Use the same sparse iterator and jump table as the SPARSE_ITER_BEGIN + // instruction. + assert(begin); + assert(contains(offset_map, begin)); + assert(begin->is_written); + inst->iter_offset = begin->iter_offset; + inst->jump_table = begin->jump_table_offset; +} + +void RoseInstrSparseIterAny::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->fail_jump = calc_jump(offset_map, this, target); + + // Write the multibit sparse iterator. + auto iter = mmbBuildSparseIterator(keys, num_keys); + assert(!iter.empty()); + inst->iter_offset = blob.add_iterator(iter); +} + +void RoseInstrEnginesEod::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + inst->iter_offset = iter_offset; +} + +void RoseInstrCheckLongLit::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + assert(!literal.empty()); + inst->lit_offset = blob.add(literal.c_str(), literal.size(), 1); + inst->lit_length = verify_u32(literal.size()); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckLongLitNocase::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + assert(!literal.empty()); + inst->lit_offset = blob.add(literal.c_str(), literal.size(), 1); + inst->lit_length = verify_u32(literal.size()); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMedLit::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + assert(!literal.empty()); + inst->lit_offset = blob.add(literal.c_str(), literal.size(), 1); + inst->lit_length = verify_u32(literal.size()); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMedLitNocase::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + assert(!literal.empty()); + inst->lit_offset 
= blob.add(literal.c_str(), literal.size(), 1); + inst->lit_length = verify_u32(literal.size()); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrMultipathLookaround::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + auto &cache = blob.lookaround_cache; + vector look_offsets; + vector> reaches; + for (const auto &vle : multi_look) { + reaches.push_back({}); + bool done_offset = false; + + for (const auto &le : vle) { + reaches.back().push_back(le.reach); + + /* empty reaches don't have valid offsets */ + if (!done_offset && le.reach.any()) { + look_offsets.push_back(le.offset); + done_offset = true; + } + } + } + inst->look_index = cache.get_offset_of(look_offsets, blob); + inst->reach_index = cache.get_offset_of(reaches, blob); + inst->count = verify_u32(multi_look.size()); + inst->last_start = last_start; + copy(begin(start_mask), end(start_mask), inst->start_mask); + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMultipathShufti16x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(nib_mask), end(nib_mask), inst->nib_mask); + copy(begin(bucket_select_mask), begin(bucket_select_mask) + 16, + inst->bucket_select_mask); + copy(begin(data_select_mask), begin(data_select_mask) + 16, + inst->data_select_mask); + inst->hi_bits_mask = hi_bits_mask; + inst->lo_bits_mask = lo_bits_mask; + inst->neg_mask = neg_mask; + inst->base_offset = base_offset; + inst->last_start = last_start; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMultipathShufti32x8::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), begin(hi_mask) + 16, inst->hi_mask); + copy(begin(lo_mask), begin(lo_mask) + 16, inst->lo_mask); + copy(begin(bucket_select_mask), begin(bucket_select_mask) + 32, + inst->bucket_select_mask); + copy(begin(data_select_mask), begin(data_select_mask) + 32, + inst->data_select_mask); + inst->hi_bits_mask = hi_bits_mask; + inst->lo_bits_mask = lo_bits_mask; + inst->neg_mask = neg_mask; + inst->base_offset = base_offset; + inst->last_start = last_start; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMultipathShufti32x16::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + copy(begin(hi_mask), end(hi_mask), inst->hi_mask); + copy(begin(lo_mask), end(lo_mask), inst->lo_mask); + copy(begin(bucket_select_mask_hi), begin(bucket_select_mask_hi) + 32, + inst->bucket_select_mask_hi); + copy(begin(bucket_select_mask_lo), begin(bucket_select_mask_lo) + 32, + inst->bucket_select_mask_lo); + copy(begin(data_select_mask), begin(data_select_mask) + 32, + inst->data_select_mask); + inst->hi_bits_mask = hi_bits_mask; + inst->lo_bits_mask = lo_bits_mask; + inst->neg_mask = neg_mask; + inst->base_offset = base_offset; + inst->last_start = last_start; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +void RoseInstrCheckMultipathShufti64::write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const { + RoseInstrBase::write(dest, blob, offset_map); + auto *inst = static_cast(dest); + 
copy(begin(hi_mask), begin(hi_mask) + 16, inst->hi_mask); + copy(begin(lo_mask), begin(lo_mask) + 16, inst->lo_mask); + copy(begin(bucket_select_mask), end(bucket_select_mask), + inst->bucket_select_mask); + copy(begin(data_select_mask), end(data_select_mask), + inst->data_select_mask); + inst->hi_bits_mask = hi_bits_mask; + inst->lo_bits_mask = lo_bits_mask; + inst->neg_mask = neg_mask; + inst->base_offset = base_offset; + inst->last_start = last_start; + inst->fail_jump = calc_jump(offset_map, this, target); +} + +} diff --git a/src/rose/rose_build_instructions.h b/src/rose/rose_build_instructions.h new file mode 100644 index 000000000..025f6a671 --- /dev/null +++ b/src/rose/rose_build_instructions.h @@ -0,0 +1,2132 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Concrete classes for interpreter instructions. + * + * Note: this header should only be included in files which need to deal with + * the details of actual instructions. It is expected that most will only + * require access to the RoseInstruction API exposed in rose_build_program.h + */ + +#ifndef ROSE_BUILD_INSTRUCTIONS_H +#define ROSE_BUILD_INSTRUCTIONS_H + +#include "rose_build_lookaround.h" +#include "rose_build_program.h" +#include "util/verify_types.h" + +namespace ue2 { + +/** + * \brief Abstract base class representing a single Rose instruction. + */ +class RoseInstruction { +public: + virtual ~RoseInstruction(); + + /** \brief Opcode used for the instruction in the bytecode. */ + virtual RoseInstructionCode code() const = 0; + + /** + * \brief Simple hash used for program equivalence. + * + * Note that pointers (jumps, for example) should not be used when + * calculating the hash: they will be converted to instruction offsets when + * compared later. + */ + virtual size_t hash() const = 0; + + /** \brief Length of the bytecode instruction in bytes. */ + virtual size_t byte_length() const = 0; + + using OffsetMap = unordered_map; + + /** + * \brief Writes a concrete implementation of this instruction. 
+ * + * Other data that this instruction depends on is written directly into the + * blob, while the instruction structure itself (of size given by + * the byte_length() function) is written to dest. + */ + virtual void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const = 0; + + /** + * \brief Update a target pointer. + * + * If this instruction contains any reference to the old target, replace it + * with the new one. + */ + virtual void update_target(const RoseInstruction *old_target, + const RoseInstruction *new_target) = 0; + + /** + * \brief True if these instructions are equivalent within their own + * programs. + * + * Checks that any pointers to other instructions point to the same + * offsets. + */ + bool equiv(const RoseInstruction &other, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return equiv_impl(other, offsets, other_offsets); + } + +private: + virtual bool equiv_impl(const RoseInstruction &other, + const OffsetMap &offsets, + const OffsetMap &other_offsets) const = 0; +}; + +/** + * \brief Templated implementation class to handle boring boilerplate code. + */ +template +class RoseInstrBase : public RoseInstruction { +protected: + static constexpr RoseInstructionCode opcode = Opcode; + using impl_type = ImplType; + +public: + RoseInstructionCode code() const override { return opcode; } + + size_t byte_length() const override { + return sizeof(impl_type); + } + + /** + * Note: this implementation simply zeroes the destination region and + * writes in the correct opcode. This is sufficient for trivial + * instructions, but instructions with data members will want to override + * it. + */ + void write(void *dest, RoseEngineBlob &, + const RoseInstruction::OffsetMap &) const override { + assert(dest != nullptr); + assert(ISALIGNED_N(dest, ROSE_INSTR_MIN_ALIGN)); + + impl_type *inst = static_cast(dest); + memset(inst, 0, sizeof(impl_type)); + inst->code = verify_u8(opcode); + } + +private: + bool equiv_impl(const RoseInstruction &other, const OffsetMap &offsets, + const OffsetMap &other_offsets) const override { + const auto *ri_that = dynamic_cast(&other); + if (!ri_that) { + return false; + } + const auto *ri_this = dynamic_cast(this); + assert(ri_this); + return ri_this->equiv_to(*ri_that, offsets, other_offsets); + } +}; + +/** + * \brief Refinement of RoseInstrBase to use for instructions that have + * just a single target member, called "target". + */ +template +class RoseInstrBaseOneTarget + : public RoseInstrBase { +public: + void update_target(const RoseInstruction *old_target, + const RoseInstruction *new_target) override { + RoseInstrType *ri = dynamic_cast(this); + assert(ri); + if (ri->target == old_target) { + ri->target = new_target; + } + } +}; + +/** + * \brief Refinement of RoseInstrBase to use for instructions that have no + * targets. + */ +template +class RoseInstrBaseNoTargets + : public RoseInstrBase { +public: + void update_target(const RoseInstruction *, + const RoseInstruction *) override {} +}; + +/** + * \brief Refinement of RoseInstrBaseNoTargets to use for instructions that + * have no members at all, just an opcode. 
+ */ +template +class RoseInstrBaseTrivial + : public RoseInstrBaseNoTargets { +public: + virtual bool operator==(const RoseInstrType &) const { return true; } + + size_t hash() const override { + return boost::hash_value(static_cast(Opcode)); + } + + bool equiv_to(const RoseInstrType &, const RoseInstruction::OffsetMap &, + const RoseInstruction::OffsetMap &) const { + return true; + } +}; + +//// +//// Concrete implementation classes start here. +//// + +class RoseInstrAnchoredDelay + : public RoseInstrBaseOneTarget { +public: + rose_group groups; + u32 anch_id; + const RoseInstruction *target; + + RoseInstrAnchoredDelay(rose_group groups_in, u32 anch_id_in, + const RoseInstruction *target_in) + : groups(groups_in), anch_id(anch_id_in), target(target_in) {} + + bool operator==(const RoseInstrAnchoredDelay &ri) const { + return groups == ri.groups && anch_id == ri.anch_id + && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), groups, anch_id); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrAnchoredDelay &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return groups == ri.groups && anch_id == ri.anch_id + && offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckLitEarly + : public RoseInstrBaseOneTarget { +public: + u32 min_offset; + const RoseInstruction *target; + + RoseInstrCheckLitEarly(u32 min_offset_in, const RoseInstruction *target_in) + : min_offset(min_offset_in), target(target_in) {} + + bool operator==(const RoseInstrCheckLitEarly &ri) const { + return min_offset == ri.min_offset && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), min_offset); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckLitEarly &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return min_offset == ri.min_offset && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckGroups + : public RoseInstrBaseNoTargets { +public: + rose_group groups; + + explicit RoseInstrCheckGroups(rose_group groups_in) : groups(groups_in) {} + + bool operator==(const RoseInstrCheckGroups &ri) const { + return groups == ri.groups; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), groups); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckGroups &ri, const OffsetMap &, + const OffsetMap &) const { + return groups == ri.groups; + } +}; + +class RoseInstrCheckOnlyEod + : public RoseInstrBaseOneTarget { +public: + const RoseInstruction *target; + + explicit RoseInstrCheckOnlyEod(const RoseInstruction *target_in) + : target(target_in) {} + + bool operator==(const RoseInstrCheckOnlyEod &ri) const { + return target == ri.target; + } + + size_t hash() const override { + return boost::hash_value(static_cast(opcode)); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckOnlyEod &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckBounds + : public RoseInstrBaseOneTarget { +public: + u64a min_bound; + u64a max_bound; + const RoseInstruction *target; + + 
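/*
 * A sketch (an assumption, not part of the patch) of how the hash()/
 * equiv_to() machinery above supports program deduplication: two
 * instruction sequences are interchangeable when every instruction matches
 * field-for-field and all jump targets resolve to the same relative
 * offsets, even though the target pointers themselves differ:
 */
static bool programsEquiv(const std::vector<const RoseInstruction *> &a,
                          const std::vector<const RoseInstruction *> &b,
                          const RoseInstruction::OffsetMap &a_offsets,
                          const RoseInstruction::OffsetMap &b_offsets) {
    if (a.size() != b.size()) {
        return false;
    }
    for (size_t i = 0; i < a.size(); i++) {
        if (!a[i]->equiv(*b[i], a_offsets, b_offsets)) {
            return false;
        }
    }
    return true;
}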
RoseInstrCheckBounds(u64a min, u64a max, const RoseInstruction *target_in) + : min_bound(min), max_bound(max), target(target_in) {} + + bool operator==(const RoseInstrCheckBounds &ri) const { + return min_bound == ri.min_bound && max_bound == ri.max_bound && + target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), min_bound, max_bound); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckBounds &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return min_bound == ri.min_bound && max_bound == ri.max_bound && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckNotHandled + : public RoseInstrBaseOneTarget { +public: + u32 key; + const RoseInstruction *target; + + RoseInstrCheckNotHandled(u32 key_in, const RoseInstruction *target_in) + : key(key_in), target(target_in) {} + + bool operator==(const RoseInstrCheckNotHandled &ri) const { + return key == ri.key && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), key); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckNotHandled &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return key == ri.key && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckSingleLookaround + : public RoseInstrBaseOneTarget { +public: + s8 offset; + CharReach reach; + const RoseInstruction *target; + + RoseInstrCheckSingleLookaround(s8 offset_in, CharReach reach_in, + const RoseInstruction *target_in) + : offset(offset_in), reach(std::move(reach_in)), target(target_in) {} + + bool operator==(const RoseInstrCheckSingleLookaround &ri) const { + return offset == ri.offset && reach == ri.reach && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), offset, reach); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckSingleLookaround &ri, + const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return offset == ri.offset && reach == ri.reach && + offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckLookaround + : public RoseInstrBaseOneTarget { +public: + std::vector look; + const RoseInstruction *target; + + RoseInstrCheckLookaround(std::vector look_in, + const RoseInstruction *target_in) + : look(std::move(look_in)), target(target_in) {} + + bool operator==(const RoseInstrCheckLookaround &ri) const { + return look == ri.look && target == ri.target; + } + + size_t hash() const override { + return hash_all(static_cast(opcode), look); + } + + void write(void *dest, RoseEngineBlob &blob, + const OffsetMap &offset_map) const override; + + bool equiv_to(const RoseInstrCheckLookaround &ri, const OffsetMap &offsets, + const OffsetMap &other_offsets) const { + return look == ri.look + && offsets.at(target) == other_offsets.at(ri.target); + } +}; + +class RoseInstrCheckMask + : public RoseInstrBaseOneTarget { +public: + u64a and_mask; + u64a cmp_mask; + u64a neg_mask; + s32 offset; + const RoseInstruction *target; + + RoseInstrCheckMask(u64a and_mask_in, u64a cmp_mask_in, u64a neg_mask_in, + s32 offset_in, const RoseInstruction *target_in) + : and_mask(and_mask_in), cmp_mask(cmp_mask_in), neg_mask(neg_mask_in), + offset(offset_in), 
+          target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMask &ri) const {
+        return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), and_mask, cmp_mask, neg_mask,
+                        offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMask &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMask32
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MASK_32,
+                                    ROSE_STRUCT_CHECK_MASK_32,
+                                    RoseInstrCheckMask32> {
+public:
+    std::array<u8, 32> and_mask;
+    std::array<u8, 32> cmp_mask;
+    u32 neg_mask;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMask32(std::array<u8, 32> and_mask_in,
+                         std::array<u8, 32> cmp_mask_in, u32 neg_mask_in,
+                         s32 offset_in, const RoseInstruction *target_in)
+        : and_mask(std::move(and_mask_in)), cmp_mask(std::move(cmp_mask_in)),
+          neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMask32 &ri) const {
+        return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), and_mask, cmp_mask, neg_mask,
+                        offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMask32 &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckByte
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_BYTE,
+                                    ROSE_STRUCT_CHECK_BYTE,
+                                    RoseInstrCheckByte> {
+public:
+    u8 and_mask;
+    u8 cmp_mask;
+    u8 negation;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckByte(u8 and_mask_in, u8 cmp_mask_in, u8 negation_in,
+                       s32 offset_in, const RoseInstruction *target_in)
+        : and_mask(and_mask_in), cmp_mask(cmp_mask_in), negation(negation_in),
+          offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckByte &ri) const {
+        return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
+               negation == ri.negation && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), and_mask, cmp_mask, negation,
+                        offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckByte &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask &&
+               negation == ri.negation && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckShufti16x8
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_16x8,
+                                    ROSE_STRUCT_CHECK_SHUFTI_16x8,
+                                    RoseInstrCheckShufti16x8> {
+public:
+    std::array<u8, 32> nib_mask;
+    std::array<u8, 16> bucket_select_mask;
+    u32 neg_mask;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckShufti16x8(std::array<u8, 32> nib_mask_in,
+                             std::array<u8, 16> bucket_select_mask_in,
+                             u32 neg_mask_in, s32 offset_in,
+                             const RoseInstruction *target_in)
+        : nib_mask(std::move(nib_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckShufti16x8 &ri) const {
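+        // Structural equality compares the raw target pointer; equiv_to()
+        // compares jump targets via their encoded offsets instead.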
+        return nib_mask == ri.nib_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), nib_mask,
+                        bucket_select_mask, neg_mask, offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckShufti16x8 &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return nib_mask == ri.nib_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckShufti32x8
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_32x8,
+                                    ROSE_STRUCT_CHECK_SHUFTI_32x8,
+                                    RoseInstrCheckShufti32x8> {
+public:
+    std::array<u8, 16> hi_mask;
+    std::array<u8, 16> lo_mask;
+    std::array<u8, 32> bucket_select_mask;
+    u32 neg_mask;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckShufti32x8(std::array<u8, 16> hi_mask_in,
+                             std::array<u8, 16> lo_mask_in,
+                             std::array<u8, 32> bucket_select_mask_in,
+                             u32 neg_mask_in, s32 offset_in,
+                             const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckShufti32x8 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask, neg_mask, offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckShufti32x8 &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckShufti16x16
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_16x16,
+                                    ROSE_STRUCT_CHECK_SHUFTI_16x16,
+                                    RoseInstrCheckShufti16x16> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 32> bucket_select_mask;
+    u32 neg_mask;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckShufti16x16(std::array<u8, 32> hi_mask_in,
+                              std::array<u8, 32> lo_mask_in,
+                              std::array<u8, 32> bucket_select_mask_in,
+                              u32 neg_mask_in, s32 offset_in,
+                              const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckShufti16x16 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask, neg_mask, offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckShufti16x16 &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckShufti32x16
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_SHUFTI_32x16,
+                                    ROSE_STRUCT_CHECK_SHUFTI_32x16,
+                                    RoseInstrCheckShufti32x16> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 32> bucket_select_mask_hi;
+    std::array<u8, 32> bucket_select_mask_lo;
+    u32 neg_mask;
+    s32 offset;
+    const RoseInstruction *target;
+
+    RoseInstrCheckShufti32x16(std::array<u8, 32> hi_mask_in,
+                              std::array<u8, 32> lo_mask_in,
+                              std::array<u8, 32> bucket_select_mask_hi_in,
+                              std::array<u8, 32> bucket_select_mask_lo_in,
+                              u32 neg_mask_in, s32 offset_in,
+                              const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)),
+          bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)),
+          neg_mask(neg_mask_in), offset(offset_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckShufti32x16 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask_hi, bucket_select_mask_lo,
+                        neg_mask, offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckShufti32x16 &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               neg_mask == ri.neg_mask && offset == ri.offset &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckInfix
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_INFIX,
+                                    ROSE_STRUCT_CHECK_INFIX,
+                                    RoseInstrCheckInfix> {
+public:
+    u32 queue;
+    u32 lag;
+    ReportID report;
+    const RoseInstruction *target;
+
+    RoseInstrCheckInfix(u32 queue_in, u32 lag_in, ReportID report_in,
+                        const RoseInstruction *target_in)
+        : queue(queue_in), lag(lag_in), report(report_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckInfix &ri) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), queue, lag, report);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckInfix &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckPrefix
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_PREFIX,
+                                    ROSE_STRUCT_CHECK_PREFIX,
+                                    RoseInstrCheckPrefix> {
+public:
+    u32 queue;
+    u32 lag;
+    ReportID report;
+    const RoseInstruction *target;
+
+    RoseInstrCheckPrefix(u32 queue_in, u32 lag_in, ReportID report_in,
+                         const RoseInstruction *target_in)
+        : queue(queue_in), lag(lag_in), report(report_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckPrefix &ri) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), queue, lag, report);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckPrefix &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return queue == ri.queue && lag == ri.lag && report == ri.report &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrPushDelayed
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_PUSH_DELAYED,
+                                    ROSE_STRUCT_PUSH_DELAYED,
+                                    RoseInstrPushDelayed> {
+public:
+    u8 delay;
+    u32 index;
+
+    RoseInstrPushDelayed(u8 delay_in, u32 index_in)
+        : delay(delay_in), index(index_in) {}
+
+    bool operator==(const RoseInstrPushDelayed &ri) const {
+        return delay == ri.delay && index == ri.index;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), delay, index);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrPushDelayed &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return delay == ri.delay && index == ri.index;
+    }
+};
+
+class RoseInstrCatchUp
+    : public RoseInstrBaseTrivial<ROSE_INSTR_CATCH_UP, ROSE_STRUCT_CATCH_UP,
+                                  RoseInstrCatchUp> {
+public:
+    ~RoseInstrCatchUp() override;
+};
+
+class RoseInstrCatchUpMpv
+    : public RoseInstrBaseTrivial<ROSE_INSTR_CATCH_UP_MPV,
+                                  ROSE_STRUCT_CATCH_UP_MPV,
+                                  RoseInstrCatchUpMpv> {
+public:
+    ~RoseInstrCatchUpMpv() override;
+};
+
+class RoseInstrSomAdjust
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SOM_ADJUST,
+                                    ROSE_STRUCT_SOM_ADJUST,
+                                    RoseInstrSomAdjust> {
+public:
+    u32 distance;
+
+    explicit RoseInstrSomAdjust(u32 distance_in) : distance(distance_in) {}
+
+    bool operator==(const RoseInstrSomAdjust &ri) const {
+        return distance == ri.distance;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), distance);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSomAdjust &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return distance == ri.distance;
+    }
+};
+
+class RoseInstrSomLeftfix
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SOM_LEFTFIX,
+                                    ROSE_STRUCT_SOM_LEFTFIX,
+                                    RoseInstrSomLeftfix> {
+public:
+    u32 queue;
+    u32 lag;
+
+    RoseInstrSomLeftfix(u32 queue_in, u32 lag_in)
+        : queue(queue_in), lag(lag_in) {}
+
+    bool operator==(const RoseInstrSomLeftfix &ri) const {
+        return queue == ri.queue && lag == ri.lag;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), queue, lag);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSomLeftfix &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return queue == ri.queue && lag == ri.lag;
+    }
+};
+
+class RoseInstrSomFromReport
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SOM_FROM_REPORT,
+                                    ROSE_STRUCT_SOM_FROM_REPORT,
+                                    RoseInstrSomFromReport> {
+public:
+    som_operation som;
+
+    RoseInstrSomFromReport() {
+        std::memset(&som, 0, sizeof(som));
+    }
+
+    bool operator==(const RoseInstrSomFromReport &ri) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), som.type, som.onmatch);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSomFromReport &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+};
+
+class RoseInstrSomZero
+    : public RoseInstrBaseTrivial<ROSE_INSTR_SOM_ZERO, ROSE_STRUCT_SOM_ZERO,
+                                  RoseInstrSomZero> {
+public:
+    ~RoseInstrSomZero() override;
+};
+
+class RoseInstrTriggerInfix
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_TRIGGER_INFIX,
+                                    ROSE_STRUCT_TRIGGER_INFIX,
+                                    RoseInstrTriggerInfix> {
+public:
+    u8 cancel;
+    u32 queue;
+    u32 event;
+
+    RoseInstrTriggerInfix(u8 cancel_in, u32 queue_in, u32 event_in)
+        : cancel(cancel_in), queue(queue_in), event(event_in) {}
+
+    bool operator==(const RoseInstrTriggerInfix &ri) const {
+        return cancel == ri.cancel && queue == ri.queue && event == ri.event;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), cancel, queue, event);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrTriggerInfix &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return cancel == ri.cancel && queue == ri.queue && event == ri.event;
+    }
+};
+
+class RoseInstrTriggerSuffix
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_TRIGGER_SUFFIX,
+                                    ROSE_STRUCT_TRIGGER_SUFFIX,
+                                    RoseInstrTriggerSuffix> {
+public:
+    u32 queue;
+    u32 event;
+
+    RoseInstrTriggerSuffix(u32 queue_in, u32 event_in)
+        : queue(queue_in), event(event_in) {}
+
+    bool operator==(const RoseInstrTriggerSuffix &ri) const {
+        return queue == ri.queue && event == ri.event;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), queue, event);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrTriggerSuffix &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return queue == ri.queue && event == ri.event;
+    }
+};
+
+class RoseInstrDedupe
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_DEDUPE, ROSE_STRUCT_DEDUPE,
+                                    RoseInstrDedupe> {
+public:
+    u8 quash_som;
+    u32 dkey;
+    s32 offset_adjust;
+    const RoseInstruction *target;
+
+    RoseInstrDedupe(u8 quash_som_in, u32 dkey_in, s32 offset_adjust_in,
+                    const RoseInstruction *target_in)
+        : quash_som(quash_som_in), dkey(dkey_in),
+          offset_adjust(offset_adjust_in), target(target_in) {}
+
+    bool operator==(const RoseInstrDedupe &ri) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), quash_som, dkey,
+                        offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrDedupe &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrDedupeSom
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_DEDUPE_SOM,
+                                    ROSE_STRUCT_DEDUPE_SOM,
+                                    RoseInstrDedupeSom> {
+public:
+    u8 quash_som;
+    u32 dkey;
+    s32 offset_adjust;
+    const RoseInstruction *target;
+
+    RoseInstrDedupeSom(u8 quash_som_in, u32 dkey_in, s32 offset_adjust_in,
+                       const RoseInstruction *target_in)
+        : quash_som(quash_som_in), dkey(dkey_in),
+          offset_adjust(offset_adjust_in), target(target_in) {}
+
+    bool operator==(const RoseInstrDedupeSom &ri) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), quash_som, dkey,
+                        offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrDedupeSom &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               offset_adjust == ri.offset_adjust &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrReportChain
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_CHAIN,
+                                    ROSE_STRUCT_REPORT_CHAIN,
+                                    RoseInstrReportChain> {
+public:
+    u32 event;
+    u64a top_squash_distance;
+
+    RoseInstrReportChain(u32 event_in, u64a top_squash_distance_in)
+        : event(event_in), top_squash_distance(top_squash_distance_in) {}
+
+    bool operator==(const RoseInstrReportChain &ri) const {
+        return event == ri.event &&
+               top_squash_distance == ri.top_squash_distance;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), event, top_squash_distance);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportChain &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return event == ri.event &&
+               top_squash_distance == ri.top_squash_distance;
+    }
+};
+
+class RoseInstrReportSomInt
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM_INT,
+                                    ROSE_STRUCT_REPORT_SOM_INT,
+                                    RoseInstrReportSomInt> {
+public:
+    som_operation som;
+
+    RoseInstrReportSomInt() {
+        std::memset(&som, 0, sizeof(som));
+    }
+
+    bool operator==(const RoseInstrReportSomInt &ri) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), som.type, som.onmatch);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSomInt &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+};
+
+class RoseInstrReportSomAware
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM_AWARE,
+                                    ROSE_STRUCT_REPORT_SOM_AWARE,
+                                    RoseInstrReportSomAware> {
+public:
+    som_operation som;
+
+    RoseInstrReportSomAware() {
+        std::memset(&som, 0, sizeof(som));
+    }
+
+    bool operator==(const RoseInstrReportSomAware &ri) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), som.type, som.onmatch);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSomAware &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return std::memcmp(&som, &ri.som, sizeof(som)) == 0;
+    }
+};
+
+class RoseInstrReport
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT, ROSE_STRUCT_REPORT,
+                                    RoseInstrReport> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+
+    RoseInstrReport(ReportID onmatch_in, s32 offset_adjust_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {}
+
+    bool operator==(const RoseInstrReport &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), onmatch, offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReport &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+};
+
+class RoseInstrReportExhaust
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_EXHAUST,
+                                    ROSE_STRUCT_REPORT_EXHAUST,
+                                    RoseInstrReportExhaust> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+    u32 ekey;
+
+    RoseInstrReportExhaust(ReportID onmatch_in, s32 offset_adjust_in,
+                           u32 ekey_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in), ekey(ekey_in) {}
+
+    bool operator==(const RoseInstrReportExhaust &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), onmatch, offset_adjust, ekey);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportExhaust &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+};
+
+class RoseInstrReportSom
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM,
+                                    ROSE_STRUCT_REPORT_SOM,
+                                    RoseInstrReportSom> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+
+    RoseInstrReportSom(ReportID onmatch_in, s32 offset_adjust_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {}
+
+    bool operator==(const RoseInstrReportSom &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), onmatch, offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSom &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+};
+
+class RoseInstrReportSomExhaust
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM_EXHAUST,
+                                    ROSE_STRUCT_REPORT_SOM_EXHAUST,
+                                    RoseInstrReportSomExhaust> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+    u32 ekey;
+
+    RoseInstrReportSomExhaust(ReportID onmatch_in, s32 offset_adjust_in,
+                              u32 ekey_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in), ekey(ekey_in) {}
+
+    bool operator==(const RoseInstrReportSomExhaust &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), onmatch, offset_adjust, ekey);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrReportSomExhaust &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               ekey == ri.ekey;
+    }
+};
+
+class RoseInstrDedupeAndReport
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_DEDUPE_AND_REPORT,
+                                    ROSE_STRUCT_DEDUPE_AND_REPORT,
+                                    RoseInstrDedupeAndReport> {
+public:
+    u8 quash_som;
+    u32 dkey;
+    ReportID onmatch;
+    s32 offset_adjust;
+    const RoseInstruction *target;
+
+    RoseInstrDedupeAndReport(u8 quash_som_in, u32 dkey_in, ReportID onmatch_in,
+                             s32 offset_adjust_in,
+                             const RoseInstruction *target_in)
+        : quash_som(quash_som_in), dkey(dkey_in), onmatch(onmatch_in),
+          offset_adjust(offset_adjust_in), target(target_in) {}
+
+    bool operator==(const RoseInstrDedupeAndReport &ri) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), quash_som, dkey, onmatch,
+                        offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrDedupeAndReport &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return quash_som == ri.quash_som && dkey == ri.dkey &&
+               onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrFinalReport
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_FINAL_REPORT,
+                                    ROSE_STRUCT_FINAL_REPORT,
+                                    RoseInstrFinalReport> {
+public:
+    ReportID onmatch;
+    s32 offset_adjust;
+
+    RoseInstrFinalReport(ReportID onmatch_in, s32 offset_adjust_in)
+        : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {}
+
+    bool operator==(const RoseInstrFinalReport &ri) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), onmatch, offset_adjust);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrFinalReport &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
+    }
+};
+
+class RoseInstrCheckExhausted
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_EXHAUSTED,
+                                    ROSE_STRUCT_CHECK_EXHAUSTED,
+                                    RoseInstrCheckExhausted> {
+public:
+    u32 ekey;
+    const RoseInstruction *target;
+
+    RoseInstrCheckExhausted(u32 ekey_in, const RoseInstruction *target_in)
+        : ekey(ekey_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckExhausted &ri) const {
+        return ekey == ri.ekey && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), ekey);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckExhausted &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return ekey == ri.ekey &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMinLength
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MIN_LENGTH,
+                                    ROSE_STRUCT_CHECK_MIN_LENGTH,
+                                    RoseInstrCheckMinLength> {
+public:
+    s32 end_adj;
+    u64a min_length;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMinLength(s32 end_adj_in, u64a min_length_in,
+                            const RoseInstruction *target_in)
+        : end_adj(end_adj_in), min_length(min_length_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMinLength &ri) const {
+        return end_adj == ri.end_adj && min_length == ri.min_length &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), end_adj, min_length);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMinLength &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return end_adj == ri.end_adj && min_length == ri.min_length &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrSetState
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SET_STATE,
+                                    ROSE_STRUCT_SET_STATE,
+                                    RoseInstrSetState> {
+public:
+    u32 index;
+
+    explicit RoseInstrSetState(u32 index_in) : index(index_in) {}
+
+    bool operator==(const RoseInstrSetState &ri) const {
+        return index == ri.index;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), index);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSetState &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return index == ri.index;
+    }
+};
+
+class RoseInstrSetGroups
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SET_GROUPS,
+                                    ROSE_STRUCT_SET_GROUPS,
+                                    RoseInstrSetGroups> {
+public:
+    rose_group groups;
+
+    explicit RoseInstrSetGroups(rose_group groups_in) : groups(groups_in) {}
+
+    bool operator==(const RoseInstrSetGroups &ri) const {
+        return groups == ri.groups;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), groups);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSetGroups &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return groups == ri.groups;
+    }
+};
+
+class RoseInstrSquashGroups
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_SQUASH_GROUPS,
+                                    ROSE_STRUCT_SQUASH_GROUPS,
+                                    RoseInstrSquashGroups> {
+public:
+    rose_group groups;
+
+    explicit RoseInstrSquashGroups(rose_group groups_in) : groups(groups_in) {}
+
+    bool operator==(const RoseInstrSquashGroups &ri) const {
+        return groups == ri.groups;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), groups);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSquashGroups &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return groups == ri.groups;
+    }
+};
+
+class RoseInstrCheckState
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_STATE,
+                                    ROSE_STRUCT_CHECK_STATE,
+                                    RoseInstrCheckState> {
+public:
+    u32 index;
+    const RoseInstruction *target;
+
+    RoseInstrCheckState(u32 index_in, const RoseInstruction *target_in)
+        : index(index_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckState &ri) const {
+        return index == ri.index && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), index);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckState &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return index == ri.index &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrSparseIterBegin
+    : public RoseInstrBase<ROSE_INSTR_SPARSE_ITER_BEGIN,
+                           ROSE_STRUCT_SPARSE_ITER_BEGIN,
+                           RoseInstrSparseIterBegin> {
+public:
+    u32 num_keys; // total number of multibit keys
+    std::vector<std::pair<u32, const RoseInstruction *>> jump_table;
+    const RoseInstruction *target;
+
+    RoseInstrSparseIterBegin(u32 num_keys_in,
+                             const RoseInstruction *target_in)
+        : num_keys(num_keys_in), target(target_in) {}
+
+    bool operator==(const RoseInstrSparseIterBegin &ri) const {
+        return num_keys == ri.num_keys && jump_table == ri.jump_table &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        size_t v = hash_all(static_cast<u32>(opcode), num_keys);
+        for (const u32 &key : jump_table | boost::adaptors::map_keys) {
+            boost::hash_combine(v, key);
+        }
+        return v;
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    void update_target(const RoseInstruction *old_target,
+                       const RoseInstruction *new_target) override {
+        if (target == old_target) {
+            target = new_target;
+        }
+        for (auto &jump : jump_table) {
+            if (jump.second == old_target) {
+                jump.second = new_target;
+            }
+        }
+    }
+
+    bool equiv_to(const RoseInstrSparseIterBegin &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        if (iter_offset != ri.iter_offset ||
+            offsets.at(target) != other_offsets.at(ri.target)) {
+            return false;
+        }
+        if (jump_table.size() != ri.jump_table.size()) {
+            return false;
+        }
+        auto it1 = jump_table.begin(), it2 = ri.jump_table.begin();
+        for (; it1 != jump_table.end(); ++it1, ++it2) {
+            if (it1->first != it2->first) {
+                return false;
+            }
+            if (offsets.at(it1->second) != other_offsets.at(it2->second)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+private:
+    friend class RoseInstrSparseIterNext;
+
+    // These variables allow us to use the same multibit iterator and jump
+    // table in subsequent SPARSE_ITER_NEXT write() operations.
+    mutable bool is_written = false;
+    mutable u32 iter_offset = 0;
+    mutable u32 jump_table_offset = 0;
+};
+
+class RoseInstrSparseIterNext
+    : public RoseInstrBase<ROSE_INSTR_SPARSE_ITER_NEXT,
+                           ROSE_STRUCT_SPARSE_ITER_NEXT,
+                           RoseInstrSparseIterNext> {
+public:
+    u32 state;
+    const RoseInstrSparseIterBegin *begin;
+    const RoseInstruction *target;
+
+    RoseInstrSparseIterNext(u32 state_in,
+                            const RoseInstrSparseIterBegin *begin_in,
+                            const RoseInstruction *target_in)
+        : state(state_in), begin(begin_in), target(target_in) {}
+
+    bool operator==(const RoseInstrSparseIterNext &ri) const {
+        return state == ri.state && begin == ri.begin && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), state);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    void update_target(const RoseInstruction *old_target,
+                       const RoseInstruction *new_target) override {
+        if (target == old_target) {
+            target = new_target;
+        }
+        if (begin == old_target) {
+            assert(new_target->code() == ROSE_INSTR_SPARSE_ITER_BEGIN);
+            begin = static_cast<const RoseInstrSparseIterBegin *>(new_target);
+        }
+    }
+
+    bool equiv_to(const RoseInstrSparseIterNext &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return state == ri.state &&
+               offsets.at(begin) == other_offsets.at(ri.begin) &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrSparseIterAny
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_SPARSE_ITER_ANY,
+                                    ROSE_STRUCT_SPARSE_ITER_ANY,
+                                    RoseInstrSparseIterAny> {
+public:
+    u32 num_keys; // total number of multibit keys
+    std::vector<u32> keys;
+    const RoseInstruction *target;
+
+    RoseInstrSparseIterAny(u32 num_keys_in, std::vector<u32> keys_in,
+                           const RoseInstruction *target_in)
+        : num_keys(num_keys_in), keys(std::move(keys_in)), target(target_in) {}
+
+    bool operator==(const RoseInstrSparseIterAny &ri) const {
+        return num_keys == ri.num_keys && keys == ri.keys &&
+               target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), num_keys, keys);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrSparseIterAny &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return num_keys == ri.num_keys && keys == ri.keys &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrEnginesEod
+    : public RoseInstrBaseNoTargets<ROSE_INSTR_ENGINES_EOD,
+                                    ROSE_STRUCT_ENGINES_EOD,
+                                    RoseInstrEnginesEod> {
+public:
+    u32 iter_offset;
+
+    explicit RoseInstrEnginesEod(u32 iter_in) : iter_offset(iter_in) {}
+
+    bool operator==(const RoseInstrEnginesEod &ri) const {
+        return iter_offset == ri.iter_offset;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), iter_offset);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrEnginesEod &ri, const OffsetMap &,
+                  const OffsetMap &) const {
+        return iter_offset == ri.iter_offset;
+    }
+};
+
+class RoseInstrSuffixesEod
+    : public RoseInstrBaseTrivial<ROSE_INSTR_SUFFIXES_EOD,
+                                  ROSE_STRUCT_SUFFIXES_EOD,
+                                  RoseInstrSuffixesEod> {
+public:
+    ~RoseInstrSuffixesEod() override;
+};
+
+class RoseInstrMatcherEod : public RoseInstrBaseTrivial<ROSE_INSTR_MATCHER_EOD,
+                                                        ROSE_STRUCT_MATCHER_EOD,
+                                                        RoseInstrMatcherEod> {
+public:
+    ~RoseInstrMatcherEod() override;
+};
+
+class RoseInstrCheckLongLit
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_LONG_LIT,
+                                    ROSE_STRUCT_CHECK_LONG_LIT,
+                                    RoseInstrCheckLongLit> {
+public:
+    std::string literal;
+    const RoseInstruction *target;
+
+    RoseInstrCheckLongLit(std::string literal_in,
+                          const RoseInstruction *target_in)
+        : literal(std::move(literal_in)), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckLongLit &ri) const {
+        return literal == ri.literal && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), literal);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckLongLit &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return literal == ri.literal &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckLongLitNocase
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_LONG_LIT_NOCASE,
+                                    ROSE_STRUCT_CHECK_LONG_LIT_NOCASE,
+                                    RoseInstrCheckLongLitNocase> {
+public:
+    std::string literal;
+    const RoseInstruction *target;
+
+    RoseInstrCheckLongLitNocase(std::string literal_in,
+                                const RoseInstruction *target_in)
+        : literal(std::move(literal_in)), target(target_in) {
+        upperString(literal);
+    }
+
+    bool operator==(const RoseInstrCheckLongLitNocase &ri) const {
+        return literal == ri.literal && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), literal);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckLongLitNocase &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return literal == ri.literal &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMedLit
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MED_LIT,
+                                    ROSE_STRUCT_CHECK_MED_LIT,
+                                    RoseInstrCheckMedLit> {
+public:
+    std::string literal;
+    const RoseInstruction *target;
+
+    explicit RoseInstrCheckMedLit(std::string literal_in,
+                                  const RoseInstruction *target_in)
+        : literal(std::move(literal_in)), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMedLit &ri) const {
+        return literal == ri.literal && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), literal);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMedLit &ri, const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return literal == ri.literal &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMedLitNocase
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MED_LIT_NOCASE,
+                                    ROSE_STRUCT_CHECK_MED_LIT_NOCASE,
+                                    RoseInstrCheckMedLitNocase> {
+public:
+    std::string literal;
+    const RoseInstruction *target;
+
+    explicit RoseInstrCheckMedLitNocase(std::string literal_in,
+                                        const RoseInstruction *target_in)
+        : literal(std::move(literal_in)), target(target_in) {
+        upperString(literal);
+    }
+
+    bool operator==(const RoseInstrCheckMedLitNocase &ri) const {
+        return literal == ri.literal && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), literal);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMedLitNocase &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return literal == ri.literal &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrClearWorkDone
+    : public RoseInstrBaseTrivial<ROSE_INSTR_CLEAR_WORK_DONE,
+                                  ROSE_STRUCT_CLEAR_WORK_DONE,
+                                  RoseInstrClearWorkDone> {
+public:
+    ~RoseInstrClearWorkDone() override;
+};
+
+class RoseInstrMultipathLookaround
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_MULTIPATH_LOOKAROUND,
+                                    ROSE_STRUCT_MULTIPATH_LOOKAROUND,
+                                    RoseInstrMultipathLookaround> {
+public:
+    std::vector<std::vector<LookEntry>> multi_look;
+    s32 last_start;
+    std::array<u8, MULTIPATH_MAX_LEN> start_mask;
+    const RoseInstruction *target;
+
+    RoseInstrMultipathLookaround(std::vector<std::vector<LookEntry>> ml,
+                                 s32 last_start_in,
+                                 std::array<u8, MULTIPATH_MAX_LEN> start_mask_in,
+                                 const RoseInstruction *target_in)
+        : multi_look(std::move(ml)), last_start(last_start_in),
+          start_mask(std::move(start_mask_in)), target(target_in) {}
+
+    bool operator==(const RoseInstrMultipathLookaround &ri) const {
+        return multi_look == ri.multi_look && last_start == ri.last_start
+            && start_mask == ri.start_mask && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), multi_look, last_start,
+                        start_mask);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrMultipathLookaround &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return multi_look == ri.multi_look && last_start == ri.last_start
+            && start_mask == ri.start_mask
+            && offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti16x8
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8,
+                                    RoseInstrCheckMultipathShufti16x8> {
+public:
+    std::array<u8, 32> nib_mask;
+    std::array<u8, 16> bucket_select_mask;
+    std::array<u8, 16> data_select_mask;
+    u16 hi_bits_mask;
+    u16 lo_bits_mask;
+    u16 neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti16x8(std::array<u8, 32> nib_mask_in,
+                                      std::array<u8, 16> bucket_select_mask_in,
+                                      std::array<u8, 16> data_select_mask_in,
+                                      u16 hi_bits_mask_in, u16 lo_bits_mask_in,
+                                      u16 neg_mask_in, s32 base_offset_in,
+                                      s32 last_start_in,
+                                      const RoseInstruction *target_in)
+        : nib_mask(std::move(nib_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti16x8 &ri) const {
+        return nib_mask == ri.nib_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
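+    // As with the other check instructions, hash() omits the jump target;
+    // equiv_to() accounts for it by comparing encoded offsets.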
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), nib_mask,
+                        bucket_select_mask, data_select_mask, hi_bits_mask,
+                        lo_bits_mask, neg_mask, base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti16x8 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return nib_mask == ri.nib_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti32x8
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8,
+                                    RoseInstrCheckMultipathShufti32x8> {
+public:
+    std::array<u8, 16> hi_mask;
+    std::array<u8, 16> lo_mask;
+    std::array<u8, 32> bucket_select_mask;
+    std::array<u8, 32> data_select_mask;
+    u32 hi_bits_mask;
+    u32 lo_bits_mask;
+    u32 neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti32x8(std::array<u8, 16> hi_mask_in,
+                                      std::array<u8, 16> lo_mask_in,
+                                      std::array<u8, 32> bucket_select_mask_in,
+                                      std::array<u8, 32> data_select_mask_in,
+                                      u32 hi_bits_mask_in, u32 lo_bits_mask_in,
+                                      u32 neg_mask_in, s32 base_offset_in,
+                                      s32 last_start_in,
+                                      const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti32x8 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask, data_select_mask, hi_bits_mask,
+                        lo_bits_mask, neg_mask, base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti32x8 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti32x16
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16,
+                                    RoseInstrCheckMultipathShufti32x16> {
+public:
+    std::array<u8, 32> hi_mask;
+    std::array<u8, 32> lo_mask;
+    std::array<u8, 32> bucket_select_mask_hi;
+    std::array<u8, 32> bucket_select_mask_lo;
+    std::array<u8, 32> data_select_mask;
+    u32 hi_bits_mask;
+    u32 lo_bits_mask;
+    u32 neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti32x16(std::array<u8, 32> hi_mask_in,
+                                       std::array<u8, 32> lo_mask_in,
+                                       std::array<u8, 32> bucket_select_mask_hi_in,
+                                       std::array<u8, 32> bucket_select_mask_lo_in,
+                                       std::array<u8, 32> data_select_mask_in,
+                                       u32 hi_bits_mask_in, u32 lo_bits_mask_in,
+                                       u32 neg_mask_in, s32 base_offset_in,
+                                       s32 last_start_in,
+                                       const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask_hi(std::move(bucket_select_mask_hi_in)),
+          bucket_select_mask_lo(std::move(bucket_select_mask_lo_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti32x16 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask_hi, bucket_select_mask_lo,
+                        data_select_mask, hi_bits_mask, lo_bits_mask, neg_mask,
+                        base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti32x16 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask_hi == ri.bucket_select_mask_hi &&
+               bucket_select_mask_lo == ri.bucket_select_mask_lo &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrCheckMultipathShufti64
+    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64,
+                                    ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64,
+                                    RoseInstrCheckMultipathShufti64> {
+public:
+    std::array<u8, 16> hi_mask;
+    std::array<u8, 16> lo_mask;
+    std::array<u8, 64> bucket_select_mask;
+    std::array<u8, 64> data_select_mask;
+    u64a hi_bits_mask;
+    u64a lo_bits_mask;
+    u64a neg_mask;
+    s32 base_offset;
+    s32 last_start;
+    const RoseInstruction *target;
+
+    RoseInstrCheckMultipathShufti64(std::array<u8, 16> hi_mask_in,
+                                    std::array<u8, 16> lo_mask_in,
+                                    std::array<u8, 64> bucket_select_mask_in,
+                                    std::array<u8, 64> data_select_mask_in,
+                                    u64a hi_bits_mask_in, u64a lo_bits_mask_in,
+                                    u64a neg_mask_in, s32 base_offset_in,
+                                    s32 last_start_in,
+                                    const RoseInstruction *target_in)
+        : hi_mask(std::move(hi_mask_in)), lo_mask(std::move(lo_mask_in)),
+          bucket_select_mask(std::move(bucket_select_mask_in)),
+          data_select_mask(std::move(data_select_mask_in)),
+          hi_bits_mask(hi_bits_mask_in), lo_bits_mask(lo_bits_mask_in),
+          neg_mask(neg_mask_in), base_offset(base_offset_in),
+          last_start(last_start_in), target(target_in) {}
+
+    bool operator==(const RoseInstrCheckMultipathShufti64 &ri) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask &&
+               neg_mask == ri.neg_mask && base_offset == ri.base_offset &&
+               last_start == ri.last_start && target == ri.target;
+    }
+
+    size_t hash() const override {
+        return hash_all(static_cast<u32>(opcode), hi_mask, lo_mask,
+                        bucket_select_mask, data_select_mask, hi_bits_mask,
+                        lo_bits_mask, neg_mask, base_offset, last_start);
+    }
+
+    void write(void *dest, RoseEngineBlob &blob,
+               const OffsetMap &offset_map) const override;
+
+    bool equiv_to(const RoseInstrCheckMultipathShufti64 &ri,
+                  const OffsetMap &offsets,
+                  const OffsetMap &other_offsets) const {
+        return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask &&
+               bucket_select_mask == ri.bucket_select_mask &&
+               data_select_mask == ri.data_select_mask &&
+               hi_bits_mask == ri.hi_bits_mask &&
+               lo_bits_mask == ri.lo_bits_mask && neg_mask == ri.neg_mask &&
+               base_offset == ri.base_offset && last_start == ri.last_start &&
+               offsets.at(target) == other_offsets.at(ri.target);
+    }
+};
+
+class RoseInstrEnd
+    : public RoseInstrBaseTrivial<ROSE_INSTR_END, ROSE_STRUCT_END,
+                                  RoseInstrEnd> {
+public:
+    ~RoseInstrEnd() override;
+};
+
+}
+#endif
diff --git a/src/rose/rose_build_lit_accel.cpp b/src/rose/rose_build_lit_accel.cpp
new file mode 100644
index 000000000..b389f493d
--- /dev/null
+++ b/src/rose/rose_build_lit_accel.cpp
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "rose_build_lit_accel.h"
+
+#include "grey.h"
+#include "ue2common.h"
+#include "hwlm/hwlm_build.h"
+#include "hwlm/hwlm_internal.h"
+#include "hwlm/hwlm_literal.h"
+#include "nfa/accel.h"
+#include "nfa/shufticompile.h"
+#include "nfa/trufflecompile.h"
+#include "util/compare.h"
+#include "util/dump_charclass.h"
+#include "util/ue2string.h"
+#include "util/verify_types.h"
+
+using namespace std;
+
+namespace ue2 {
+
+static const unsigned int MAX_ACCEL_OFFSET = 16;
+static const unsigned int MAX_SHUFTI_WIDTH = 240;
+
+static
+size_t mask_overhang(const AccelString &lit) {
+    size_t msk_true_size = lit.msk.size();
+    assert(msk_true_size <= HWLM_MASKLEN);
+    assert(HWLM_MASKLEN <= MAX_ACCEL_OFFSET);
+    for (u8 c : lit.msk) {
+        if (!c) {
+            msk_true_size--;
+        } else {
+            break;
+        }
+    }
+
+    if (lit.s.length() >= msk_true_size) {
+        return 0;
+    }
+
+    /* only short literals should be able to have a mask which overhangs */
+    assert(lit.s.length() < MAX_ACCEL_OFFSET);
+    return msk_true_size - lit.s.length();
+}
+
+static
+bool findDVerm(const vector<const AccelString *> &lits, AccelAux *aux) {
+    const AccelString &first = *lits.front();
+
+    struct candidate {
+        candidate(void)
+            : c1(0), c2(0), max_offset(0), b5insens(false), valid(false) {}
+        candidate(const AccelString &base, u32 offset)
+            : c1(base.s[offset]), c2(base.s[offset + 1]), max_offset(0),
+              b5insens(false), valid(true) {}
+        char c1;
+        char c2;
+        u32 max_offset;
+        bool b5insens;
+        bool valid;
+
+        bool operator>(const candidate &other) const {
+            if (!valid) {
+                return false;
+            }
+
+            if (!other.valid) {
+                return true;
+            }
+
+            if (other.cdiffers() && !cdiffers()) {
+                return false;
+            }
+
+            if (!other.cdiffers() && cdiffers()) {
+                return true;
+            }
+
+            if (!other.b5insens && b5insens) {
+                return false;
+            }
+
+            if (other.b5insens && !b5insens) {
+                return true;
+            }
+
+            if (max_offset > other.max_offset) {
+                return false;
+            }
+
+            return true;
+        }
+
+        bool cdiffers(void) const {
+            if (!b5insens) {
+                return c1 != c2;
+            }
+            return (c1 & CASE_CLEAR) != (c2 & CASE_CLEAR);
+        }
+    };
+
+    candidate best;
+
+    for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()) - 1; i++) {
+        candidate curr(first, i);
+
+        /* check to see if this pair appears in each string */
+        for (const auto &lit_ptr : lits) {
+            const AccelString &lit = *lit_ptr;
+            if (lit.nocase && (ourisalpha(curr.c1) || ourisalpha(curr.c2))) {
+                curr.b5insens = true; /* no choice but to be case insensitive */
+            }
+
+            bool found = false;
+            bool found_nc = false;
+            for (u32 j = 0;
+                 !found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 1; j++) {
+                found |= curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1];
+                found_nc |= (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR)
+                    && (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR);
+
+                if (curr.b5insens) {
+                    found = found_nc;
+                }
+            }
+
+            if (!curr.b5insens && !found && found_nc) {
+                curr.b5insens = true;
+                found = true;
+            }
+
+            if (!found) {
+                goto next_candidate;
+            }
+        }
+
+        /* check to find the max offset where this appears */
+        for (const auto &lit_ptr : lits) {
+            const AccelString &lit = *lit_ptr;
+            for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()) - 1;
+                 j++) {
+                bool found = false;
+                if (curr.b5insens) {
+                    found = (curr.c1 & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR)
+                        && (curr.c2 & CASE_CLEAR) == (lit.s[j + 1] & CASE_CLEAR);
+                } else {
+                    found = curr.c1 == lit.s[j] && curr.c2 == lit.s[j + 1];
+                }
+
+                if (found) {
+                    assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET);
+                    ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit));
+                    break;
+                }
+            }
+        }
+
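+        /* keep the candidate that ranks best under operator> above */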
+        if (curr > best) {
+            best = curr;
+        }
+
+    next_candidate:;
+    }
+
+    if (!best.valid) {
+        return false;
+    }
+
+    aux->dverm.offset = verify_u8(best.max_offset);
+
+    if (!best.b5insens) {
+        aux->dverm.accel_type = ACCEL_DVERM;
+        aux->dverm.c1 = best.c1;
+        aux->dverm.c2 = best.c2;
+        DEBUG_PRINTF("built dverm for %02hhx%02hhx\n",
+                     aux->dverm.c1, aux->dverm.c2);
+    } else {
+        aux->dverm.accel_type = ACCEL_DVERM_NOCASE;
+        aux->dverm.c1 = best.c1 & CASE_CLEAR;
+        aux->dverm.c2 = best.c2 & CASE_CLEAR;
+        DEBUG_PRINTF("built dverm nc for %02hhx%02hhx\n",
+                     aux->dverm.c1, aux->dverm.c2);
+    }
+    return true;
+}
+
+static
+bool findSVerm(const vector<const AccelString *> &lits, AccelAux *aux) {
+    const AccelString &first = *lits.front();
+
+    struct candidate {
+        candidate(void)
+            : c(0), max_offset(0), b5insens(false), valid(false) {}
+        candidate(const AccelString &base, u32 offset)
+            : c(base.s[offset]), max_offset(0),
+              b5insens(false), valid(true) {}
+        char c;
+        u32 max_offset;
+        bool b5insens;
+        bool valid;
+
+        bool operator>(const candidate &other) const {
+            if (!valid) {
+                return false;
+            }
+
+            if (!other.valid) {
+                return true;
+            }
+
+            if (!other.b5insens && b5insens) {
+                return false;
+            }
+
+            if (other.b5insens && !b5insens) {
+                return true;
+            }
+
+            if (max_offset > other.max_offset) {
+                return false;
+            }
+
+            return true;
+        }
+    };
+
+    candidate best;
+
+    for (u32 i = 0; i < MIN(MAX_ACCEL_OFFSET, first.s.length()); i++) {
+        candidate curr(first, i);
+
+        /* check to see if this character appears in each string */
+        for (const auto &lit_ptr : lits) {
+            const AccelString &lit = *lit_ptr;
+            if (lit.nocase && ourisalpha(curr.c)) {
+                curr.b5insens = true; /* no choice but to be case insensitive */
+            }
+
+            bool found = false;
+            bool found_nc = false;
+            for (u32 j = 0;
+                 !found && j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) {
+                found |= curr.c == lit.s[j];
+                found_nc |= (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR);
+
+                if (curr.b5insens) {
+                    found = found_nc;
+                }
+            }
+
+            if (!curr.b5insens && !found && found_nc) {
+                curr.b5insens = true;
+                found = true;
+            }
+
+            if (!found) {
+                goto next_candidate;
+            }
+        }
+
+        /* check to find the max offset where this appears */
+        for (const auto &lit_ptr : lits) {
+            const AccelString &lit = *lit_ptr;
+            for (u32 j = 0; j < MIN(MAX_ACCEL_OFFSET, lit.s.length()); j++) {
+                bool found = false;
+                if (curr.b5insens) {
+                    found = (curr.c & CASE_CLEAR) == (lit.s[j] & CASE_CLEAR);
+                } else {
+                    found = curr.c == lit.s[j];
+                }
+
+                if (found) {
+                    assert(j + mask_overhang(lit) <= MAX_ACCEL_OFFSET);
+                    ENSURE_AT_LEAST(&curr.max_offset, j + mask_overhang(lit));
+                }
+            }
+        }
+
+        if (curr > best) {
+            best = curr;
+        }
+
+    next_candidate:;
+    }
+
+    if (!best.valid) {
+        return false;
+    }
+
+    if (!best.b5insens) {
+        aux->verm.accel_type = ACCEL_VERM;
+        aux->verm.c = best.c;
+        DEBUG_PRINTF("built verm for %02hhx\n", aux->verm.c);
+    } else {
+        aux->verm.accel_type = ACCEL_VERM_NOCASE;
+        aux->verm.c = best.c & CASE_CLEAR;
+        DEBUG_PRINTF("built verm nc for %02hhx\n", aux->verm.c);
+    }
+    aux->verm.offset = verify_u8(best.max_offset);
+
+    return true;
+}
+
+static
+void filterLits(const vector<AccelString> &lits, hwlm_group_t expected_groups,
+                vector<const AccelString *> *filtered_lits, u32 *min_len) {
+    *min_len = MAX_ACCEL_OFFSET;
+
+    for (const auto &lit : lits) {
+        if (!(lit.groups & expected_groups)) {
+            continue;
+        }
+
+        const size_t lit_len = lit.s.length();
+        if (lit_len < *min_len) {
+            *min_len = verify_u32(lit_len);
+        }
+
+        DEBUG_PRINTF("lit: '%s', nocase=%d, groups=0x%llx\n",
+                     escapeString(lit.s).c_str(), lit.nocase ? 1 : 0,
+                     lit.groups);
+        filtered_lits->push_back(&lit);
+    }
+}
+
+static
+bool litGuardedByCharReach(const CharReach &cr, const AccelString &lit,
+                           u32 max_offset) {
+    for (u32 i = 0; i <= max_offset && i < lit.s.length(); i++) {
+        unsigned char c = lit.s[i];
+        if (lit.nocase) {
+            if (cr.test(mytoupper(c)) && cr.test(mytolower(c))) {
+                return true;
+            }
+        } else {
+            if (cr.test(c)) {
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+static
+void findForwardAccelScheme(const vector<AccelString> &lits,
+                            hwlm_group_t expected_groups, AccelAux *aux) {
+    DEBUG_PRINTF("building accel expected=%016llx\n", expected_groups);
+    u32 min_len = MAX_ACCEL_OFFSET;
+    vector<const AccelString *> filtered_lits;
+
+    filterLits(lits, expected_groups, &filtered_lits, &min_len);
+    if (filtered_lits.empty()) {
+        return;
+    }
+
+    if (findDVerm(filtered_lits, aux)
+        || findSVerm(filtered_lits, aux)) {
+        return;
+    }
+
+    /* look for shufti/truffle */
+
+    vector<CharReach> reach(MAX_ACCEL_OFFSET, CharReach());
+    for (const auto &lit : lits) {
+        if (!(lit.groups & expected_groups)) {
+            continue;
+        }
+
+        u32 overhang = mask_overhang(lit);
+        for (u32 i = 0; i < overhang; i++) {
+            /* this offset overhangs the start of the real literal; look at the
+             * msk/cmp */
+            for (u32 j = 0; j < N_CHARS; j++) {
+                if ((j & lit.msk[i]) == lit.cmp[i]) {
+                    reach[i].set(j);
+                }
+            }
+        }
+        for (u32 i = overhang; i < MAX_ACCEL_OFFSET; i++) {
+            CharReach &reach_i = reach[i];
+            u32 i_effective = i - overhang;
+
+            if (litGuardedByCharReach(reach_i, lit, i_effective)) {
+                continue;
+            }
+            unsigned char c = i_effective < lit.s.length() ? lit.s[i_effective]
+                                                           : lit.s.back();
+            if (lit.nocase) {
+                reach_i.set(mytoupper(c));
+                reach_i.set(mytolower(c));
+            } else {
+                reach_i.set(c);
+            }
+        }
+    }
+
+    u32 min_count = ~0U;
+    u32 min_offset = ~0U;
+    for (u32 i = 0; i < MAX_ACCEL_OFFSET; i++) {
+        size_t count = reach[i].count();
+        DEBUG_PRINTF("offset %u is %s (reach %zu)\n", i,
+                     describeClass(reach[i]).c_str(), count);
+        if (count < min_count) {
+            min_count = (u32)count;
+            min_offset = i;
+        }
+    }
+
+    if (min_count > MAX_SHUFTI_WIDTH) {
+        DEBUG_PRINTF("FAIL: min shufti with %u chars is too wide\n", min_count);
+        return;
+    }
+
+    const CharReach &cr = reach[min_offset];
+    if (-1 !=
+        shuftiBuildMasks(cr, (u8 *)&aux->shufti.lo, (u8 *)&aux->shufti.hi)) {
+        DEBUG_PRINTF("built shufti for %s (%zu chars, offset %u)\n",
+                     describeClass(cr).c_str(), cr.count(), min_offset);
+        aux->shufti.accel_type = ACCEL_SHUFTI;
+        aux->shufti.offset = verify_u8(min_offset);
+        return;
+    }
+
+    truffleBuildMasks(cr, (u8 *)&aux->truffle.mask1, (u8 *)&aux->truffle.mask2);
+    DEBUG_PRINTF("built truffle for %s (%zu chars, offset %u)\n",
+                 describeClass(cr).c_str(), cr.count(), min_offset);
+    aux->truffle.accel_type = ACCEL_TRUFFLE;
+    aux->truffle.offset = verify_u8(min_offset);
+}
+
+void buildForwardAccel(HWLM *h, const vector<AccelString> &lits,
+                       hwlm_group_t expected_groups) {
+    findForwardAccelScheme(lits, expected_groups, &h->accel1);
+    findForwardAccelScheme(lits, HWLM_ALL_GROUPS, &h->accel0);
+
+    h->accel1_groups = expected_groups;
+}
+
+} // namespace ue2
diff --git a/src/rose/rose_build_lit_accel.h b/src/rose/rose_build_lit_accel.h
new file mode 100644
index 000000000..f0c014348
--- /dev/null
+++ b/src/rose/rose_build_lit_accel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
diff --git a/src/rose/rose_build_lit_accel.h b/src/rose/rose_build_lit_accel.h
new file mode 100644
index 000000000..f0c014348
--- /dev/null
+++ b/src/rose/rose_build_lit_accel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ROSE_BUILD_LIT_ACCEL_H
+#define ROSE_BUILD_LIT_ACCEL_H
+
+#include "hwlm/hwlm.h"
+
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+struct HWLM;
+
+namespace ue2 {
+
+struct AccelString {
+    AccelString(std::string s_in, bool nocase_in, std::vector<u8> msk_in,
+                std::vector<u8> cmp_in, hwlm_group_t groups_in)
+        : s(std::move(s_in)), nocase(nocase_in), msk(std::move(msk_in)),
+          cmp(std::move(cmp_in)), groups(groups_in) {}
+
+    std::string s;
+    bool nocase;
+    std::vector<u8> msk;
+    std::vector<u8> cmp;
+    hwlm_group_t groups;
+
+    bool operator==(const AccelString &a) const {
+        return s == a.s && nocase == a.nocase && msk == a.msk && cmp == a.cmp &&
+               groups == a.groups;
+    }
+
+    bool operator<(const AccelString &a) const {
+        return std::tie(s, nocase, msk, cmp, groups) <
+               std::tie(a.s, a.nocase, a.msk, a.cmp, a.groups);
+    }
+};
+
+void buildForwardAccel(HWLM *h, const std::vector<AccelString> &lits,
+                       hwlm_group_t expected_groups);
+
+} // namespace ue2
+
+#endif // ROSE_BUILD_LIT_ACCEL_H
diff --git a/src/rose/rose_build_long_lit.cpp b/src/rose/rose_build_long_lit.cpp
index c32f49d00..7ebf73ecb 100644
--- a/src/rose/rose_build_long_lit.cpp
+++ b/src/rose/rose_build_long_lit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,7 +31,7 @@
 #include "rose_build_engine_blob.h"
 #include "rose_build_impl.h"
 #include "stream_long_lit_hash.h"
-#include "util/alloc.h"
+#include "util/bytecode_ptr.h"
 #include "util/bitutils.h"
 #include "util/verify_types.h"
 #include "util/compile_context.h"
@@ -401,7 +401,7 @@ u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob,
     u8 streamBitsNocase = lg2(roundUpToPowerOfTwo(tab_nocase.size() + 2));
     u32 tot_state_bytes = ROUNDUP_N(streamBitsCase + streamBitsNocase, 8) / 8;
 
-    auto table = aligned_zmalloc_unique<char>(tabSize);
+    auto table = make_zeroed_bytecode_ptr<char>(tabSize, 16);
     assert(table); // otherwise would have thrown std::bad_alloc
 
     // Fill in the RoseLongLitTable header structure.
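The AccelString comparators in the new header above use the std::tie idiom: tying the members into tuples of references gives memberwise lexicographic ordering with no hand-written comparison chain. A self-contained sketch of the idiom (the struct here is illustrative, not from the patch):

#include <string>
#include <tuple>

struct Key {
    std::string s;
    bool nocase;
    int delay;

    bool operator<(const Key &o) const {
        // tuple's operator< compares field by field, consulting later
        // fields only when the earlier ones compare equal.
        return std::tie(s, nocase, delay) < std::tie(o.s, o.nocase, o.delay);
    }
};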
@@ -435,7 +435,7 @@ u32 buildLongLiteralTable(const RoseBuildImpl &build, RoseEngineBlob &blob,
     *historyRequired = max(*historyRequired, max_len);
     *longLitStreamStateRequired = tot_state_bytes;
 
-    return blob.add(table.get(), tabSize, 16);
+    return blob.add(table);
 }
 
 } // namespace ue2
diff --git a/src/rose/rose_build_lookaround.cpp b/src/rose/rose_build_lookaround.cpp
index 10bd59dea..a46a1aeb6 100644
--- a/src/rose/rose_build_lookaround.cpp
+++ b/src/rose/rose_build_lookaround.cpp
@@ -45,6 +45,7 @@
 #include
 #include
+#include <sstream>
 
 using namespace std;
 
@@ -62,6 +63,20 @@ static const u32 MAX_LOOKAROUND_ENTRIES = 16;
 /** \brief We would rather have lookarounds with smaller reach than this. */
 static const u32 LOOKAROUND_WIDE_REACH = 200;
 
+#if defined(DEBUG) || defined(DUMP_SUPPORT)
+static UNUSED
+string dump(const map<s32, CharReach> &look) {
+    ostringstream oss;
+    for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
+        if (it != look.begin()) {
+            oss << ", ";
+        }
+        oss << "{" << it->first << ": " << describeClass(it->second) << "}";
+    }
+    return oss.str();
+}
+#endif
+
 static
 void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) {
     ue2::flat_set<NFAVertex> curr, next;
@@ -298,21 +313,6 @@ void findBackwardReach(const RoseGraph &g, const RoseVertex v,
     // TODO: implement DFA variants if necessary.
 }
 
-#if defined(DEBUG) || defined(DUMP_SUPPORT)
-#include <sstream>
-static UNUSED
-string dump(const map<s32, CharReach> &look) {
-    ostringstream oss;
-    for (auto it = look.begin(), ite = look.end(); it != ite; ++it) {
-        if (it != look.begin()) {
-            oss << ", ";
-        }
-        oss << "{" << it->first << ": " << describeClass(it->second) << "}";
-    }
-    return oss.str();
-}
-#endif
-
 static
 void normalise(map<s32, CharReach> &look) {
     // We can erase entries where the reach is "all characters".
@@ -447,7 +447,7 @@ static
 void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v,
                     set<CharReach> &flood_reach) {
     for (u32 lit_id : tbi.g[v].literals) {
-        const ue2_literal &s = tbi.literals.right.at(lit_id).s;
+        const ue2_literal &s = tbi.literals.at(lit_id).s;
         if (s.empty()) {
             continue;
         }
@@ -460,13 +460,24 @@ void findFloodReach(const RoseBuildImpl &tbi, const RoseVertex v,
     }
 }
 
+
+namespace {
+struct LookProto {
+    LookProto(s32 offset_in, CharReach reach_in)
+        : offset(offset_in), reach(move(reach_in)) {}
+    s32 offset;
+    CharReach reach;
+};
+}
+
 static
-map<s32, CharReach> findLiteralReach(const rose_literal_id &lit) {
-    map<s32, CharReach> look;
+vector<LookProto> findLiteralReach(const rose_literal_id &lit) {
+    vector<LookProto> look;
+    look.reserve(lit.s.length());
 
-    u32 i = lit.delay + 1;
-    for (auto it = lit.s.rbegin(), ite = lit.s.rend(); it != ite; ++it) {
-        look[0 - i] |= *it;
+    s32 i = 0 - lit.s.length() - lit.delay;
+    for (const auto &c : lit.s) {
+        look.emplace_back(i, c);
         i++;
     }
 
@@ -478,22 +489,40 @@ map<s32, CharReach> findLiteralReach(const RoseBuildImpl &build,
                                      const RoseVertex v) {
     bool first = true;
     map<s32, CharReach> look;
+
     for (u32 lit_id : build.g[v].literals) {
-        const rose_literal_id &lit = build.literals.right.at(lit_id);
+        const rose_literal_id &lit = build.literals.at(lit_id);
         auto lit_look = findLiteralReach(lit);
 
         if (first) {
-            look = move(lit_look);
+            for (auto &p : lit_look) {
+                look.emplace(p.offset, p.reach);
+            }
             first = false;
-        } else {
-            for (auto it = look.begin(); it != look.end();) {
-                auto l_it = lit_look.find(it->first);
-                if (l_it == lit_look.end()) {
-                    it = look.erase(it);
-                } else {
-                    it->second |= l_it->second;
-                    ++it;
-                }
+            continue;
+        }
+
+        // Erase elements from look with keys not in lit_look. Where a key is
+        // in both maps, union its reach with the lookaround.
+        auto jt = begin(lit_look);
+        for (auto it = begin(look); it != end(look);) {
+            if (jt == end(lit_look)) {
+                // No further lit_look entries, erase remaining elements from
+                // look.
+                look.erase(it, end(look));
+                break;
+            }
+            if (it->first < jt->offset) {
+                // Offset is present in look but not in lit_look, erase.
+                it = look.erase(it);
+            } else if (it->first > jt->offset) {
+                // Offset is present in lit_look but not in look, ignore.
+                ++jt;
+            } else {
+                // Offset is present in both, union its reach with look.
+                it->second |= jt->reach;
+                ++it;
+                ++jt;
             }
         }
     }
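The loop that now ends the hunk above is a linear merge-intersection over two sequences sorted by key: entries missing from either side are dropped, and shared keys have their reaches unioned. The same walk in a self-contained form, with std::map standing in for the lookaround map and a sorted std::vector for the literal reach (hypothetical names; assumes keys is sorted ascending):

#include <map>
#include <vector>

void intersectKeys(std::map<int, int> &m, const std::vector<int> &keys) {
    auto jt = keys.begin();
    for (auto it = m.begin(); it != m.end();) {
        if (jt == keys.end()) {
            m.erase(it, m.end()); // nothing left on the other side
            break;
        }
        if (it->first < *jt) {
            it = m.erase(it);     // key only in the map: drop it
        } else if (it->first > *jt) {
            ++jt;                 // key only in the vector: skip it
        } else {
            ++it;                 // key in both: keep (union reach here)
            ++jt;
        }
    }
}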
@@ -525,6 +554,76 @@ void trimLiterals(const RoseBuildImpl &build, const RoseVertex v,
     DEBUG_PRINTF("post-trim lookaround: %s\n", dump(look).c_str());
 }
 
+static
+void normaliseLeftfix(map<s32, CharReach> &look) {
+    // We can erase entries where the reach is "all characters", except for the
+    // very first one -- this might be required to establish a minimum bound on
+    // the literal's match offset.
+
+    // TODO: It would be cleaner to use a literal program instruction to check
+    // the minimum bound explicitly.
+
+    if (look.empty()) {
+        return;
+    }
+
+    const auto earliest = begin(look)->first;
+
+    vector<s32> dead;
+    for (const auto &m : look) {
+        if (m.second.all() && m.first != earliest) {
+            dead.push_back(m.first);
+        }
+    }
+    erase_all(&look, dead);
+}
+
+static
+bool trimMultipathLeftfix(const RoseBuildImpl &build, const RoseVertex v,
+                          vector<map<s32, CharReach>> &looks) {
+    size_t path_count = 0;
+    for (auto &look : looks) {
+        ++path_count;
+        DEBUG_PRINTF("Path #%ld\n", path_count);
+
+        assert(!look.empty());
+        trimLiterals(build, v, look);
+
+        if (look.empty()) {
+            return false;
+        }
+
+        // Could be optimized here, just keep the empty byte of the longest path
+        normaliseLeftfix(look);
+
+        if (look.size() > MAX_LOOKAROUND_ENTRIES) {
+            DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size());
+            return false;
+        }
+    }
+    return true;
+}
+
+static
+void transToLookaround(const vector<map<s32, CharReach>> &looks,
+                       vector<vector<LookEntry>> &lookarounds) {
+    for (const auto &look : looks) {
+        vector<LookEntry> lookaround;
+        DEBUG_PRINTF("lookaround: %s\n", dump(look).c_str());
+        lookaround.reserve(look.size());
+        for (const auto &m : look) {
+            if (m.first < -128 || m.first > 127) {
+                DEBUG_PRINTF("range too big\n");
+                lookarounds.clear();
+                return;
+            }
+            s8 offset = verify_s8(m.first);
+            lookaround.emplace_back(offset, m.second);
+        }
+        lookarounds.push_back(lookaround);
+    }
+}
+
 void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
                          vector<LookEntry> &lookaround) {
     lookaround.clear();
@@ -563,115 +662,155 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v,
 }
 
 static
-bool hasSingleFloatingStart(const NGHolder &g) {
-    NFAVertex initial = NGHolder::null_vertex();
-    for (auto v : adjacent_vertices_range(g.startDs, g)) {
-        if (v == g.startDs) {
-            continue;
-        }
-        if (initial != NGHolder::null_vertex()) {
-            DEBUG_PRINTF("more than one start\n");
-            return false;
-        }
-        initial = v;
-    }
-
-    if (initial == NGHolder::null_vertex()) {
-        DEBUG_PRINTF("no floating starts\n");
-        return false;
-    }
+bool checkShuftiBuckets(const vector<map<s32, CharReach>> &looks,
+                        u32 bucket_size) {
+    set<u32> bucket;
+    for (const auto &look : looks) {
+        for (const auto &l : look) {
+            CharReach cr = l.second;
+            if (cr.count() > 128) {
+                cr.flip();
+            }
+            map<u16, u16> lo2hi;
+
+            for (size_t i = cr.find_first(); i != CharReach::npos;) {
+                u8 it_hi = i >> 4;
+                u16 low_encode = 0;
+                while (i != CharReach::npos && (i >> 4) == it_hi) {
+                    low_encode |= 1 << (i & 0xf);
+                    i = cr.find_next(i);
+                }
+                lo2hi[low_encode] |= 1 << it_hi;
+            }
 
-    // Anchored start must have no successors other than startDs and initial.
-    for (auto v : adjacent_vertices_range(g.start, g)) {
-        if (v != initial && v != g.startDs) {
-            DEBUG_PRINTF("anchored start\n");
-            return false;
+            for (const auto &it : lo2hi) {
+                u32 hi_lo = (it.second << 16) | it.first;
+                bucket.insert(hi_lo);
+            }
         }
     }
-
-    return true;
+    DEBUG_PRINTF("shufti has %lu bucket(s)\n", bucket.size());
+    return bucket.size() <= bucket_size;
 }
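checkShuftiBuckets above estimates how many shufti buckets a multi-path lookaround would need: shufti classifies a byte by its two 4-bit nibbles, so each distinct (low-nibble set, high-nibble mask) pair costs a bucket. A per-class sketch of the same decomposition, with std::bitset standing in for CharReach and the complement trick (flipping classes wider than 128 bits) omitted; names are illustrative, not from the patch:

#include <bitset>
#include <cstddef>
#include <cstdint>
#include <map>

std::size_t countShuftiBuckets(const std::bitset<256> &cr) {
    // low-nibble set -> mask of high nibbles that admit exactly that set
    std::map<uint16_t, uint16_t> lo2hi;
    for (unsigned hi = 0; hi < 16; hi++) {
        uint16_t low_encode = 0;
        for (unsigned lo = 0; lo < 16; lo++) {
            if (cr.test((hi << 4) | lo)) {
                low_encode |= uint16_t(1u << lo);
            }
        }
        if (low_encode) {
            lo2hi[low_encode] |= uint16_t(1u << hi);
        }
    }
    // High nibbles sharing an identical low-nibble set can share a bucket,
    // so the number of distinct map entries is this class's bucket count.
    return lo2hi.size();
}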
 
 static
-bool getTransientPrefixReach(const NGHolder &g, u32 lag,
-                             map<s32, CharReach> &look) {
-    if (in_degree(g.accept, g) != 1) {
-        DEBUG_PRINTF("more than one accept\n");
+bool getTransientPrefixReach(const NGHolder &g, ReportID report, u32 lag,
+                             vector<map<s32, CharReach>> &looks) {
+    if (!isAcyclic(g)) {
+        DEBUG_PRINTF("contains back-edge\n");
         return false;
     }
 
-    // Must be a floating chain wired to startDs.
-    if (!hasSingleFloatingStart(g)) {
-        DEBUG_PRINTF("not a single floating start\n");
+    // Must be floating chains wired to startDs.
+    if (!isFloating(g)) {
+        DEBUG_PRINTF("not a floating start\n");
         return false;
     }
 
-    NFAVertex v = *(inv_adjacent_vertices(g.accept, g).first);
-    u32 i = lag + 1;
-    while (v != g.startDs) {
-        DEBUG_PRINTF("i=%u, v=%zu\n", i, g[v].index);
-        if (is_special(v, g)) {
-            DEBUG_PRINTF("special\n");
-            return false;
+    vector<NFAVertex> curr;
+    for (auto v : inv_adjacent_vertices_range(g.accept, g)) {
+        if (v == g.start || v == g.startDs) {
+            DEBUG_PRINTF("empty graph\n");
+            return true;
+        }
+        if (contains(g[v].reports, report)) {
+            curr.push_back(v);
         }
+    }
 
-        look[0 - i] = g[v].char_reach;
+    assert(!curr.empty());
 
-        NFAVertex next = NGHolder::null_vertex();
-        for (auto u : inv_adjacent_vertices_range(v, g)) {
-            if (u == g.start) {
-                continue; // Benign, checked by hasSingleFloatingStart
-            }
-            if (next == NGHolder::null_vertex()) {
-                next = u;
-                continue;
-            }
-            DEBUG_PRINTF("branch\n");
-            return false;
-        }
+    u32 total_len = curr.size();
+
+    for (const auto &v : curr) {
+        looks.emplace_back(map<s32, CharReach>());
+        looks.back()[0 - (lag + 1)] = g[v].char_reach;
+    }
 
-        if (next == NGHolder::null_vertex() || next == v) {
-            DEBUG_PRINTF("no predecessor or only self-loop\n");
-            // This graph is malformed -- all vertices in a graph that makes it
-            // to this analysis should have predecessors.
-            assert(0);
+    bool curr_active = false;
+
+    /* For each offset -i, we backwardly trace the path by vertices in curr.
+     * Once there are more than 8 paths and more than 64 bits total_len,
+     * which means that neither MULTIPATH_LOOKAROUND nor MULTIPATH_SHUFTI
+     * could be successfully built, we will give up the path finding.
+     * Otherwise, the loop will halt when all vertices in curr are startDs.
+     */
+    for (u32 i = lag + 2; i < (lag + 2) + MAX_BACK_LEN; i++) {
+        curr_active = false;
+        size_t curr_size = curr.size();
+        if (curr.size() > 1 && i > lag + MULTIPATH_MAX_LEN) {
+            DEBUG_PRINTF("range is larger than 16 in multi-path\n");
             return false;
         }
-        v = next;
-        i++;
-    }
+        for (size_t idx = 0; idx < curr_size; idx++) {
+            NFAVertex v = curr[idx];
+            if (v == g.startDs) {
+                continue;
+            }
+            assert(!is_special(v, g));
 
-    DEBUG_PRINTF("done\n");
-    return true;
-}
+            for (auto u : inv_adjacent_vertices_range(v, g)) {
+                if (u == g.start || u == g.startDs) {
+                    curr[idx] = g.startDs;
+                    break;
+                }
+            }
 
-static
-void normaliseLeftfix(map<s32, CharReach> &look) {
-    // We can erase entries where the reach is "all characters", except for the
-    // very first one -- this might be required to establish a minimum bound on
-    // the literal's match offset.
+ if (is_special(curr[idx], g)) { + continue; + } - // TODO: It would be cleaner to use a literal program instruction to check - // the minimum bound explicitly. + for (auto u : inv_adjacent_vertices_range(v, g)) { + curr_active = true; + if (curr[idx] == v) { + curr[idx] = u; + looks[idx][0 - i] = g[u].char_reach; + total_len++; + } else { + curr.push_back(u); + looks.push_back(looks[idx]); + (looks.back())[0 - i] = g[u].char_reach; + total_len += looks.back().size(); + } - if (look.empty()) { - return; + if (curr.size() > MAX_LOOKAROUND_PATHS && total_len > 64) { + DEBUG_PRINTF("too many branches\n"); + return false; + } + } + } + if (!curr_active) { + break; + } } - const auto earliest = begin(look)->first; + if (curr_active) { + DEBUG_PRINTF("single path too long\n"); + return false; + } - vector dead; - for (const auto &m : look) { - if (m.second.all() && m.first != earliest) { - dead.push_back(m.first); + // More than 8 paths, check multi-path shufti. + if (curr.size() > MAX_LOOKAROUND_PATHS) { + u32 bucket_size = total_len > 32 ? 8 : 16; + if (!checkShuftiBuckets(looks, bucket_size)) { + DEBUG_PRINTF("shufti has too many buckets\n"); + return false; } } - erase_all(&look, dead); + + assert(!looks.empty()); + if (looks.size() == 1) { + DEBUG_PRINTF("single lookaround\n"); + } else { + DEBUG_PRINTF("multi-path lookaround\n"); + } + DEBUG_PRINTF("done\n"); + return true; } bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v, - vector &lookaround) { + vector> &lookaround) { lookaround.clear(); const RoseGraph &g = build.g; @@ -687,36 +826,19 @@ bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v, return false; } - map look; - if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.lag, look)) { - DEBUG_PRINTF("not a chain\n"); - return false; - } - - trimLiterals(build, v, look); - normaliseLeftfix(look); - - if (look.size() > MAX_LOOKAROUND_ENTRIES) { - DEBUG_PRINTF("lookaround too big (%zu entries)\n", look.size()); + vector> looks; + if (!getTransientPrefixReach(*leftfix.graph(), g[v].left.leftfix_report, + g[v].left.lag, looks)) { + DEBUG_PRINTF("graph has loop or too large\n"); return false; } - if (look.empty()) { - DEBUG_PRINTF("lookaround empty; this is weird\n"); + if (!trimMultipathLeftfix(build, v, looks)) { return false; } + transToLookaround(looks, lookaround); - lookaround.reserve(look.size()); - for (const auto &m : look) { - if (m.first < -128 || m.first > 127) { - DEBUG_PRINTF("range too big\n"); - return false; - } - s8 offset = verify_s8(m.first); - lookaround.emplace_back(offset, m.second); - } - - return true; + return !lookaround.empty(); } void mergeLookaround(vector &lookaround, diff --git a/src/rose/rose_build_lookaround.h b/src/rose/rose_build_lookaround.h index 993bd2291..aea87ccf8 100644 --- a/src/rose/rose_build_lookaround.h +++ b/src/rose/rose_build_lookaround.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,6 +36,9 @@ #include +/** \brief Max path number for multi-path lookaround. */ +#define MAX_LOOKAROUND_PATHS 8 + namespace ue2 { class CharReach; @@ -44,6 +47,7 @@ class RoseBuildImpl; /** \brief Lookaround entry prototype, describing the reachability at a given * distance from the end of a role match. 
*/ struct LookEntry { + LookEntry() : offset(0) {} LookEntry(s8 offset_in, const CharReach &reach_in) : offset(offset_in), reach(reach_in) {} s8 offset; //!< offset from role match location. @@ -63,7 +67,7 @@ size_t hash_value(const LookEntry &l) { } void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v, - std::vector &lookaround); + std::vector &look_more); /** * \brief If possible, render the prefix of the given vertex as a lookaround. @@ -72,7 +76,7 @@ void findLookaroundMasks(const RoseBuildImpl &tbi, const RoseVertex v, * it can be satisfied with a lookaround alone. */ bool makeLeftfixLookaround(const RoseBuildImpl &build, const RoseVertex v, - std::vector &lookaround); + std::vector> &lookaround); void mergeLookaround(std::vector &lookaround, const std::vector &more_lookaround); diff --git a/src/rose/rose_build_matchers.cpp b/src/rose/rose_build_matchers.cpp index 01633c06c..682a87c38 100644 --- a/src/rose/rose_build_matchers.cpp +++ b/src/rose/rose_build_matchers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,9 +33,12 @@ #include "rose_build_matchers.h" +#include "rose_build_dump.h" #include "rose_build_impl.h" +#include "rose_build_lit_accel.h" #include "rose_build_width.h" #include "hwlm/hwlm_build.h" +#include "hwlm/hwlm_internal.h" #include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" #include "nfa/nfa_api_queue.h" @@ -58,6 +61,8 @@ using boost::adaptors::map_values; namespace ue2 { +static const size_t MAX_ACCEL_STRING_LEN = 16; + #ifdef DEBUG static UNUSED string dumpMask(const vector &v) { @@ -206,7 +211,7 @@ bool maskFromPreds(const RoseBuildImpl &build, const rose_literal_id &id, } u32 u_lit_id = *(g[u].literals.begin()); - const rose_literal_id &u_id = build.literals.right.at(u_lit_id); + const rose_literal_id &u_id = build.literals.at(u_lit_id); DEBUG_PRINTF("u has lit: %s\n", escapeString(u_id.s).c_str()); // Number of characters to take from the back of u's literal. @@ -341,12 +346,8 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { } vector candidates; - for (const auto &e : build.literals.right) { - const u32 id = e.first; - const auto &lit = e.second; - - // This pass takes place before final IDs are assigned to literals. - assert(!build.hasFinalId(id)); + for (u32 id = 0; id < build.literals.size(); id++) { + const auto &lit = build.literals.at(id); if (lit.delay || build.isDelayed(id)) { continue; @@ -375,7 +376,7 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { } for (const u32 &id : candidates) { - const auto &lit = build.literals.right.at(id); + const auto &lit = build.literals.at(id); auto &lit_info = build.literal_info.at(id); vector msk, cmp; @@ -404,7 +405,6 @@ void findMoreLiteralMasks(RoseBuildImpl &build) { lit_info.vertices.clear(); // Preserve other properties. 
- new_info.requires_explode = lit_info.requires_explode; new_info.requires_benefits = lit_info.requires_benefits; } } @@ -491,8 +491,14 @@ bool isNoRunsLiteral(const RoseBuildImpl &build, const u32 id, return false; } - if (build.literals.right.at(id).s.length() > max_len) { - DEBUG_PRINTF("requires literal check\n"); + size_t len = build.literals.at(id).s.length(); + if (len > max_len) { + DEBUG_PRINTF("long literal, requires confirm\n"); + return false; + } + + if (len > ROSE_SHORT_LITERAL_LEN_MAX) { + DEBUG_PRINTF("medium-length literal, requires confirm\n"); return false; } @@ -610,7 +616,7 @@ u64a literalMinReportOffset(const RoseBuildImpl &build, // If this literal in the undelayed literal corresponding to some delayed // literals, we must take their minimum offsets into account. for (const u32 &delayed_id : info.delayed_ids) { - const auto &delayed_lit = build.literals.right.at(delayed_id); + const auto &delayed_lit = build.literals.at(delayed_id); const auto &delayed_info = build.literal_info.at(delayed_id); u64a delayed_min_offset = literalMinReportOffset(build, delayed_lit, delayed_info); @@ -626,159 +632,245 @@ u64a literalMinReportOffset(const RoseBuildImpl &build, return lit_min_offset; } -vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table, - size_t max_len, u32 max_offset) { +template +void trim_to_suffix(Container &c, size_t len) { + if (c.size() <= len) { + return; + } + + size_t suffix_len = c.size() - len; + c.erase(c.begin(), c.begin() + suffix_len); +} + +namespace { + +/** \brief Prototype for literal matcher construction. */ +struct MatcherProto { + /** \brief Literal fragments used to construct the literal matcher. */ vector lits; - for (const auto &e : build.literals.right) { - const u32 id = e.first; - if (!build.hasFinalId(id)) { - continue; - } + /** \brief Longer literals used for acceleration analysis. */ + vector accel_lits; - if (e.second.delay) { - continue; /* delay id's are virtual-ish */ - } + /** \brief The history required by the literal matcher. */ + size_t history_required = 0; - if (e.second.table != table) { - continue; /* wrong table */ - } + /** \brief Insert the contents of another MatcherProto. */ + void insert(const MatcherProto &a); +}; +} + +/** + * \brief Build up a vector of literals (and associated other data) for the + * given table. + * + * If max_offset is specified (and not ROSE_BOUND_INF), then literals that can + * only lead to a pattern match after max_offset may be excluded. 
+ */ +static +MatcherProto makeMatcherProto(const RoseBuildImpl &build, + const vector &fragments, + rose_literal_table table, bool delay_rebuild, + size_t max_len, u32 max_offset = ROSE_BOUND_INF) { + MatcherProto mp; - assert(id < build.literal_info.size()); - const rose_literal_info &info = build.literal_info[id]; - u32 final_id = info.final_id; - rose_group groups = info.group_mask; - /* Note: requires_benefits are handled in the literal entries */ - const ue2_literal &lit = e.second.s; + if (delay_rebuild) { + assert(table == ROSE_FLOATING); + assert(build.cc.streaming); + } - DEBUG_PRINTF("lit='%s'\n", escapeString(lit).c_str()); + for (const auto &f : fragments) { + for (u32 id : f.lit_ids) { + const rose_literal_id &lit = build.literals.at(id); - if (max_offset != ROSE_BOUND_INF) { - u64a min_report = literalMinReportOffset(build, e.second, info); - if (min_report > max_offset) { - DEBUG_PRINTF("min report offset=%llu exceeds max_offset=%u\n", - min_report, max_offset); - continue; + if (lit.table != table) { + continue; /* wrong table */ } - } - - const vector &msk = e.second.msk; - const vector &cmp = e.second.cmp; - bool noruns = isNoRunsLiteral(build, id, info, max_len); + if (lit.delay) { + continue; /* delay id's are virtual-ish */ + } - if (info.requires_explode) { - DEBUG_PRINTF("exploding lit\n"); + assert(id < build.literal_info.size()); + const auto &info = build.literal_info.at(id); - // We do not require_explode for long literals. - assert(lit.length() <= max_len); + /* Note: requires_benefits are handled in the literal entries */ + const ue2_literal &s = lit.s; - case_iter cit = caseIterateBegin(lit); - case_iter cite = caseIterateEnd(); - for (; cit != cite; ++cit) { - string s = *cit; - bool nocase = false; + DEBUG_PRINTF("lit='%s' (len %zu)\n", escapeString(s).c_str(), + s.length()); - DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d msk=%s, " - "cmp=%s (exploded)\n", - final_id, escapeString(s).c_str(), nocase, noruns, - dumpMask(msk).c_str(), dumpMask(cmp).c_str()); + // When building the delay rebuild table, we only want to include + // literals that have delayed variants. + if (delay_rebuild && info.delayed_ids.empty()) { + DEBUG_PRINTF("not needed for delay rebuild\n"); + continue; + } - if (!maskIsConsistent(s, nocase, msk, cmp)) { - DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); + if (max_offset != ROSE_BOUND_INF) { + u64a min_report = literalMinReportOffset(build, lit, info); + if (min_report > max_offset) { + DEBUG_PRINTF("min report offset=%llu exceeds " + "max_offset=%u\n", min_report, max_offset); continue; } + } + + const vector &msk = lit.msk; + const vector &cmp = lit.cmp; + bool noruns = isNoRunsLiteral(build, id, info, max_len); - lits.emplace_back(move(s), nocase, noruns, final_id, groups, - msk, cmp); + size_t lit_hist_len = 0; + if (build.cc.streaming) { + lit_hist_len = max(msk.size(), min(s.length(), max_len)); + lit_hist_len = lit_hist_len ? 
lit_hist_len - 1 : 0; } - } else { - string s = lit.get_string(); - bool nocase = lit.any_nocase(); + DEBUG_PRINTF("lit requires %zu bytes of history\n", lit_hist_len); + assert(lit_hist_len <= build.cc.grey.maxHistoryAvailable); - DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, " - "cmp=%s\n", - final_id, escapeString(s).c_str(), (int)nocase, noruns, - dumpMask(msk).c_str(), dumpMask(cmp).c_str()); + auto lit_final = s; // copy - if (s.length() > max_len) { - DEBUG_PRINTF("truncating to tail of length %zu\n", max_len); - s.erase(0, s.length() - max_len); + if (lit_final.length() > ROSE_SHORT_LITERAL_LEN_MAX) { + DEBUG_PRINTF("truncating to tail of length %zu\n", + size_t{ROSE_SHORT_LITERAL_LEN_MAX}); + lit_final.erase(0, lit_final.length() + - ROSE_SHORT_LITERAL_LEN_MAX); // We shouldn't have set a threshold below 8 chars. - assert(msk.size() <= max_len); + assert(msk.size() <= ROSE_SHORT_LITERAL_LEN_MAX); + assert(!noruns); } - if (!maskIsConsistent(s, nocase, msk, cmp)) { + const auto &s_final = lit_final.get_string(); + bool nocase = lit_final.any_nocase(); + + DEBUG_PRINTF("id=%u, s='%s', nocase=%d, noruns=%d, msk=%s, " + "cmp=%s\n", f.fragment_id, + escapeString(s_final).c_str(), (int)nocase, noruns, + dumpMask(msk).c_str(), dumpMask(cmp).c_str()); + + if (!maskIsConsistent(s_final, nocase, msk, cmp)) { DEBUG_PRINTF("msk/cmp for literal can't match, skipping\n"); continue; } - lits.emplace_back(move(s), nocase, noruns, final_id, groups, msk, - cmp); + mp.accel_lits.emplace_back(s.get_string(), s.any_nocase(), msk, cmp, + info.group_mask); + mp.history_required = max(mp.history_required, lit_hist_len); + + u32 prog_offset = delay_rebuild ? f.delay_program_offset + : f.lit_program_offset; + const auto &groups = f.groups; + + mp.lits.emplace_back(move(s_final), nocase, noruns, prog_offset, + groups, msk, cmp); } } - return lits; + sort_and_unique(mp.lits); + + // Literals used for acceleration must be limited to max_len, as that's all + // we can see in history. 
+ for_each(begin(mp.accel_lits), end(mp.accel_lits), + [&max_len](AccelString &a) { + trim_to_suffix(a.s, max_len); + trim_to_suffix(a.msk, max_len); + trim_to_suffix(a.cmp, max_len); + }); + + sort_and_unique(mp.accel_lits); + + return mp; +} + +void MatcherProto::insert(const MatcherProto &a) { + ::ue2::insert(&lits, lits.end(), a.lits); + ::ue2::insert(&accel_lits, accel_lits.end(), a.accel_lits); + sort_and_unique(lits); + sort_and_unique(accel_lits); + history_required = max(history_required, a.history_required); +} + +static +void buildAccel(const RoseBuildImpl &build, const MatcherProto &mp, + HWLM &hwlm) { + if (!build.cc.grey.hamsterAccelForward) { + return; + } + + if (hwlm.type == HWLM_ENGINE_NOOD) { + return; + } + + buildForwardAccel(&hwlm, mp.accel_lits, build.getInitialGroups()); } -aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, - size_t longLitLengthThreshold, - rose_group *fgroups, - size_t *fsize, - size_t *historyRequired) { - *fsize = 0; +bytecode_ptr buildFloatingMatcher(const RoseBuildImpl &build, + const vector &fragments, + size_t longLitLengthThreshold, + rose_group *fgroups, + size_t *historyRequired) { *fgroups = 0; - auto fl = fillHamsterLiteralList(build, ROSE_FLOATING, - longLitLengthThreshold); - if (fl.empty()) { + auto mp = makeMatcherProto(build, fragments, ROSE_FLOATING, false, + longLitLengthThreshold); + if (mp.lits.empty()) { DEBUG_PRINTF("empty floating matcher\n"); return nullptr; } + dumpMatcherLiterals(mp.lits, "floating", build.cc.grey); - for (const hwlmLiteral &hlit : fl) { - *fgroups |= hlit.groups; + for (const hwlmLiteral &lit : mp.lits) { + *fgroups |= lit.groups; } - hwlmStreamingControl ctl; - hwlmStreamingControl *ctlp; - if (build.cc.streaming) { - ctl.history_max = build.cc.grey.maxHistoryAvailable; - ctl.history_min = MAX(*historyRequired, - build.cc.grey.minHistoryAvailable); - DEBUG_PRINTF("streaming control, history max=%zu, min=%zu\n", - ctl.history_max, ctl.history_min); - ctlp = &ctl; - } else { - ctlp = nullptr; // Null for non-streaming. 
- } - - aligned_unique_ptr ftable = - hwlmBuild(fl, ctlp, false, build.cc, build.getInitialGroups()); - if (!ftable) { + auto hwlm = hwlmBuild(mp.lits, false, build.cc, build.getInitialGroups()); + if (!hwlm) { throw CompileError("Unable to generate bytecode."); } + buildAccel(build, mp, *hwlm); + if (build.cc.streaming) { - DEBUG_PRINTF("literal_history_required=%zu\n", - ctl.literal_history_required); - assert(ctl.literal_history_required <= - build.cc.grey.maxHistoryAvailable); - *historyRequired = max(*historyRequired, - ctl.literal_history_required); - } - - *fsize = hwlmSize(ftable.get()); - assert(*fsize); - DEBUG_PRINTF("built floating literal table size %zu bytes\n", *fsize); - return ftable; + DEBUG_PRINTF("history_required=%zu\n", mp.history_required); + assert(mp.history_required <= build.cc.grey.maxHistoryAvailable); + *historyRequired = max(*historyRequired, mp.history_required); + } + + DEBUG_PRINTF("built floating literal table size %zu bytes\n", hwlm.size()); + return hwlm; } -aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, - size_t *sbsize) { - *sbsize = 0; +bytecode_ptr +buildDelayRebuildMatcher(const RoseBuildImpl &build, + const vector &fragments, + size_t longLitLengthThreshold) { + if (!build.cc.streaming) { + DEBUG_PRINTF("not streaming\n"); + return nullptr; + } + + auto mp = makeMatcherProto(build, fragments, ROSE_FLOATING, true, + longLitLengthThreshold); + if (mp.lits.empty()) { + DEBUG_PRINTF("empty delay rebuild matcher\n"); + return nullptr; + } + dumpMatcherLiterals(mp.lits, "delay_rebuild", build.cc.grey); + + auto hwlm = hwlmBuild(mp.lits, false, build.cc, build.getInitialGroups()); + if (!hwlm) { + throw CompileError("Unable to generate bytecode."); + } + buildAccel(build, mp, *hwlm); + + DEBUG_PRINTF("built delay rebuild table size %zu bytes\n", hwlm.size()); + return hwlm; +} + +bytecode_ptr +buildSmallBlockMatcher(const RoseBuildImpl &build, + const vector &fragments) { if (build.cc.streaming) { DEBUG_PRINTF("streaming mode\n"); return nullptr; @@ -791,74 +883,75 @@ aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, return nullptr; } - auto lits = fillHamsterLiteralList( - build, ROSE_FLOATING, ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); - if (lits.empty()) { + auto mp = makeMatcherProto(build, fragments, ROSE_FLOATING, false, + ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); + if (mp.lits.empty()) { DEBUG_PRINTF("no floating table\n"); return nullptr; - } else if (lits.size() == 1) { + } else if (mp.lits.size() == 1) { DEBUG_PRINTF("single floating literal, noodle will be fast enough\n"); return nullptr; } - auto anchored_lits = - fillHamsterLiteralList(build, ROSE_ANCHORED_SMALL_BLOCK, - ROSE_SMALL_BLOCK_LEN, ROSE_SMALL_BLOCK_LEN); - if (anchored_lits.empty()) { + auto mp_anchored = makeMatcherProto(build, fragments, + ROSE_ANCHORED_SMALL_BLOCK, false, + ROSE_SMALL_BLOCK_LEN, + ROSE_SMALL_BLOCK_LEN); + if (mp_anchored.lits.empty()) { DEBUG_PRINTF("no small-block anchored literals\n"); return nullptr; } - lits.insert(lits.end(), anchored_lits.begin(), anchored_lits.end()); + mp.insert(mp_anchored); + dumpMatcherLiterals(mp.lits, "smallblock", build.cc.grey); // None of our literals should be longer than the small block limit. 
- assert(all_of(begin(lits), end(lits), [](const hwlmLiteral &lit) { + assert(all_of(begin(mp.lits), end(mp.lits), [](const hwlmLiteral &lit) { return lit.s.length() <= ROSE_SMALL_BLOCK_LEN; })); - if (lits.empty()) { + if (mp.lits.empty()) { DEBUG_PRINTF("no literals shorter than small block len\n"); return nullptr; } - aligned_unique_ptr hwlm = - hwlmBuild(lits, nullptr, true, build.cc, build.getInitialGroups()); + auto hwlm = hwlmBuild(mp.lits, true, build.cc, build.getInitialGroups()); if (!hwlm) { throw CompileError("Unable to generate bytecode."); } - *sbsize = hwlmSize(hwlm.get()); - assert(*sbsize); - DEBUG_PRINTF("built small block literal table size %zu bytes\n", *sbsize); + buildAccel(build, mp, *hwlm); + + DEBUG_PRINTF("built small block literal table size %zu bytes\n", + hwlm.size()); return hwlm; } -aligned_unique_ptr buildEodAnchoredMatcher(const RoseBuildImpl &build, - size_t *esize) { - *esize = 0; +bytecode_ptr +buildEodAnchoredMatcher(const RoseBuildImpl &build, + const vector &fragments) { + auto mp = makeMatcherProto(build, fragments, ROSE_EOD_ANCHORED, false, + build.ematcher_region_size); - auto el = fillHamsterLiteralList(build, ROSE_EOD_ANCHORED, - build.ematcher_region_size); - - if (el.empty()) { + if (mp.lits.empty()) { DEBUG_PRINTF("no eod anchored literals\n"); assert(!build.ematcher_region_size); return nullptr; } + dumpMatcherLiterals(mp.lits, "eod", build.cc.grey); assert(build.ematcher_region_size); - hwlmStreamingControl *ctlp = nullptr; // not a streaming case - aligned_unique_ptr etable = - hwlmBuild(el, ctlp, true, build.cc, build.getInitialGroups()); - if (!etable) { + auto hwlm = hwlmBuild(mp.lits, true, build.cc, build.getInitialGroups()); + if (!hwlm) { throw CompileError("Unable to generate bytecode."); } - *esize = hwlmSize(etable.get()); - assert(*esize); - DEBUG_PRINTF("built eod-anchored literal table size %zu bytes\n", *esize); - return etable; + buildAccel(build, mp, *hwlm); + + DEBUG_PRINTF("built eod-anchored literal table size %zu bytes\n", + hwlm.size()); + return hwlm; } } // namespace ue2 diff --git a/src/rose/rose_build_matchers.h b/src/rose/rose_build_matchers.h index a25dbca39..2b1afc8c6 100644 --- a/src/rose/rose_build_matchers.h +++ b/src/rose/rose_build_matchers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,36 +35,47 @@ #define ROSE_BUILD_MATCHERS_H #include "rose_build_impl.h" +#include "util/bytecode_ptr.h" #include +struct Grey; struct HWLM; namespace ue2 { -struct hwlmLiteral; +struct LitFragment { + LitFragment(u32 fragment_id_in, rose_group groups_in, u32 lit_id) + : fragment_id(fragment_id_in), groups(groups_in), lit_ids({lit_id}) {} + LitFragment(u32 fragment_id_in, rose_group groups_in, + std::vector lit_ids_in) + : fragment_id(fragment_id_in), groups(groups_in), + lit_ids(std::move(lit_ids_in)) {} + u32 fragment_id; + rose_group groups; + std::vector lit_ids; + u32 lit_program_offset = ROSE_INVALID_PROG_OFFSET; + u32 delay_program_offset = ROSE_INVALID_PROG_OFFSET; +}; -/** - * \brief Build up a vector of literals for the given table. - * - * If max_offset is specified (and not ROSE_BOUND_INF), then literals that can - * only lead to a pattern match after max_offset may be excluded. 
- */ -std::vector fillHamsterLiteralList(const RoseBuildImpl &build, - rose_literal_table table, size_t max_len, - u32 max_offset = ROSE_BOUND_INF); +bytecode_ptr +buildFloatingMatcher(const RoseBuildImpl &build, + const std::vector &fragments, + size_t longLitLengthThreshold, rose_group *fgroups, + size_t *historyRequired); -aligned_unique_ptr buildFloatingMatcher(const RoseBuildImpl &build, - size_t longLitLengthThreshold, - rose_group *fgroups, - size_t *fsize, - size_t *historyRequired); +bytecode_ptr +buildDelayRebuildMatcher(const RoseBuildImpl &build, + const std::vector &fragments, + size_t longLitLengthThreshold); -aligned_unique_ptr buildSmallBlockMatcher(const RoseBuildImpl &build, - size_t *sbsize); +bytecode_ptr +buildSmallBlockMatcher(const RoseBuildImpl &build, + const std::vector &fragments); -aligned_unique_ptr buildEodAnchoredMatcher(const RoseBuildImpl &build, - size_t *esize); +bytecode_ptr +buildEodAnchoredMatcher(const RoseBuildImpl &build, + const std::vector &fragments); void findMoreLiteralMasks(RoseBuildImpl &build); diff --git a/src/rose/rose_build_merge.cpp b/src/rose/rose_build_merge.cpp index 54a7390ea..d638e589e 100644 --- a/src/rose/rose_build_merge.cpp +++ b/src/rose/rose_build_merge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1054,14 +1054,14 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi, RoseVertex u, vector> ulits; ulits.reserve(tbi.g[u].literals.size()); for (u32 id : tbi.g[u].literals) { - ulits.push_back(make_pair(&tbi.literals.right.at(id), ulag)); + ulits.emplace_back(&tbi.literals.at(id), ulag); } u32 vlag = tbi.g[v].left.lag; vector> vlits; vlits.reserve(tbi.g[v].literals.size()); for (u32 id : tbi.g[v].literals) { - vlits.push_back(make_pair(&tbi.literals.right.at(id), vlag)); + vlits.emplace_back(&tbi.literals.at(id), vlag); } if (!compatibleLiteralsForMerge(ulits, vlits)) { @@ -1130,7 +1130,7 @@ bool checkPredDelays(const RoseBuildImpl &tbi, const deque &v1, vector pred_rose_lits; pred_rose_lits.reserve(pred_lits.size()); for (const auto &p : pred_lits) { - pred_rose_lits.push_back(&tbi.literals.right.at(p)); + pred_rose_lits.push_back(&tbi.literals.at(p)); } for (auto v : v2) { @@ -1140,7 +1140,7 @@ bool checkPredDelays(const RoseBuildImpl &tbi, const deque &v1, } for (const u32 vlit : tbi.g[v].literals) { - const rose_literal_id &vl = tbi.literals.right.at(vlit); + const rose_literal_id &vl = tbi.literals.at(vlit); assert(!vl.delay); // this should never have got this far? for (const auto &ul : pred_rose_lits) { assert(!ul->delay); // this should never have got this far? 
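Several hunks above (and more below) replace push_back(make_pair(...)) with emplace_back(...). The behavioural difference is small but worth seeing once: emplace_back forwards its arguments and constructs the element directly in the vector's storage, skipping the temporary pair (illustrative element type, not from the patch):

#include <utility>
#include <vector>

void demo(std::vector<std::pair<const char *, unsigned>> &v) {
    // Builds a temporary pair, then move-constructs it into the vector:
    v.push_back(std::make_pair("lit", 1u));
    // Constructs the pair in place from the arguments, no temporary:
    v.emplace_back("lit", 1u);
}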
@@ -1195,7 +1195,7 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi, u32 ulag = tbi.g[a].left.lag; for (u32 id : tbi.g[a].literals) { - ulits.push_back(make_pair(&tbi.literals.right.at(id), ulag)); + ulits.emplace_back(&tbi.literals.at(id), ulag); } } @@ -1207,7 +1207,7 @@ bool mergeableRoseVertices(const RoseBuildImpl &tbi, u32 vlag = tbi.g[a].left.lag; for (u32 id : tbi.g[a].literals) { - vlits.push_back(make_pair(&tbi.literals.right.at(id), vlag)); + vlits.emplace_back(&tbi.literals.at(id), vlag); } } @@ -1759,7 +1759,7 @@ void replaceTops(NGHolder &h, const map &top_mapping) { DEBUG_PRINTF("vertex %zu has top %u\n", h[v].index, t); new_tops.insert(top_mapping.at(t)); } - h[e].tops = move(new_tops); + h[e].tops = std::move(new_tops); } } @@ -2730,7 +2730,7 @@ u32 allowedSquashDistance(const CharReach &cr, u32 min_width, /* TODO: inspect further back in the pattern */ for (u32 lit_id : g[tv].literals) { - const rose_literal_id &lit = tbi.literals.right.at(lit_id); + const rose_literal_id &lit = tbi.literals.at(lit_id); if (lit.delay) { return 0; /* TODO: better */ } diff --git a/src/rose/rose_build_misc.cpp b/src/rose/rose_build_misc.cpp index 28b885bd5..01be11ef8 100644 --- a/src/rose/rose_build_misc.cpp +++ b/src/rose/rose_build_misc.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,7 +28,7 @@ #include "rose_build_impl.h" -#include "hwlm/hwlm_build.h" +#include "hwlm/hwlm_literal.h" #include "nfa/castlecompile.h" #include "nfa/goughcompile.h" #include "nfa/mcclellancompile_util.h" @@ -75,10 +75,8 @@ RoseBuildImpl::RoseBuildImpl(ReportManager &rm_in, : cc(cc_in), root(add_vertex(g)), anchored_root(add_vertex(g)), - delay_base_id(MO_INVALID_IDX), hasSom(false), group_end(0), - anchored_base_id(MO_INVALID_IDX), ematcher_region_size(0), eod_event_literal_id(MO_INVALID_IDX), max_rose_anchored_floating_overlap(0), @@ -156,14 +154,12 @@ bool isInTable(const RoseBuildImpl &tbi, RoseVertex v, // All literals for a given vertex will be in the same table, so we need // only inspect the first one. - const auto lit_table = tbi.literals.right.at(*lit_ids.begin()).table; + const auto lit_table = tbi.literals.at(*lit_ids.begin()).table; -#ifndef NDEBUG // Verify that all literals for this vertex are in the same table. 
- for (auto lit_id : lit_ids) { - assert(tbi.literals.right.at(lit_id).table == lit_table); - } -#endif + assert(all_of_in(lit_ids, [&](u32 lit_id) { + return tbi.literals.at(lit_id).table == lit_table; + })); return lit_table == table; } @@ -213,7 +209,7 @@ size_t RoseBuildImpl::maxLiteralLen(RoseVertex v) const { size_t maxlen = 0; for (const auto &lit_id : lit_ids) { - maxlen = max(maxlen, literals.right.at(lit_id).elength()); + maxlen = max(maxlen, literals.at(lit_id).elength()); } return maxlen; @@ -226,7 +222,7 @@ size_t RoseBuildImpl::minLiteralLen(RoseVertex v) const { size_t minlen = ROSE_BOUND_INF; for (const auto &lit_id : lit_ids) { - minlen = min(minlen, literals.right.at(lit_id).elength()); + minlen = min(minlen, literals.at(lit_id).elength()); } return minlen; @@ -241,11 +237,6 @@ unique_ptr makeRoseBuilder(ReportManager &rm, return ue2::make_unique(rm, ssm, smwr, cc, boundary); } -size_t roseSize(const RoseEngine *t) { - assert(t); - return t->size; -} - bool roseIsPureLiteral(const RoseEngine *t) { return t->runtimeImpl == ROSE_RUNTIME_PURE_LITERAL; } @@ -294,12 +285,11 @@ size_t maxOverlap(const rose_literal_id &a, const rose_literal_id &b) { static const rose_literal_id &getOverlapLiteral(const RoseBuildImpl &tbi, u32 literal_id) { - map::const_iterator it = - tbi.anchoredLitSuffix.find(literal_id); + auto it = tbi.anchoredLitSuffix.find(literal_id); if (it != tbi.anchoredLitSuffix.end()) { return it->second; } - return tbi.literals.right.at(literal_id); + return tbi.literals.at(literal_id); } ue2_literal findNonOverlappingTail(const set &lits, @@ -375,16 +365,14 @@ u32 RoseBuildImpl::calcSuccMaxBound(RoseVertex u) const { u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, u32 delay, rose_literal_table table) { - DEBUG_PRINTF("getting id for %s\n", dumpString(s).c_str()); + DEBUG_PRINTF("getting id for %s in table %d\n", dumpString(s).c_str(), + table); assert(table != ROSE_ANCHORED); rose_literal_id key(s, table, delay); - u32 numLiterals = verify_u32(literals.left.size()); - RoseLiteralMap::iterator it; - bool inserted; - tie(it, inserted) - = literals.insert(RoseLiteralMap::value_type(key, numLiterals)); - u32 id = it->right; + auto m = literals.insert(key); + u32 id = m.first; + bool inserted = m.second; if (inserted) { literal_info.push_back(rose_literal_info()); @@ -464,19 +452,17 @@ rose_literal_id::rose_literal_id(const ue2_literal &s_in, u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector &msk, const vector &cmp, u32 delay, rose_literal_table table) { - DEBUG_PRINTF("getting id for %s\n", dumpString(s).c_str()); + DEBUG_PRINTF("getting id for %s in table %d\n", dumpString(s).c_str(), + table); assert(table != ROSE_ANCHORED); rose_literal_id key(s, msk, cmp, table, delay); - u32 numLiterals = verify_u32(literals.left.size()); /* ue2_literals are always uppercased if nocase and must have an * alpha char */ - RoseLiteralMap::iterator it; - bool inserted; - tie(it, inserted) = literals.insert( - RoseLiteralMap::value_type(key, numLiterals)); - u32 id = it->right; + auto m = literals.insert(key); + u32 id = m.first; + bool inserted = m.second; if (inserted) { literal_info.push_back(rose_literal_info()); @@ -493,40 +479,14 @@ u32 RoseBuildImpl::getLiteralId(const ue2_literal &s, const vector &msk, return id; } -bool RoseBuildImpl::hasLiteral(const ue2_literal &s, - rose_literal_table table) const { - DEBUG_PRINTF("looking if %s exists\n", dumpString(s).c_str()); - assert(table != ROSE_ANCHORED); - - for (RoseLiteralMap::left_map::const_iterator it - 
= literals.left.lower_bound(rose_literal_id(s, table, 0)); - it != literals.left.end(); ++it) { - if (it->first.table != table || it->first.s != s) { - break; - } - const rose_literal_info &info = literal_info[it->second]; - if (!info.vertices.empty()) { - return true; - } - } - - DEBUG_PRINTF("(used) literal not found\n"); - - return false; -} - u32 RoseBuildImpl::getNewLiteralId() { rose_literal_id key(ue2_literal(), ROSE_ANCHORED, 0); - u32 numLiterals = verify_u32(literals.left.size()); + u32 numLiterals = verify_u32(literals.size()); key.distinctiveness = numLiterals; - RoseLiteralMap::iterator it; - bool inserted; - tie(it, inserted) - = literals.insert(RoseLiteralMap::value_type(key, numLiterals)); - u32 id = it->right; - - assert(inserted); + auto m = literals.insert(key); + assert(m.second); + u32 id = m.first; literal_info.push_back(rose_literal_info()); assert(literal_info.size() == id + 1); @@ -536,350 +496,6 @@ u32 RoseBuildImpl::getNewLiteralId() { return id; } -static -bool requiresDedupe(const NGHolder &h, const ue2::flat_set &reports, - const Grey &grey) { - /* TODO: tighten */ - NFAVertex seen_vert = NGHolder::null_vertex(); - - for (auto v : inv_adjacent_vertices_range(h.accept, h)) { - if (has_intersection(h[v].reports, reports)) { - if (seen_vert != NGHolder::null_vertex()) { - return true; - } - seen_vert = v; - } - } - - for (auto v : inv_adjacent_vertices_range(h.acceptEod, h)) { - if (has_intersection(h[v].reports, reports)) { - if (seen_vert != NGHolder::null_vertex()) { - return true; - } - seen_vert = v; - } - } - - if (seen_vert) { - /* if the reporting vertex is part of of a terminal repeat, the - * construction process may reform the graph splitting it into two - * vertices (pos, cyclic) and hence require dedupe */ - vector repeats; - findRepeats(h, grey.minExtBoundedRepeatSize, &repeats); - for (const auto &repeat : repeats) { - if (find(repeat.vertices.begin(), repeat.vertices.end(), - seen_vert) != repeat.vertices.end()) { - return true; - } - } - } - - return false; -} - -class RoseDedupeAuxImpl : public RoseDedupeAux { -public: - explicit RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in); - bool requiresDedupeSupport( - const ue2::flat_set &reports) const override; - -private: - bool hasSafeMultiReports(const ue2::flat_set &reports) const; - - const RoseBuildImpl &tbi; - map> vert_map; //!< ordinary literals - map> sb_vert_map; //!< small block literals - map> suffix_map; - map> outfix_map; - map> puff_map; -}; - -unique_ptr RoseBuildImpl::generateDedupeAux() const { - return ue2::make_unique(*this); -} - -RoseDedupeAux::~RoseDedupeAux() { -} - -RoseDedupeAuxImpl::RoseDedupeAuxImpl(const RoseBuildImpl &tbi_in) - : tbi(tbi_in) { - const RoseGraph &g = tbi.g; - - set suffixes; - - for (auto v : vertices_range(g)) { - // Literals in the small block table are "shadow" copies of literals in - // the other tables that do not run in the same runtime invocation. - // Dedupe key assignment will be taken care of by the real literals. - if (tbi.hasLiteralInTable(v, ROSE_ANCHORED_SMALL_BLOCK)) { - for (const auto &report_id : g[v].reports) { - sb_vert_map[report_id].insert(v); - } - } else { - for (const auto &report_id : g[v].reports) { - vert_map[report_id].insert(v); - } - } - - // Several vertices may share a suffix, so we collect the set of - // suffixes first to avoid repeating work. 
- if (g[v].suffix) { - suffixes.insert(g[v].suffix); - } - } - - for (const auto &suffix : suffixes) { - for (const auto &report_id : all_reports(suffix)) { - suffix_map[report_id].insert(suffix); - } - } - - for (const auto &outfix : tbi.outfixes) { - for (const auto &report_id : all_reports(outfix)) { - outfix_map[report_id].insert(&outfix); - } - } - - if (tbi.mpv_outfix) { - auto *mpv = tbi.mpv_outfix->mpv(); - for (const auto &puff : mpv->puffettes) { - puff_map[puff.report].insert(&puff); - } - for (const auto &puff : mpv->triggered_puffettes) { - puff_map[puff.report].insert(&puff); - } - } -} - -static -vector makePath(const rose_literal_id &lit) { - vector path(begin(lit.s), end(lit.s)); - for (u32 i = 0; i < lit.delay; i++) { - path.push_back(CharReach::dot()); - } - return path; -} - -/** - * \brief True if one of the given literals overlaps with the suffix of - * another, meaning that they could arrive at the same offset. - */ -static -bool literalsCouldRace(const rose_literal_id &lit1, - const rose_literal_id &lit2) { - DEBUG_PRINTF("compare %s (delay %u) and %s (delay %u)\n", - dumpString(lit1.s).c_str(), lit1.delay, - dumpString(lit2.s).c_str(), lit2.delay); - - // Add dots on the end of each literal for delay. - const auto v1 = makePath(lit1); - const auto v2 = makePath(lit2); - - // See if the smaller path is a suffix of the larger path. - const auto *smaller = v1.size() < v2.size() ? &v1 : &v2; - const auto *bigger = v1.size() < v2.size() ? &v2 : &v1; - auto r = mismatch(smaller->rbegin(), smaller->rend(), bigger->rbegin(), - overlaps); - return r.first == smaller->rend(); -} - -bool RoseDedupeAuxImpl::hasSafeMultiReports( - const flat_set &reports) const { - if (reports.size() <= 1) { - return true; - } - - /* We have more than one ReportID corresponding to the external ID that is - * presented to the user. These may differ in offset adjustment, bounds - * checks, etc. */ - - /* TODO: work out if these differences will actually cause problems */ - - /* One common case where we know we don't have a problem is if there are - * precisely two reports, one for the main Rose path and one for the - * "small block matcher" path. 
*/ - if (reports.size() == 2) { - ReportID id1 = *reports.begin(); - ReportID id2 = *reports.rbegin(); - - bool has_verts_1 = contains(vert_map, id1); - bool has_verts_2 = contains(vert_map, id2); - bool has_sb_verts_1 = contains(sb_vert_map, id1); - bool has_sb_verts_2 = contains(sb_vert_map, id2); - - if (has_verts_1 != has_verts_2 && has_sb_verts_1 != has_sb_verts_2) { - DEBUG_PRINTF("two reports, one full and one small block: ok\n"); - return true; - } - } - - DEBUG_PRINTF("more than one report\n"); - return false; -} - -bool RoseDedupeAuxImpl::requiresDedupeSupport( - const ue2::flat_set &reports) const { - /* TODO: this could be expanded to check for offset or character - constraints */ - - DEBUG_PRINTF("reports: %s\n", as_string_list(reports).c_str()); - - const RoseGraph &g = tbi.g; - - bool has_suffix = false; - bool has_outfix = false; - - if (!hasSafeMultiReports(reports)) { - DEBUG_PRINTF("multiple reports not safe\n"); - return true; - } - - set roles; - set suffixes; - set outfixes; - set puffettes; - for (ReportID r : reports) { - if (contains(vert_map, r)) { - insert(&roles, vert_map.at(r)); - } - if (contains(suffix_map, r)) { - insert(&suffixes, suffix_map.at(r)); - } - - if (contains(outfix_map, r)) { - insert(&outfixes, outfix_map.at(r)); - } - - if (contains(puff_map, r)) { - insert(&puffettes, puff_map.at(r)); - } - } - - /* roles */ - - map lits; // Literal ID -> count of occurrences. - - const bool has_role = !roles.empty(); - for (auto v : roles) { - for (const auto &lit : g[v].literals) { - lits[lit]++; - } - if (g[v].eod_accept) { - // Literals plugged into this EOD accept must be taken into account - // as well. - for (auto u : inv_adjacent_vertices_range(v, g)) { - for (const auto &lit : g[u].literals) { - lits[lit]++; - } - } - } - } - - /* literals */ - - for (const auto &m : lits) { - if (m.second > 1) { - DEBUG_PRINTF("lit %u used by >1 reporting roles\n", m.first); - return true; - } - } - - for (auto it = begin(lits); it != end(lits); ++it) { - const auto &lit1 = tbi.literals.right.at(it->first); - for (auto jt = next(it); jt != end(lits); ++jt) { - const auto &lit2 = tbi.literals.right.at(jt->first); - if (literalsCouldRace(lit1, lit2)) { - DEBUG_PRINTF("literals could race\n"); - return true; - } - } - } - - /* suffixes */ - - for (const auto &suffix : suffixes) { - if (has_suffix || has_role) { - return true; /* scope for badness */ - } - - has_suffix = true; - - /* some lesser suffix engines (nfas, haig, castle) can raise multiple - * matches for a report id at the same offset if there are multiple - * report states live. 
*/ - if (suffix.haig()) { - return true; - } - if (suffix.graph() && - requiresDedupe(*suffix.graph(), reports, tbi.cc.grey)) { - return true; - } - if (suffix.castle() && requiresDedupe(*suffix.castle(), reports)) { - return true; - } - } - - /* outfixes */ - - for (const auto &outfix_ptr : outfixes) { - assert(outfix_ptr); - const OutfixInfo &out = *outfix_ptr; - - if (has_outfix || has_role || has_suffix) { - return true; - } - has_outfix = true; - - if (out.haig()) { - return true; /* haig may report matches with different SOM at the - same offset */ - } - - if (out.holder() && - requiresDedupe(*out.holder(), reports, tbi.cc.grey)) { - return true; - } - } - - /* mpv */ - for (UNUSED const auto &puff : puffettes) { - if (has_outfix || has_role || has_suffix) { - return true; - } - has_outfix = true; - } - - /* boundary */ - if (has_intersection(tbi.boundary.report_at_eod, reports)) { - if (has_outfix || has_role || has_suffix) { - return true; - } - } - - return false; -} - -// Sets the report ID for all vertices connected to an accept to `id`. -void setReportId(NGHolder &g, ReportID id) { - // First, wipe the report IDs on all vertices. - for (auto v : vertices_range(g)) { - g[v].reports.clear(); - } - - // Any predecessors of accept get our id. - for (auto v : inv_adjacent_vertices_range(g.accept, g)) { - g[v].reports.insert(id); - } - - // Same for preds of acceptEod, except accept itself. - for (auto v : inv_adjacent_vertices_range(g.acceptEod, g)) { - if (v == g.accept) { - continue; - } - g[v].reports.insert(id); - } -} - bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) { ORDER_CHECK(minBound); ORDER_CHECK(maxBound); @@ -887,17 +503,6 @@ bool operator<(const RoseEdgeProps &a, const RoseEdgeProps &b) { return false; } -// Note: only clones the vertex, you'll have to wire up your own edges. -RoseVertex RoseBuildImpl::cloneVertex(RoseVertex v) { - RoseVertex v2 = add_vertex(g[v], g); - - for (const auto &lit_id : g[v2].literals) { - literal_info[lit_id].vertices.insert(v2); - } - - return v2; -} - #ifndef NDEBUG bool roseHasTops(const RoseBuildImpl &build, RoseVertex v) { const RoseGraph &g = build.g; @@ -979,7 +584,7 @@ void RoseSuffixInfo::reset(void) { rdfa.reset(); haig.reset(); tamarama.reset(); - dfa_min_width = 0; + dfa_min_width = depth(0); dfa_max_width = depth::infinity(); } @@ -1103,6 +708,13 @@ bool isAnchored(const left_id &r) { if (r.graph()) { return isAnchored(*r.graph()); } + if (r.dfa()) { + return r.dfa()->start_anchored == DEAD_STATE; + } + if (r.haig()) { + return r.haig()->start_anchored == DEAD_STATE; + } + // All other types are explicitly anchored. 
return true; } @@ -1183,7 +795,7 @@ void LeftEngInfo::reset(void) { tamarama.reset(); lag = 0; leftfix_report = MO_INVALID_IDX; - dfa_min_width = 0; + dfa_min_width = depth(0); dfa_max_width = depth::infinity(); } @@ -1264,6 +876,59 @@ u32 roseQuality(const RoseEngine *t) { return 1; } +u32 findMinOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 min_offset = UINT32_MAX; + for (const auto &v : lit_vertices) { + min_offset = min(min_offset, build.g[v].min_offset); + } + + return min_offset; +} + +u32 findMaxOffset(const RoseBuildImpl &build, u32 lit_id) { + const auto &lit_vertices = build.literal_info.at(lit_id).vertices; + assert(!lit_vertices.empty()); + + u32 max_offset = 0; + for (const auto &v : lit_vertices) { + max_offset = max(max_offset, build.g[v].max_offset); + } + + return max_offset; +} + +bool canEagerlyReportAtEod(const RoseBuildImpl &build, const RoseEdge &e) { + const auto &g = build.g; + const auto v = target(e, g); + + if (!build.g[v].eod_accept) { + return false; + } + + // If there's a graph between us and EOD, we shouldn't be eager. + if (build.g[v].left) { + return false; + } + + // Must be exactly at EOD. + if (g[e].minBound != 0 || g[e].maxBound != 0) { + return false; + } + + // In streaming mode, we can only eagerly report EOD for literals in the + // EOD-anchored table, as that's the only time we actually know where EOD + // is. In block mode, we always have this information. + const auto u = source(e, g); + if (build.cc.streaming && !build.isInETable(u)) { + return false; + } + + return true; +} + #ifndef NDEBUG /** \brief Returns true if all the graphs (NFA, DFA, Haig, etc) in this Rose * graph are implementable. */ diff --git a/src/rose/rose_build_program.cpp b/src/rose/rose_build_program.cpp index ee237639b..23a8b959b 100644 --- a/src/rose/rose_build_program.cpp +++ b/src/rose/rose_build_program.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,515 +26,191 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#include "rose_build_engine_blob.h" #include "rose_build_program.h" + +#include "rose_build_engine_blob.h" +#include "rose_build_instructions.h" +#include "rose_build_lookaround.h" +#include "rose_build_resources.h" +#include "nfa/nfa_api_queue.h" +#include "nfa/nfa_build_util.h" +#include "nfa/tamaramacompile.h" +#include "nfagraph/ng_util.h" +#include "util/charreach_util.h" #include "util/container.h" -#include "util/multibit_build.h" +#include "util/compile_context.h" +#include "util/compile_error.h" +#include "util/report_manager.h" #include "util/verify_types.h" +#include + #include #include using namespace std; +using boost::adaptors::map_values; +using boost::adaptors::map_keys; namespace ue2 { -/* Destructors to avoid weak vtables. 
diff --git a/src/rose/rose_build_program.cpp b/src/rose/rose_build_program.cpp
index ee237639b..23a8b959b 100644
--- a/src/rose/rose_build_program.cpp
+++ b/src/rose/rose_build_program.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -26,515 +26,191 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include "rose_build_engine_blob.h"
 #include "rose_build_program.h"
+
+#include "rose_build_engine_blob.h"
+#include "rose_build_instructions.h"
+#include "rose_build_lookaround.h"
+#include "rose_build_resources.h"
+#include "nfa/nfa_api_queue.h"
+#include "nfa/nfa_build_util.h"
+#include "nfa/tamaramacompile.h"
+#include "nfagraph/ng_util.h"
+#include "util/charreach_util.h"
 #include "util/container.h"
-#include "util/multibit_build.h"
+#include "util/compile_context.h"
+#include "util/compile_error.h"
+#include "util/report_manager.h"
 #include "util/verify_types.h"
 
+#include <boost/range/adaptor/map.hpp>
+
 #include <algorithm>
 #include <cstring>
 
 using namespace std;
+using boost::adaptors::map_values;
+using boost::adaptors::map_keys;
 
 namespace ue2 {
 
-/* Destructors to avoid weak vtables.
- */
+engine_info::engine_info(const NFA *nfa, bool trans)
+    : type((NFAEngineType)nfa->type), accepts_eod(nfaAcceptsEod(nfa)),
+      stream_size(nfa->streamStateSize),
+      scratch_size(nfa->scratchStateSize),
+      scratch_align(state_alignment(*nfa)),
+      transient(trans) {
+    assert(scratch_align);
+}
+
+left_build_info::left_build_info(u32 q, u32 l, u32 t, rose_group sm,
+                                 const std::vector<u8> &stops, u32 max_ql,
+                                 u8 cm_count, const CharReach &cm_cr)
+    : queue(q), lag(l), transient(t), squash_mask(sm), stopAlphabet(stops),
+      max_queuelen(max_ql), countingMiracleCount(cm_count),
+      countingMiracleReach(cm_cr) {
+}
 
-RoseInstruction::~RoseInstruction() = default;
-RoseInstrCatchUp::~RoseInstrCatchUp() = default;
-RoseInstrCatchUpMpv::~RoseInstrCatchUpMpv() = default;
-RoseInstrSomZero::~RoseInstrSomZero() = default;
-RoseInstrSuffixesEod::~RoseInstrSuffixesEod() = default;
-RoseInstrMatcherEod::~RoseInstrMatcherEod() = default;
-RoseInstrEnd::~RoseInstrEnd() = default;
+left_build_info::left_build_info(const vector<vector<LookEntry>> &looks)
+    : has_lookaround(true), lookaround(looks) {
+}
 
 using OffsetMap = RoseInstruction::OffsetMap;
 
 static
-u32 calc_jump(const OffsetMap &offset_map, const RoseInstruction *from,
-              const RoseInstruction *to) {
-    DEBUG_PRINTF("computing relative jump from %p to %p\n", from, to);
-    assert(from && contains(offset_map, from));
-    assert(to && contains(offset_map, to));
-
-    u32 from_offset = offset_map.at(from);
-    u32 to_offset = offset_map.at(to);
-    DEBUG_PRINTF("offsets: %u -> %u\n", from_offset, to_offset);
-    assert(from_offset <= to_offset);
-
-    return to_offset - from_offset;
-}
-
-void RoseInstrAnchoredDelay::write(void *dest, RoseEngineBlob &blob,
-                                   const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_ANCHORED_DELAY *>(dest);
-    inst->groups = groups;
-    inst->done_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckLitEarly::write(void *dest, RoseEngineBlob &blob,
-                                   const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_LIT_EARLY *>(dest);
-    inst->min_offset = min_offset;
-}
-
-void RoseInstrCheckGroups::write(void *dest, RoseEngineBlob &blob,
-                                 const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_GROUPS *>(dest);
-    inst->groups = groups;
-}
-
-void RoseInstrCheckOnlyEod::write(void *dest, RoseEngineBlob &blob,
-                                  const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_ONLY_EOD *>(dest);
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckBounds::write(void *dest, RoseEngineBlob &blob,
-                                 const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_BOUNDS *>(dest);
-    inst->min_bound = min_bound;
-    inst->max_bound = max_bound;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckNotHandled::write(void *dest, RoseEngineBlob &blob,
-                                     const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_NOT_HANDLED *>(dest);
-    inst->key = key;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckSingleLookaround::write(void *dest, RoseEngineBlob &blob,
-                                           const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_SINGLE_LOOKAROUND *>(dest);
-    inst->offset = offset;
-    inst->reach_index = reach_index;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckLookaround::write(void *dest, RoseEngineBlob &blob,
-                                     const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_LOOKAROUND *>(dest);
-    inst->index = index;
-    inst->count = count;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckMask::write(void *dest, RoseEngineBlob &blob,
-                               const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_MASK *>(dest);
-    inst->and_mask = and_mask;
-    inst->cmp_mask = cmp_mask;
-    inst->neg_mask = neg_mask;
-    inst->offset = offset;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckMask32::write(void *dest, RoseEngineBlob &blob,
-                                 const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_MASK_32 *>(dest);
-    copy(begin(and_mask), end(and_mask), inst->and_mask);
-    copy(begin(cmp_mask), end(cmp_mask), inst->cmp_mask);
-    inst->neg_mask = neg_mask;
-    inst->offset = offset;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckByte::write(void *dest, RoseEngineBlob &blob,
-                               const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_BYTE *>(dest);
-    inst->and_mask = and_mask;
-    inst->cmp_mask = cmp_mask;
-    inst->negation = negation;
-    inst->offset = offset;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckShufti16x8::write(void *dest, RoseEngineBlob &blob,
-                                     const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_SHUFTI_16x8 *>(dest);
-    copy(begin(nib_mask), end(nib_mask), inst->nib_mask);
-    copy(begin(bucket_select_mask), end(bucket_select_mask),
-         inst->bucket_select_mask);
-    inst->neg_mask = neg_mask;
-    inst->offset = offset;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckShufti32x8::write(void *dest, RoseEngineBlob &blob,
-                                     const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_SHUFTI_32x8 *>(dest);
-    copy(begin(hi_mask), end(hi_mask), inst->hi_mask);
-    copy(begin(lo_mask), end(lo_mask), inst->lo_mask);
-    copy(begin(bucket_select_mask), end(bucket_select_mask),
-         inst->bucket_select_mask);
-
-    inst->neg_mask = neg_mask;
-    inst->offset = offset;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckShufti16x16::write(void *dest, RoseEngineBlob &blob,
-                                      const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_SHUFTI_16x16 *>(dest);
-    copy(begin(hi_mask), end(hi_mask), inst->hi_mask);
-    copy(begin(lo_mask), end(lo_mask), inst->lo_mask);
-    copy(begin(bucket_select_mask), end(bucket_select_mask),
-         inst->bucket_select_mask);
-    inst->neg_mask = neg_mask;
-    inst->offset = offset;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckShufti32x16::write(void *dest, RoseEngineBlob &blob,
-                                      const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_SHUFTI_32x16 *>(dest);
-    copy(begin(hi_mask), end(hi_mask), inst->hi_mask);
-    copy(begin(lo_mask), end(lo_mask), inst->lo_mask);
-    copy(begin(bucket_select_mask_hi), end(bucket_select_mask_hi),
-         inst->bucket_select_mask_hi);
-    copy(begin(bucket_select_mask_lo), end(bucket_select_mask_lo),
-         inst->bucket_select_mask_lo);
-    inst->neg_mask = neg_mask;
-    inst->offset = offset;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckInfix::write(void *dest, RoseEngineBlob &blob,
-                                const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_INFIX *>(dest);
-    inst->queue = queue;
-    inst->lag = lag;
-    inst->report = report;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckPrefix::write(void *dest, RoseEngineBlob &blob,
-                                 const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_PREFIX *>(dest);
-    inst->queue = queue;
-    inst->lag = lag;
-    inst->report = report;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrPushDelayed::write(void *dest, RoseEngineBlob &blob,
-                                 const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_PUSH_DELAYED *>(dest);
-    inst->delay = delay;
-    inst->index = index;
-}
-
-void RoseInstrRecordAnchored::write(void *dest, RoseEngineBlob &blob,
-                                    const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_RECORD_ANCHORED *>(dest);
-    inst->id = id;
-}
-
-void RoseInstrSomAdjust::write(void *dest, RoseEngineBlob &blob,
-                               const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SOM_ADJUST *>(dest);
-    inst->distance = distance;
-}
-
-void RoseInstrSomLeftfix::write(void *dest, RoseEngineBlob &blob,
-                                const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SOM_LEFTFIX *>(dest);
-    inst->queue = queue;
-    inst->lag = lag;
-}
-
-void RoseInstrSomFromReport::write(void *dest, RoseEngineBlob &blob,
-                                   const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SOM_FROM_REPORT *>(dest);
-    inst->som = som;
-}
-
-void RoseInstrTriggerInfix::write(void *dest, RoseEngineBlob &blob,
-                                  const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_TRIGGER_INFIX *>(dest);
-    inst->cancel = cancel;
-    inst->queue = queue;
-    inst->event = event;
-}
-
-void RoseInstrTriggerSuffix::write(void *dest, RoseEngineBlob &blob,
-                                   const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_TRIGGER_SUFFIX *>(dest);
-    inst->queue = queue;
-    inst->event = event;
-}
-
-void RoseInstrDedupe::write(void *dest, RoseEngineBlob &blob,
-                            const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_DEDUPE *>(dest);
-    inst->quash_som = quash_som;
-    inst->dkey = dkey;
-    inst->offset_adjust = offset_adjust;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrDedupeSom::write(void *dest, RoseEngineBlob &blob,
-                               const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_DEDUPE_SOM *>(dest);
-    inst->quash_som = quash_som;
-    inst->dkey = dkey;
-    inst->offset_adjust = offset_adjust;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrReportChain::write(void *dest, RoseEngineBlob &blob,
-                                 const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_REPORT_CHAIN *>(dest);
-    inst->event = event;
-    inst->top_squash_distance = top_squash_distance;
-}
-
-void RoseInstrReportSomInt::write(void *dest, RoseEngineBlob &blob,
-                                  const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_REPORT_SOM_INT *>(dest);
-    inst->som = som;
+OffsetMap makeOffsetMap(const RoseProgram &program, u32 *total_len) {
+    OffsetMap offset_map;
+    u32 offset = 0;
+    for (const auto &ri : program) {
+        offset = ROUNDUP_N(offset, ROSE_INSTR_MIN_ALIGN);
+        DEBUG_PRINTF("instr %p (opcode %d) -> offset %u\n", ri.get(),
+                     ri->code(), offset);
+        assert(!contains(offset_map, ri.get()));
+        offset_map.emplace(ri.get(), offset);
+        offset += ri->byte_length();
+    }
+    *total_len = offset;
+    return offset_map;
 }
 
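The relocated makeOffsetMap() above lays instructions out at
ROSE_INSTR_MIN_ALIGN-aligned offsets before anything is serialised, so that
relative jumps can be resolved. A self-contained sketch of the same layout
arithmetic (the instruction lengths and the alignment value of 8 are made up
for illustration):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Round x up to the next multiple of align (a power of two), mirroring
    // what ROUNDUP_N does in the real code.
    static uint32_t roundup(uint32_t x, uint32_t align) {
        return (x + align - 1) & ~(align - 1);
    }

    int main() {
        // Hypothetical instruction byte lengths.
        std::vector<uint32_t> lengths = {12, 7, 24};
        std::vector<uint32_t> offsets;
        uint32_t offset = 0;
        for (uint32_t len : lengths) {
            offset = roundup(offset, 8); // align each instruction
            offsets.push_back(offset);
            offset += len;
        }
        // 0, then 12 rounds up to 16, then 23 rounds up to 24.
        assert(offsets[0] == 0 && offsets[1] == 16 && offsets[2] == 24);
        return 0;
    }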
-void RoseInstrReportSomAware::write(void *dest, RoseEngineBlob &blob,
-                                    const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_REPORT_SOM_AWARE *>(dest);
-    inst->som = som;
+RoseProgram::RoseProgram() {
+    prog.push_back(make_unique<RoseInstrEnd>());
 }
 
-void RoseInstrReport::write(void *dest, RoseEngineBlob &blob,
-                            const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_REPORT *>(dest);
-    inst->onmatch = onmatch;
-    inst->offset_adjust = offset_adjust;
-}
+RoseProgram::~RoseProgram() = default;
 
-void RoseInstrReportExhaust::write(void *dest, RoseEngineBlob &blob,
-                                   const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_REPORT_EXHAUST *>(dest);
-    inst->onmatch = onmatch;
-    inst->offset_adjust = offset_adjust;
-    inst->ekey = ekey;
-}
+RoseProgram::RoseProgram(RoseProgram &&) = default;
+RoseProgram &RoseProgram::operator=(RoseProgram &&) = default;
 
-void RoseInstrReportSom::write(void *dest, RoseEngineBlob &blob,
-                               const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_REPORT_SOM *>(dest);
-    inst->onmatch = onmatch;
-    inst->offset_adjust = offset_adjust;
+bool RoseProgram::empty() const {
+    assert(!prog.empty());
+    assert(prog.back()->code() == ROSE_INSTR_END);
+    // Empty if we only have one element, the END instruction.
+    return next(prog.begin()) == prog.end();
 }
 
-void RoseInstrReportSomExhaust::write(void *dest, RoseEngineBlob &blob,
-                                      const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_REPORT_SOM_EXHAUST *>(dest);
-    inst->onmatch = onmatch;
-    inst->offset_adjust = offset_adjust;
-    inst->ekey = ekey;
-}
-
-void RoseInstrDedupeAndReport::write(void *dest, RoseEngineBlob &blob,
-                                     const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_DEDUPE_AND_REPORT *>(dest);
-    inst->quash_som = quash_som;
-    inst->dkey = dkey;
-    inst->onmatch = onmatch;
-    inst->offset_adjust = offset_adjust;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrFinalReport::write(void *dest, RoseEngineBlob &blob,
-                                 const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_FINAL_REPORT *>(dest);
-    inst->onmatch = onmatch;
-    inst->offset_adjust = offset_adjust;
-}
-
-void RoseInstrCheckExhausted::write(void *dest, RoseEngineBlob &blob,
-                                    const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_EXHAUSTED *>(dest);
-    inst->ekey = ekey;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrCheckMinLength::write(void *dest, RoseEngineBlob &blob,
-                                    const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_MIN_LENGTH *>(dest);
-    inst->end_adj = end_adj;
-    inst->min_length = min_length;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-}
-
-void RoseInstrSetState::write(void *dest, RoseEngineBlob &blob,
-                              const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SET_STATE *>(dest);
-    inst->index = index;
+const RoseInstruction *RoseProgram::end_instruction() const {
+    assert(!prog.empty());
+    assert(prog.back()->code() == ROSE_INSTR_END);
+
-void RoseInstrSetGroups::write(void *dest, RoseEngineBlob &blob,
-                               const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SET_GROUPS *>(dest);
-    inst->groups = groups;
+    return prog.back().get();
 }
 
-void RoseInstrSquashGroups::write(void *dest, RoseEngineBlob &blob,
-                                  const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SQUASH_GROUPS *>(dest);
-    inst->groups = groups;
+void RoseProgram::update_targets(RoseProgram::iterator it,
+                                 RoseProgram::iterator it_end,
+                                 const RoseInstruction *old_target,
+                                 const RoseInstruction *new_target) {
+    assert(old_target && new_target && old_target != new_target);
+    for (; it != it_end; ++it) {
+        unique_ptr<RoseInstruction> &ri = *it;
+        assert(ri);
+        ri->update_target(old_target, new_target);
+    }
 }
 
-void RoseInstrCheckState::write(void *dest, RoseEngineBlob &blob,
-                                const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_STATE *>(dest);
-    inst->index = index;
-    inst->fail_jump = calc_jump(offset_map, this, target);
+RoseProgram::iterator RoseProgram::insert(RoseProgram::iterator it,
+                                          unique_ptr<RoseInstruction> ri) {
+    assert(!prog.empty());
+    assert(it != end());
+    assert(prog.back()->code() == ROSE_INSTR_END);
+
+    return prog.insert(it, move(ri));
 }
 
-void RoseInstrSparseIterBegin::write(void *dest, RoseEngineBlob &blob,
-                                     const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SPARSE_ITER_BEGIN *>(dest);
-    inst->fail_jump = calc_jump(offset_map, this, target);
+RoseProgram::iterator RoseProgram::insert(RoseProgram::iterator it,
+                                          RoseProgram &&block) {
+    assert(!prog.empty());
+    assert(it != end());
+    assert(prog.back()->code() == ROSE_INSTR_END);
 
-    // Resolve and write the multibit sparse iterator and the jump table.
-    vector<u32> keys;
-    vector<u32> jump_offsets;
-    for (const auto &jump : jump_table) {
-        keys.push_back(jump.first);
-        assert(contains(offset_map, jump.second));
-        jump_offsets.push_back(offset_map.at(jump.second));
+    if (block.empty()) {
+        return it;
     }
 
-    vector<mmbit_sparse_iter> iter;
-    mmbBuildSparseIterator(iter, keys, num_keys);
-    assert(!iter.empty());
-    inst->iter_offset = blob.add_iterator(iter);
-    inst->jump_table = blob.add(jump_offsets.begin(), jump_offsets.end());
-
-    // Store offsets for corresponding SPARSE_ITER_NEXT operations.
-    is_written = true;
-    iter_offset = inst->iter_offset;
-    jump_table_offset = inst->jump_table;
+    const RoseInstruction *end_ptr = block.end_instruction();
+    assert(end_ptr->code() == ROSE_INSTR_END);
+    block.prog.pop_back();
+
+    const RoseInstruction *new_target = it->get();
+    update_targets(block.prog.begin(), block.prog.end(), end_ptr, new_target);
+
+    // Workaround: container insert() for ranges doesn't return an iterator
+    // in the version of the STL distributed with gcc 4.8.
+    auto dist = distance(prog.begin(), it);
+    prog.insert(it, make_move_iterator(block.prog.begin()),
+                make_move_iterator(block.prog.end()));
+    it = prog.begin();
+    advance(it, dist);
+    return it;
 }
 
-void RoseInstrSparseIterNext::write(void *dest, RoseEngineBlob &blob,
-                                    const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SPARSE_ITER_NEXT *>(dest);
-    inst->state = state;
-    inst->fail_jump = calc_jump(offset_map, this, target);
-
-    // Use the same sparse iterator and jump table as the SPARSE_ITER_BEGIN
-    // instruction.
-    assert(begin);
-    assert(contains(offset_map, begin));
-    assert(begin->is_written);
-    inst->iter_offset = begin->iter_offset;
-    inst->jump_table = begin->jump_table_offset;
+RoseProgram::iterator RoseProgram::erase(RoseProgram::iterator first,
+                                         RoseProgram::iterator last) {
+    return prog.erase(first, last);
 }
 
-void RoseInstrSparseIterAny::write(void *dest, RoseEngineBlob &blob,
-                                   const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_SPARSE_ITER_ANY *>(dest);
-    inst->fail_jump = calc_jump(offset_map, this, target);
-
-    // Write the multibit sparse iterator.
-    vector<mmbit_sparse_iter> iter;
-    mmbBuildSparseIterator(iter, keys, num_keys);
-    assert(!iter.empty());
-    inst->iter_offset = blob.add_iterator(iter);
+void RoseProgram::add_before_end(std::unique_ptr<RoseInstruction> ri) {
+    assert(!prog.empty());
+    insert(std::prev(prog.end()), std::move(ri));
 }
 
-void RoseInstrEnginesEod::write(void *dest, RoseEngineBlob &blob,
-                                const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_ENGINES_EOD *>(dest);
-    inst->iter_offset = iter_offset;
-}
+void RoseProgram::add_before_end(RoseProgram &&block) {
+    assert(!prog.empty());
+    assert(prog.back()->code() == ROSE_INSTR_END);
 
-void RoseInstrCheckLongLit::write(void *dest, RoseEngineBlob &blob,
-                                  const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_LONG_LIT *>(dest);
-    assert(!literal.empty());
-    inst->lit_offset = blob.add(literal.c_str(), literal.size(), 1);
-    inst->lit_length = verify_u32(literal.size());
-}
+    if (block.empty()) {
+        return;
+    }
 
-void RoseInstrCheckLongLitNocase::write(void *dest, RoseEngineBlob &blob,
-                                        const OffsetMap &offset_map) const {
-    RoseInstrBase::write(dest, blob, offset_map);
-    auto *inst = static_cast<ROSE_STRUCT_CHECK_LONG_LIT_NOCASE *>(dest);
-    assert(!literal.empty());
-    inst->lit_offset = blob.add(literal.c_str(), literal.size(), 1);
-    inst->lit_length = verify_u32(literal.size());
+    insert(prev(prog.end()), move(block));
 }
 
-static
-OffsetMap makeOffsetMap(const RoseProgram &program, u32 *total_len) {
-    OffsetMap offset_map;
-    u32 offset = 0;
-    for (const auto &ri : program) {
-        offset = ROUNDUP_N(offset, ROSE_INSTR_MIN_ALIGN);
-        DEBUG_PRINTF("instr %p (opcode %d) -> offset %u\n", ri.get(),
-                     ri->code(), offset);
-        assert(!contains(offset_map, ri.get()));
-        offset_map.emplace(ri.get(), offset);
-        offset += ri->byte_length();
+void RoseProgram::add_block(RoseProgram &&block) {
+    assert(!prog.empty());
+    assert(prog.back()->code() == ROSE_INSTR_END);
+
+    if (block.empty()) {
+        return;
     }
-    *total_len = offset;
-    return offset_map;
+
+    // Replace pointers to the current END with pointers to the first
+    // instruction in the new sequence.
+    const RoseInstruction *end_ptr = end_instruction();
+    prog.pop_back();
+    update_targets(prog.begin(), prog.end(), end_ptr,
+                   block.prog.front().get());
+    prog.insert(prog.end(), make_move_iterator(block.prog.begin()),
+                make_move_iterator(block.prog.end()));
 }
 
-aligned_unique_ptr<char>
-writeProgram(RoseEngineBlob &blob, const RoseProgram &program, u32 *total_len) {
-    const auto offset_map = makeOffsetMap(program, total_len);
-    DEBUG_PRINTF("%zu instructions, len %u\n", program.size(), *total_len);
+bytecode_ptr<char> writeProgram(RoseEngineBlob &blob,
+                                const RoseProgram &program) {
+    u32 total_len = 0;
+    const auto offset_map = makeOffsetMap(program, &total_len);
+    DEBUG_PRINTF("%zu instructions, len %u\n", program.size(), total_len);
 
-    auto bytecode = aligned_zmalloc_unique<char>(*total_len);
+    auto bytecode = make_zeroed_bytecode_ptr<char>(total_len,
+                                                   ROSE_INSTR_MIN_ALIGN);
     char *ptr = bytecode.get();
 
     for (const auto &ri : program) {
@@ -546,6 +222,15 @@ writeProgram(RoseEngineBlob &blob, const RoseProgram &program, u32 *total_len) {
     return bytecode;
 }
 
+size_t RoseProgramHash::operator()(const RoseProgram &program) const {
+    size_t v = 0;
+    for (const auto &ri : program) {
+        assert(ri);
+        boost::hash_combine(v, ri->hash());
+    }
+    return v;
+}
+
 bool RoseProgramEquivalence::operator()(const RoseProgram &prog1,
                                         const RoseProgram &prog2) const {
     if (prog1.size() != prog2.size()) {
@@ -569,4 +254,2095 @@ bool RoseProgramEquivalence::operator()(const RoseProgram &prog1,
     return std::equal(prog1.begin(), prog1.end(), prog2.begin(), is_equiv);
 }
 
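RoseProgramHash and RoseProgramEquivalence exist so that structurally
identical programs can be deduplicated and share a single copy of bytecode.
A sketch of how a caller might combine them with writeProgram() for that
purpose (the cache shape, the blob.add() arguments and bytecode.size() call
are assumptions for illustration, not the actual call sites in the build):

    #include <unordered_map>
    #include <utility>

    // Hypothetical helper: write a program once, reuse its offset thereafter.
    u32 addProgram(std::unordered_map<RoseProgram, u32, RoseProgramHash,
                                      RoseProgramEquivalence> &cache,
                   RoseEngineBlob &blob, RoseProgram &&program) {
        auto it = cache.find(program);
        if (it != cache.end()) {
            return it->second; // identical program already written
        }
        auto bytecode = writeProgram(blob, program);
        u32 offset = blob.add(bytecode.get(), bytecode.size(),
                              ROSE_INSTR_MIN_ALIGN);
        cache.emplace(std::move(program), offset);
        return offset;
    }

Hashing is cheap relative to full structural comparison, so the equivalence
functor only runs on hash collisions, which is the usual unordered_map
trade-off.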
+/* Removes any CHECK_HANDLED instructions from the given program */
+static
+void stripCheckHandledInstruction(RoseProgram &prog) {
+    for (auto it = prog.begin(); it != prog.end();) {
+        auto ins = dynamic_cast<const RoseInstrCheckNotHandled *>(it->get());
+        if (!ins) {
+            ++it;
+            continue;
+        }
+
+        auto next_it = next(it);
+        assert(next_it != prog.end()); /* there should always be an end ins */
+        auto next_ins = next_it->get();
+
+        /* update all earlier instructions which point to ins to instead point
+         * to the next instruction. Only need to look at earlier as we only
+         * ever jump forward. */
+        RoseProgram::update_targets(prog.begin(), it, ins, next_ins);
+
+        /* remove check handled instruction */
+        it = prog.erase(it, next_it);
+    }
+}
+
+
+/** Returns true if the program may read the interpreter's work_done flag */
+static
+bool reads_work_done_flag(const RoseProgram &prog) {
+    for (const auto &ri : prog) {
+        if (dynamic_cast<const RoseInstrSquashGroups *>(ri.get())) {
+            return true;
+        }
+    }
+    return false;
+}
+
+void addEnginesEodProgram(u32 eodNfaIterOffset, RoseProgram &program) {
+    if (!eodNfaIterOffset) {
+        return;
+    }
+
+    RoseProgram block;
+    block.add_before_end(make_unique<RoseInstrEnginesEod>(eodNfaIterOffset));
+    program.add_block(move(block));
+}
+
+void addSuffixesEodProgram(RoseProgram &program) {
+    RoseProgram block;
+    block.add_before_end(make_unique<RoseInstrSuffixesEod>());
+    program.add_block(move(block));
+}
+
+void addMatcherEodProgram(RoseProgram &program) {
+    RoseProgram block;
+    block.add_before_end(make_unique<RoseInstrMatcherEod>());
+    program.add_block(move(block));
+}
+
+static
+void makeRoleCheckLeftfix(const RoseBuildImpl &build,
+                          const map<RoseVertex, left_build_info> &leftfix_info,
+                          RoseVertex v, RoseProgram &program) {
+    auto it = leftfix_info.find(v);
+    if (it == end(leftfix_info)) {
+        return;
+    }
+    const left_build_info &lni = it->second;
+    if (lni.has_lookaround) {
+        return; // Leftfix completely implemented by lookaround.
+    }
+
+    assert(!build.cc.streaming ||
+           build.g[v].left.lag <= MAX_STORED_LEFTFIX_LAG);
+
+    bool is_prefix = build.isRootSuccessor(v);
+    const auto *end_inst = program.end_instruction();
+
+    unique_ptr<RoseInstruction> ri;
+    if (is_prefix) {
+        ri = make_unique<RoseInstrCheckPrefix>(lni.queue, build.g[v].left.lag,
+                                               build.g[v].left.leftfix_report,
+                                               end_inst);
+    } else {
+        ri = make_unique<RoseInstrCheckInfix>(lni.queue, build.g[v].left.lag,
+                                              build.g[v].left.leftfix_report,
+                                              end_inst);
+    }
+    program.add_before_end(move(ri));
+}
+
+static
+void makeAnchoredLiteralDelay(const RoseBuildImpl &build,
+                              const ProgramBuild &prog_build, u32 lit_id,
+                              RoseProgram &program) {
+    // Only relevant for literals in the anchored table.
+    const rose_literal_id &lit = build.literals.at(lit_id);
+    if (lit.table != ROSE_ANCHORED) {
+        return;
+    }
+
+    // If this literal match cannot occur after floatingMinLiteralMatchOffset,
+    // we do not need this check.
+    bool all_too_early = true;
+    rose_group groups = 0;
+
+    const auto &lit_vertices = build.literal_info.at(lit_id).vertices;
+    for (RoseVertex v : lit_vertices) {
+        if (build.g[v].max_offset > prog_build.floatingMinLiteralMatchOffset) {
+            all_too_early = false;
+        }
+        groups |= build.g[v].groups;
+    }
+
+    if (all_too_early) {
+        return;
+    }
+
+    assert(contains(prog_build.anchored_programs, lit_id));
+    u32 anch_id = prog_build.anchored_programs.at(lit_id);
+
+    const auto *end_inst = program.end_instruction();
+    auto ri = make_unique<RoseInstrAnchoredDelay>(groups, anch_id, end_inst);
+    program.add_before_end(move(ri));
+}
+
+static
+void makeDedupe(const ReportManager &rm, const Report &report,
+                RoseProgram &program) {
+    const auto *end_inst = program.end_instruction();
+    auto ri =
+        make_unique<RoseInstrDedupe>(report.quashSom, rm.getDkey(report),
+                                     report.offsetAdjust, end_inst);
+    program.add_before_end(move(ri));
+}
+
+static
+void makeDedupeSom(const ReportManager &rm, const Report &report,
+                   RoseProgram &program) {
+    const auto *end_inst = program.end_instruction();
+    auto ri = make_unique<RoseInstrDedupeSom>(report.quashSom,
+                                              rm.getDkey(report),
+                                              report.offsetAdjust, end_inst);
+    program.add_before_end(move(ri));
+}
+
+static
+void makeCatchup(const ReportManager &rm, bool needs_catchup,
+                 const flat_set<ReportID> &reports, RoseProgram &program) {
+    if (!needs_catchup) {
+        return;
+    }
+
+    // Everything except the INTERNAL_ROSE_CHAIN report needs catchup to run
+    // before reports are triggered.
+
+    auto report_needs_catchup = [&](const ReportID &id) {
+        const Report &report = rm.getReport(id);
+        return report.type != INTERNAL_ROSE_CHAIN;
+    };
+
+    if (!any_of(begin(reports), end(reports), report_needs_catchup)) {
+        DEBUG_PRINTF("none of the given reports needs catchup\n");
+        return;
+    }
+
+    program.add_before_end(make_unique<RoseInstrCatchUp>());
+}
+
+static
+void writeSomOperation(const Report &report, som_operation *op) {
+    assert(op);
+
+    memset(op, 0, sizeof(*op));
+
+    switch (report.type) {
+    case EXTERNAL_CALLBACK_SOM_REL:
+        op->type = SOM_EXTERNAL_CALLBACK_REL;
+        break;
+    case INTERNAL_SOM_LOC_SET:
+        op->type = SOM_INTERNAL_LOC_SET;
+        break;
+    case INTERNAL_SOM_LOC_SET_IF_UNSET:
+        op->type = SOM_INTERNAL_LOC_SET_IF_UNSET;
+        break;
+    case INTERNAL_SOM_LOC_SET_IF_WRITABLE:
+        op->type = SOM_INTERNAL_LOC_SET_IF_WRITABLE;
+        break;
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA:
+        op->type = SOM_INTERNAL_LOC_SET_REV_NFA;
+        break;
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET:
+        op->type = SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET;
+        break;
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE:
+        op->type = SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE;
+        break;
+    case INTERNAL_SOM_LOC_COPY:
+        op->type = SOM_INTERNAL_LOC_COPY;
+        break;
+    case INTERNAL_SOM_LOC_COPY_IF_WRITABLE:
+        op->type = SOM_INTERNAL_LOC_COPY_IF_WRITABLE;
+        break;
+    case INTERNAL_SOM_LOC_MAKE_WRITABLE:
+        op->type = SOM_INTERNAL_LOC_MAKE_WRITABLE;
+        break;
+    case EXTERNAL_CALLBACK_SOM_STORED:
+        op->type = SOM_EXTERNAL_CALLBACK_STORED;
+        break;
+    case EXTERNAL_CALLBACK_SOM_ABS:
+        op->type = SOM_EXTERNAL_CALLBACK_ABS;
+        break;
+    case EXTERNAL_CALLBACK_SOM_REV_NFA:
+        op->type = SOM_EXTERNAL_CALLBACK_REV_NFA;
+        break;
+    case INTERNAL_SOM_LOC_SET_FROM:
+        op->type = SOM_INTERNAL_LOC_SET_FROM;
+        break;
+    case INTERNAL_SOM_LOC_SET_FROM_IF_WRITABLE:
+        op->type = SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE;
+        break;
+    default:
+        // This report doesn't correspond to a SOM operation.
+        assert(0);
+        throw CompileError("Unable to generate bytecode.");
+    }
+
+    op->onmatch = report.onmatch;
+
+    switch (report.type) {
+    case EXTERNAL_CALLBACK_SOM_REV_NFA:
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA:
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET:
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE:
+        op->aux.revNfaIndex = report.revNfaIndex;
+        break;
+    default:
+        op->aux.somDistance = report.somDistance;
+        break;
+    }
+}
+
+static
+void makeReport(const RoseBuildImpl &build, const ReportID id,
+                const bool has_som, RoseProgram &program) {
+    assert(id < build.rm.numReports());
+    const Report &report = build.rm.getReport(id);
+
+    RoseProgram report_block;
+    const RoseInstruction *end_inst = report_block.end_instruction();
+
+    // Handle min/max offset checks.
+    if (report.minOffset > 0 || report.maxOffset < MAX_OFFSET) {
+        auto ri = make_unique<RoseInstrCheckBounds>(report.minOffset,
+                                                    report.maxOffset, end_inst);
+        report_block.add_before_end(move(ri));
+    }
+
+    // If this report has an exhaustion key, we can check it in the program
+    // rather than waiting until we're in the callback adaptor.
+    if (report.ekey != INVALID_EKEY) {
+        auto ri = make_unique<RoseInstrCheckExhausted>(report.ekey, end_inst);
+        report_block.add_before_end(move(ri));
+    }
+
+    // External SOM reports that aren't passthrough need their SOM value
+    // calculated.
+    if (isExternalSomReport(report) &&
+        report.type != EXTERNAL_CALLBACK_SOM_PASS) {
+        auto ri = make_unique<RoseInstrSomFromReport>();
+        writeSomOperation(report, &ri->som);
+        report_block.add_before_end(move(ri));
+    }
+
+    // Min length constraint.
+    if (report.minLength > 0) {
+        assert(build.hasSom);
+        auto ri = make_unique<RoseInstrCheckMinLength>(
+            report.offsetAdjust, report.minLength, end_inst);
+        report_block.add_before_end(move(ri));
+    }
+
+    if (report.quashSom) {
+        report_block.add_before_end(make_unique<RoseInstrSomZero>());
+    }
+
+    switch (report.type) {
+    case EXTERNAL_CALLBACK:
+        if (!has_som) {
+            // Dedupe is only necessary if this report has a dkey, or if there
+            // are SOM reports to catch up.
+            bool needs_dedupe = build.rm.getDkey(report) != ~0U || build.hasSom;
+            if (report.ekey == INVALID_EKEY) {
+                if (needs_dedupe) {
+                    report_block.add_before_end(
+                        make_unique<RoseInstrDedupeAndReport>(
+                            report.quashSom, build.rm.getDkey(report),
+                            report.onmatch, report.offsetAdjust, end_inst));
+                } else {
+                    report_block.add_before_end(make_unique<RoseInstrReport>(
+                        report.onmatch, report.offsetAdjust));
+                }
+            } else {
+                if (needs_dedupe) {
+                    makeDedupe(build.rm, report, report_block);
+                }
+                report_block.add_before_end(make_unique<RoseInstrReportExhaust>(
+                    report.onmatch, report.offsetAdjust, report.ekey));
+            }
+        } else { // has_som
+            makeDedupeSom(build.rm, report, report_block);
+            if (report.ekey == INVALID_EKEY) {
+                report_block.add_before_end(make_unique<RoseInstrReportSom>(
+                    report.onmatch, report.offsetAdjust));
+            } else {
+                report_block.add_before_end(
+                    make_unique<RoseInstrReportSomExhaust>(
+                        report.onmatch, report.offsetAdjust, report.ekey));
+            }
+        }
+        break;
+    case INTERNAL_SOM_LOC_SET:
+    case INTERNAL_SOM_LOC_SET_IF_UNSET:
+    case INTERNAL_SOM_LOC_SET_IF_WRITABLE:
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA:
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_UNSET:
+    case INTERNAL_SOM_LOC_SET_SOM_REV_NFA_IF_WRITABLE:
+    case INTERNAL_SOM_LOC_COPY:
+    case INTERNAL_SOM_LOC_COPY_IF_WRITABLE:
+    case INTERNAL_SOM_LOC_MAKE_WRITABLE:
+    case INTERNAL_SOM_LOC_SET_FROM:
+    case INTERNAL_SOM_LOC_SET_FROM_IF_WRITABLE:
+        if (has_som) {
+            auto ri = make_unique<RoseInstrReportSomAware>();
+            writeSomOperation(report, &ri->som);
+            report_block.add_before_end(move(ri));
+        } else {
+            auto ri = make_unique<RoseInstrReportSomInt>();
+            writeSomOperation(report, &ri->som);
+            report_block.add_before_end(move(ri));
+        }
+        break;
+    case INTERNAL_ROSE_CHAIN: {
+        report_block.add_before_end(make_unique<RoseInstrReportChain>(
+            report.onmatch, report.topSquashDistance));
+        break;
+    }
+    case EXTERNAL_CALLBACK_SOM_REL:
+    case EXTERNAL_CALLBACK_SOM_STORED:
+    case EXTERNAL_CALLBACK_SOM_ABS:
+    case EXTERNAL_CALLBACK_SOM_REV_NFA:
+        makeDedupeSom(build.rm, report, report_block);
+        if (report.ekey == INVALID_EKEY) {
+            report_block.add_before_end(make_unique<RoseInstrReportSom>(
+                report.onmatch, report.offsetAdjust));
+        } else {
+            report_block.add_before_end(make_unique<RoseInstrReportSomExhaust>(
+                report.onmatch, report.offsetAdjust, report.ekey));
+        }
+        break;
+    case EXTERNAL_CALLBACK_SOM_PASS:
+        makeDedupeSom(build.rm, report, report_block);
+        if (report.ekey == INVALID_EKEY) {
+            report_block.add_before_end(make_unique<RoseInstrReportSom>(
+                report.onmatch, report.offsetAdjust));
+        } else {
+            report_block.add_before_end(make_unique<RoseInstrReportSomExhaust>(
+                report.onmatch, report.offsetAdjust, report.ekey));
+        }
+        break;
+
+    default:
+        assert(0);
+        throw CompileError("Unable to generate bytecode.");
+    }
+
+    assert(!report_block.empty());
+    program.add_block(move(report_block));
+}
+
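For plain EXTERNAL_CALLBACK reports without SOM, makeReport() above chooses
between a bare REPORT and a fused DEDUPE_AND_REPORT using one predicate. A
toy restatement of just that rule (the NO_DKEY sentinel here stands in for
the real invalid-dkey value of ~0U used by the ReportManager):

    #include <cstdint>

    static const uint32_t NO_DKEY = ~0U; // hypothetical sentinel

    // Dedupe is required when the report has a dedupe key, or when SOM is
    // in use anywhere in the database (SOM matches at the same offset may
    // need reconciling).
    static bool needsDedupe(uint32_t dkey, bool db_has_som) {
        return dkey != NO_DKEY || db_has_som;
    }

    int main() {
        return (needsDedupe(5, false) && needsDedupe(NO_DKEY, true) &&
                !needsDedupe(NO_DKEY, false)) ? 0 : 1;
    }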
+static
+void makeRoleReports(const RoseBuildImpl &build,
+                     const std::map<RoseVertex, left_build_info> &leftfix_info,
+                     bool needs_catchup, RoseVertex v, RoseProgram &program) {
+    const auto &g = build.g;
+
+    bool report_som = false;
+    if (g[v].left.tracksSom()) {
+        /* we are a suffaig - need to update role to provide som to the
+         * suffix. */
+        assert(contains(leftfix_info, v));
+        const left_build_info &lni = leftfix_info.at(v);
+        program.add_before_end(
+            make_unique<RoseInstrSomLeftfix>(lni.queue, g[v].left.lag));
+        report_som = true;
+    } else if (g[v].som_adjust) {
+        program.add_before_end(
+            make_unique<RoseInstrSomAdjust>(g[v].som_adjust));
+        report_som = true;
+    }
+
+    makeCatchup(build.rm, needs_catchup, g[v].reports, program);
+
+    RoseProgram report_block;
+    for (ReportID id : g[v].reports) {
+        makeReport(build, id, report_som, report_block);
+    }
+    program.add_before_end(move(report_block));
+}
+
+static
+void makeRoleSetState(const unordered_map<RoseVertex, u32> &roleStateIndices,
+                      RoseVertex v, RoseProgram &program) {
+    // We only need this instruction if a state index has been assigned to this
+    // vertex.
+    auto it = roleStateIndices.find(v);
+    if (it == end(roleStateIndices)) {
+        return;
+    }
+    program.add_before_end(make_unique<RoseInstrSetState>(it->second));
+}
+
+static
+void makePushDelayedInstructions(const RoseLiteralMap &literals,
+                                 ProgramBuild &prog_build,
+                                 const flat_set<u32> &delayed_ids,
+                                 RoseProgram &program) {
+    vector<RoseInstrPushDelayed> delay_instructions;
+
+    for (const auto &delayed_lit_id : delayed_ids) {
+        DEBUG_PRINTF("delayed lit id %u\n", delayed_lit_id);
+        assert(contains(prog_build.delay_programs, delayed_lit_id));
+        u32 delay_id = prog_build.delay_programs.at(delayed_lit_id);
+        const auto &delay_lit = literals.at(delayed_lit_id);
+        delay_instructions.emplace_back(verify_u8(delay_lit.delay), delay_id);
+    }
+
+    sort_and_unique(delay_instructions, [](const RoseInstrPushDelayed &a,
+                                           const RoseInstrPushDelayed &b) {
+        return tie(a.delay, a.index) < tie(b.delay, b.index);
+    });
+
+    for (const auto &ri : delay_instructions) {
+        program.add_before_end(make_unique<RoseInstrPushDelayed>(ri));
+    }
+}
+
+static
+void makeCheckLiteralInstruction(const rose_literal_id &lit,
+                                 size_t longLitLengthThreshold,
+                                 RoseProgram &program,
+                                 const CompileContext &cc) {
+    assert(longLitLengthThreshold > 0);
+
+    DEBUG_PRINTF("lit=%s, long lit threshold %zu\n", dumpString(lit.s).c_str(),
+                 longLitLengthThreshold);
+
+    if (lit.s.length() <= ROSE_SHORT_LITERAL_LEN_MAX) {
+        DEBUG_PRINTF("lit short enough to not need confirm\n");
+        return;
+    }
+
+    // Check resource limits as well.
+    if (lit.s.length() > cc.grey.limitLiteralLength) {
+        throw ResourceLimitError();
+    }
+
+    if (lit.s.length() <= longLitLengthThreshold) {
+        DEBUG_PRINTF("is a medium-length literal\n");
+        const auto *end_inst = program.end_instruction();
+        unique_ptr<RoseInstruction> ri;
+        if (lit.s.any_nocase()) {
+            ri = make_unique<RoseInstrCheckMedLitNocase>(lit.s.get_string(),
+                                                         end_inst);
+        } else {
+            ri = make_unique<RoseInstrCheckMedLit>(lit.s.get_string(),
+                                                   end_inst);
+        }
+        program.add_before_end(move(ri));
+        return;
+    }
+
+    // Long literal support should only really be used for the floating table
+    // in streaming mode.
+    assert(lit.table == ROSE_FLOATING && cc.streaming);
+
+    DEBUG_PRINTF("is a long literal\n");
+
+    const auto *end_inst = program.end_instruction();
+    unique_ptr<RoseInstruction> ri;
+    if (lit.s.any_nocase()) {
+        ri = make_unique<RoseInstrCheckLongLitNocase>(lit.s.get_string(),
+                                                      end_inst);
+    } else {
+        ri = make_unique<RoseInstrCheckLongLit>(lit.s.get_string(), end_inst);
+    }
+    program.add_before_end(move(ri));
+}
+
+static
+void makeRoleCheckNotHandled(ProgramBuild &prog_build, RoseVertex v,
+                             RoseProgram &program) {
+    u32 handled_key;
+    if (contains(prog_build.handledKeys, v)) {
+        handled_key = prog_build.handledKeys.at(v);
+    } else {
+        handled_key = verify_u32(prog_build.handledKeys.size());
+        prog_build.handledKeys.emplace(v, handled_key);
+    }
+
+    const auto *end_inst = program.end_instruction();
+    auto ri = make_unique<RoseInstrCheckNotHandled>(handled_key, end_inst);
+    program.add_before_end(move(ri));
+}
+
+static
+void makeRoleCheckBounds(const RoseBuildImpl &build, RoseVertex v,
+                         const RoseEdge &e, RoseProgram &program) {
+    const RoseGraph &g = build.g;
+    const RoseVertex u = source(e, g);
+
+    // We know that we can trust the anchored table (DFA) to always deliver us
+    // literals at the correct offset.
+    if (build.isAnchored(v)) {
+        DEBUG_PRINTF("literal in anchored table, skipping bounds check\n");
+        return;
+    }
+
+    // Use the minimum literal length.
+    u32 lit_length = g[v].eod_accept ? 0 : verify_u32(build.minLiteralLen(v));
+
+    u64a min_bound = g[e].minBound + lit_length;
+    u64a max_bound = g[e].maxBound == ROSE_BOUND_INF
+                         ? ROSE_BOUND_INF
+                         : g[e].maxBound + lit_length;
+
+    if (g[e].history == ROSE_ROLE_HISTORY_ANCH) {
+        assert(g[u].fixedOffset());
+        // Make offsets absolute.
+        min_bound += g[u].max_offset;
+        if (max_bound != ROSE_BOUND_INF) {
+            max_bound += g[u].max_offset;
+        }
+    }
+
+    assert(max_bound <= ROSE_BOUND_INF);
+    assert(min_bound <= max_bound);
+
+    // CHECK_BOUNDS instruction uses 64-bit bounds, so we can use MAX_OFFSET
+    // (max value of a u64a) to represent ROSE_BOUND_INF.
+    if (max_bound == ROSE_BOUND_INF) {
+        max_bound = MAX_OFFSET;
+    }
+
+    // This instruction should be doing _something_ -- bounds should be tighter
+    // than just {length, inf}.
+    assert(min_bound > lit_length || max_bound < MAX_OFFSET);
+
+    const auto *end_inst = program.end_instruction();
+    program.add_before_end(
+        make_unique<RoseInstrCheckBounds>(min_bound, max_bound, end_inst));
+}
+
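The bounds arithmetic in makeRoleCheckBounds() is easy to check by hand:
edge bounds are measured from the end of the predecessor role, so the
literal's length is added, and anchored-history edges are rebased to
absolute offsets using the predecessor's fixed offset. A worked example
with made-up numbers:

    #include <cassert>
    #include <cstdint>

    static const uint64_t BOUND_INF = ~0ULL; // stands in for ROSE_BOUND_INF

    int main() {
        uint64_t minBound = 2, maxBound = 5; // edge bounds
        uint32_t lit_length = 3;             // this role's literal length
        uint64_t pred_offset = 10;           // fixed offset of anchored pred

        uint64_t min_bound = minBound + lit_length + pred_offset;
        uint64_t max_bound = maxBound == BOUND_INF
                                 ? BOUND_INF
                                 : maxBound + lit_length + pred_offset;

        // The CHECK_BOUNDS instruction would then verify the match end
        // offset lies in [15, 18].
        assert(min_bound == 15 && max_bound == 18);
        return 0;
    }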
+static
+void makeRoleGroups(const RoseGraph &g, ProgramBuild &prog_build,
+                    RoseVertex v, RoseProgram &program) {
+    rose_group groups = g[v].groups;
+    if (!groups) {
+        return;
+    }
+
+    // The set of "already on" groups as we process this vertex is the
+    // intersection of the groups set by our predecessors.
+    assert(in_degree(v, g) > 0);
+    rose_group already_on = ~rose_group{0};
+    for (const auto &u : inv_adjacent_vertices_range(v, g)) {
+        already_on &= prog_build.vertex_group_map.at(u);
+    }
+
+    DEBUG_PRINTF("already_on=0x%llx\n", already_on);
+    DEBUG_PRINTF("squashable=0x%llx\n", prog_build.squashable_groups);
+    DEBUG_PRINTF("groups=0x%llx\n", groups);
+
+    already_on &= ~prog_build.squashable_groups;
+    DEBUG_PRINTF("squashed already_on=0x%llx\n", already_on);
+
+    // We don't *have* to mask off the groups that we know are already on, but
+    // this will make bugs more apparent.
+    groups &= ~already_on;
+
+    if (!groups) {
+        DEBUG_PRINTF("no new groups to set, skipping\n");
+        return;
+    }
+
+    program.add_before_end(make_unique<RoseInstrSetGroups>(groups));
+}
+
+static
+bool checkReachMask(const CharReach &cr, u8 &andmask, u8 &cmpmask) {
+    size_t reach_size = cr.count();
+    assert(reach_size > 0);
+    // check whether entry_size is some power of 2.
+    if ((reach_size - 1) & reach_size) {
+        return false;
+    }
+    make_and_cmp_mask(cr, &andmask, &cmpmask);
+    if ((1 << popcount32((u8)(~andmask))) ^ reach_size) {
+        return false;
+    }
+    return true;
+}
+
+static
+bool checkReachWithFlip(const CharReach &cr, u8 &andmask,
+                        u8 &cmpmask, u8 &flip) {
+    if (checkReachMask(cr, andmask, cmpmask)) {
+        flip = 0;
+        return true;
+    }
+    if (checkReachMask(~cr, andmask, cmpmask)) {
+        flip = 1;
+        return true;
+    }
+    return false;
+}
+
+static
+bool makeRoleByte(const vector<LookEntry> &look, RoseProgram &program) {
+    if (look.size() == 1) {
+        const auto &entry = look[0];
+        u8 andmask_u8, cmpmask_u8;
+        u8 flip;
+        if (!checkReachWithFlip(entry.reach, andmask_u8, cmpmask_u8, flip)) {
+            return false;
+        }
+        s32 checkbyte_offset = verify_s32(entry.offset);
+        DEBUG_PRINTF("CHECK BYTE offset=%d\n", checkbyte_offset);
+        const auto *end_inst = program.end_instruction();
+        auto ri = make_unique<RoseInstrCheckByte>(andmask_u8, cmpmask_u8, flip,
+                                                  checkbyte_offset, end_inst);
+        program.add_before_end(move(ri));
+        return true;
+    }
+    return false;
+}
+
+static
+bool makeRoleMask(const vector<LookEntry> &look, RoseProgram &program) {
+    if (look.back().offset < look.front().offset + 8) {
+        s32 base_offset = verify_s32(look.front().offset);
+        u64a and_mask = 0;
+        u64a cmp_mask = 0;
+        u64a neg_mask = 0;
+        for (const auto &entry : look) {
+            u8 andmask_u8, cmpmask_u8, flip;
+            if (!checkReachWithFlip(entry.reach, andmask_u8,
+                                    cmpmask_u8, flip)) {
+                return false;
+            }
+            DEBUG_PRINTF("entry offset %d\n", entry.offset);
+            u32 shift = (entry.offset - base_offset) << 3;
+            and_mask |= (u64a)andmask_u8 << shift;
+            cmp_mask |= (u64a)cmpmask_u8 << shift;
+            if (flip) {
+                neg_mask |= 0xffLLU << shift;
+            }
+        }
+        DEBUG_PRINTF("CHECK MASK and_mask=%llx cmp_mask=%llx\n",
+                     and_mask, cmp_mask);
+        const auto *end_inst = program.end_instruction();
+        auto ri = make_unique<RoseInstrCheckMask>(and_mask, cmp_mask, neg_mask,
+                                                  base_offset, end_inst);
+        program.add_before_end(move(ri));
+        return true;
+    }
+    return false;
+}
+
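checkReachMask() accepts exactly those reaches expressible as
(c & and_mask) == cmp_mask: power-of-two-sized sets whose members differ
only in "don't care" bits. A worked example for the classic case,
case-insensitive 'a' (0x41 and 0x61 differ only in bit 5, the ASCII case
bit):

    #include <cassert>

    int main() {
        const unsigned char and_mask = 0xdf; // clear bit 5 (the case bit)
        const unsigned char cmp_mask = 0x41; // 'A'

        for (int c = 0; c < 256; c++) {
            bool in_reach = (c == 'A' || c == 'a');
            // The masked compare matches exactly the two-element reach.
            assert(((c & and_mask) == cmp_mask) == in_reach);
        }
        return 0;
    }

makeRoleMask() then packs up to eight such byte checks into one 64-bit
AND/CMP pair, with neg_mask flagging the bytes whose sense was flipped.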
+static UNUSED
+string convertMaskstoString(u8 *p, int byte_len) {
+    string s;
+    for (int i = 0; i < byte_len; i++) {
+        u8 hi = *p >> 4;
+        u8 lo = *p & 0xf;
+        s += (char)(hi + (hi < 10 ? 48 : 87));
+        s += (char)(lo + (lo < 10 ? 48 : 87));
+        p++;
+    }
+    return s;
+}
+
+static
+bool makeRoleMask32(const vector<LookEntry> &look,
+                    RoseProgram &program) {
+    if (look.back().offset >= look.front().offset + 32) {
+        return false;
+    }
+    s32 base_offset = verify_s32(look.front().offset);
+    array<u8, 32> and_mask, cmp_mask;
+    and_mask.fill(0);
+    cmp_mask.fill(0);
+    u32 neg_mask = 0;
+    for (const auto &entry : look) {
+        u8 andmask_u8, cmpmask_u8, flip;
+        if (!checkReachWithFlip(entry.reach, andmask_u8,
+                                cmpmask_u8, flip)) {
+            return false;
+        }
+        u32 shift = entry.offset - base_offset;
+        assert(shift < 32);
+        and_mask[shift] = andmask_u8;
+        cmp_mask[shift] = cmpmask_u8;
+        if (flip) {
+            neg_mask |= 1 << shift;
+        }
+    }
+
+    DEBUG_PRINTF("and_mask %s\n",
+                 convertMaskstoString(and_mask.data(), 32).c_str());
+    DEBUG_PRINTF("cmp_mask %s\n",
+                 convertMaskstoString(cmp_mask.data(), 32).c_str());
+    DEBUG_PRINTF("neg_mask %08x\n", neg_mask);
+    DEBUG_PRINTF("base_offset %d\n", base_offset);
+
+    const auto *end_inst = program.end_instruction();
+    auto ri = make_unique<RoseInstrCheckMask32>(and_mask, cmp_mask, neg_mask,
+                                                base_offset, end_inst);
+    program.add_before_end(move(ri));
+    return true;
+}
+
+// Sorting by the size of every bucket.
+// Used in map<u32, vector<s8>, cmpNibble>.
+struct cmpNibble {
+    bool operator()(const u32 data1, const u32 data2) const {
+        u32 size1 = popcount32(data1 >> 16) * popcount32(data1 << 16);
+        u32 size2 = popcount32(data2 >> 16) * popcount32(data2 << 16);
+        return std::tie(size1, data1) < std::tie(size2, data2);
+    }
+};
+
+// Insert all pairs of bucket and offset into buckets.
+static really_inline
+void getAllBuckets(const vector<LookEntry> &look,
+                   map<u32, vector<s8>, cmpNibble> &buckets, u64a &neg_mask) {
+    s32 base_offset = verify_s32(look.front().offset);
+    for (const auto &entry : look) {
+        CharReach cr = entry.reach;
+        // Flip heavy character classes to save buckets.
+        if (cr.count() > 128) {
+            cr.flip();
+        } else {
+            neg_mask ^= 1ULL << (entry.offset - base_offset);
+        }
+        map<u16, u16> lo2hi;
+        // We treat the ASCII table as a 16x16 grid.
+        // Push every row in cr into lo2hi and mark the row number.
+        for (size_t i = cr.find_first(); i != CharReach::npos;) {
+            u8 it_hi = i >> 4;
+            u16 low_encode = 0;
+            while (i != CharReach::npos && (i >> 4) == it_hi) {
+                low_encode |= 1 << (i & 0xf);
+                i = cr.find_next(i);
+            }
+            lo2hi[low_encode] |= 1 << it_hi;
+        }
+        for (const auto &it : lo2hi) {
+            u32 hi_lo = (it.second << 16) | it.first;
+            buckets[hi_lo].push_back(entry.offset);
+        }
+    }
+}
+
+// Once we have a new bucket, we'll try to combine it with all old buckets.
+static really_inline
+void nibUpdate(map<u32, u16> &nib, u32 hi_lo) {
+    u16 hi = hi_lo >> 16;
+    u16 lo = hi_lo & 0xffff;
+    for (const auto pairs : nib) {
+        u32 old = pairs.first;
+        if ((old >> 16) == hi || (old & 0xffff) == lo) {
+            if (!nib[old | hi_lo]) {
+                nib[old | hi_lo] = nib[old] | nib[hi_lo];
+            }
+        }
+    }
+}
+
+static really_inline
+void nibMaskUpdate(array<u8, 32> &mask, u32 data, u8 bit_index) {
+    for (u8 index = 0; data > 0; data >>= 1, index++) {
+        if (data & 1) {
+            // 0 ~ 7 bucket in first 16 bytes,
+            // 8 ~ 15 bucket in second 16 bytes.
+            if (bit_index >= 8) {
+                mask[index + 16] |= 1 << (bit_index - 8);
+            } else {
+                mask[index] |= 1 << bit_index;
+            }
+        }
+    }
+}
+
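getAllBuckets() views the byte space as a 16x16 grid: each high nibble
selects a row, each row's occupied low nibbles form a 16-bit column
pattern, and rows sharing a pattern can share a shufti bucket. A
self-contained sketch of that decomposition (reach set chosen purely for
illustration):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <set>

    int main() {
        std::set<uint8_t> reach = {0x61, 0x62, 0x71, 0x72}; // a, b, q, r
        std::map<uint16_t, uint16_t> lo2hi; // column pattern -> row bits

        for (int hi = 0; hi < 16; hi++) {
            uint16_t low_encode = 0;
            for (int lo = 0; lo < 16; lo++) {
                if (reach.count((uint8_t)((hi << 4) | lo))) {
                    low_encode |= 1u << lo;
                }
            }
            if (low_encode) {
                lo2hi[low_encode] |= 1u << hi;
            }
        }

        // Rows 0x6 and 0x7 use identical column patterns {1, 2}, so they
        // collapse to a single bucket candidate.
        assert(lo2hi.size() == 1);
        assert(lo2hi[(1u << 1) | (1u << 2)] == ((1u << 6) | (1u << 7)));
        return 0;
    }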
+static
+bool getShuftiMasks(const vector<LookEntry> &look, array<u8, 32> &hi_mask,
+                    array<u8, 32> &lo_mask, u8 *bucket_select_hi,
+                    u8 *bucket_select_lo, u64a &neg_mask,
+                    u8 &bit_idx, size_t len) {
+    map<u32, u16> nib; // map every bucket to its bucket number.
+    map<u32, vector<s8>, cmpNibble> bucket2offsets;
+    s32 base_offset = look.front().offset;
+
+    bit_idx = 0;
+    neg_mask = ~0ULL;
+
+    getAllBuckets(look, bucket2offsets, neg_mask);
+
+    for (const auto &it : bucket2offsets) {
+        u32 hi_lo = it.first;
+        // New bucket.
+        if (!nib[hi_lo]) {
+            if ((bit_idx >= 8 && len == 64) || bit_idx >= 16) {
+                return false;
+            }
+            nib[hi_lo] = 1 << bit_idx;
+
+            nibUpdate(nib, hi_lo);
+            nibMaskUpdate(hi_mask, hi_lo >> 16, bit_idx);
+            nibMaskUpdate(lo_mask, hi_lo & 0xffff, bit_idx);
+            bit_idx++;
+        }
+
+        DEBUG_PRINTF("hi_lo %x bucket %x\n", hi_lo, nib[hi_lo]);
+
+        // Update bucket_select_mask.
+        u8 nib_hi = nib[hi_lo] >> 8;
+        u8 nib_lo = nib[hi_lo] & 0xff;
+        for (const auto offset : it.second) {
+            bucket_select_hi[offset - base_offset] |= nib_hi;
+            bucket_select_lo[offset - base_offset] |= nib_lo;
+        }
+    }
+    return true;
+}
+
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti16x8(u32 offset_range, u8 bucket_idx,
+                    const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                    const array<u8, 32> &bucket_select_mask,
+                    u32 neg_mask, s32 base_offset,
+                    const RoseInstruction *end_inst) {
+    if (offset_range > 16 || bucket_idx > 8) {
+        return nullptr;
+    }
+    array<u8, 32> nib_mask;
+    array<u8, 16> bucket_select_mask_16;
+    copy(lo_mask.begin(), lo_mask.begin() + 16, nib_mask.begin());
+    copy(hi_mask.begin(), hi_mask.begin() + 16, nib_mask.begin() + 16);
+    copy(bucket_select_mask.begin(), bucket_select_mask.begin() + 16,
+         bucket_select_mask_16.begin());
+    return make_unique<RoseInstrCheckShufti16x8>
+           (nib_mask, bucket_select_mask_16,
+            neg_mask & 0xffff, base_offset, end_inst);
+}
+
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti32x8(u32 offset_range, u8 bucket_idx,
+                    const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                    const array<u8, 32> &bucket_select_mask,
+                    u32 neg_mask, s32 base_offset,
+                    const RoseInstruction *end_inst) {
+    if (offset_range > 32 || bucket_idx > 8) {
+        return nullptr;
+    }
+
+    array<u8, 16> hi_mask_16;
+    array<u8, 16> lo_mask_16;
+    copy(hi_mask.begin(), hi_mask.begin() + 16, hi_mask_16.begin());
+    copy(lo_mask.begin(), lo_mask.begin() + 16, lo_mask_16.begin());
+    return make_unique<RoseInstrCheckShufti32x8>
+           (hi_mask_16, lo_mask_16, bucket_select_mask,
+            neg_mask, base_offset, end_inst);
+}
+
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti16x16(u32 offset_range, u8 bucket_idx,
+                     const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                     const array<u8, 32> &bucket_select_mask_lo,
+                     const array<u8, 32> &bucket_select_mask_hi,
+                     u32 neg_mask, s32 base_offset,
+                     const RoseInstruction *end_inst) {
+    if (offset_range > 16 || bucket_idx > 16) {
+        return nullptr;
+    }
+
+    array<u8, 32> bucket_select_mask_32;
+    copy(bucket_select_mask_lo.begin(), bucket_select_mask_lo.begin() + 16,
+         bucket_select_mask_32.begin());
+    copy(bucket_select_mask_hi.begin(), bucket_select_mask_hi.begin() + 16,
+         bucket_select_mask_32.begin() + 16);
+    return make_unique<RoseInstrCheckShufti16x16>
+           (hi_mask, lo_mask, bucket_select_mask_32,
+            neg_mask & 0xffff, base_offset, end_inst);
+}
+static
+unique_ptr<RoseInstruction>
+makeCheckShufti32x16(u32 offset_range, u8 bucket_idx,
+                     const array<u8, 32> &hi_mask, const array<u8, 32> &lo_mask,
+                     const array<u8, 32> &bucket_select_mask_lo,
+                     const array<u8, 32> &bucket_select_mask_hi,
+                     u32 neg_mask, s32 base_offset,
+                     const RoseInstruction *end_inst) {
+    if (offset_range > 32 || bucket_idx > 16) {
+        return nullptr;
+    }
+
+    return make_unique<RoseInstrCheckShufti32x16>
+           (hi_mask, lo_mask, bucket_select_mask_hi,
+            bucket_select_mask_lo, neg_mask, base_offset, end_inst);
+}
+
+static
+bool makeRoleShufti(const vector<LookEntry> &look, RoseProgram &program) {
+
+    s32 base_offset = verify_s32(look.front().offset);
+    if (look.back().offset >= base_offset + 32) {
+        return false;
+    }
+
+    u8 bucket_idx = 0; // number of buckets
+    u64a neg_mask_64;
+    array<u8, 32> hi_mask;
+    array<u8, 32> lo_mask;
+    array<u8, 32> bucket_select_hi;
+    array<u8, 32> bucket_select_lo;
+    hi_mask.fill(0);
+    lo_mask.fill(0);
+    bucket_select_hi.fill(0); // will not be used in 16x8 and 32x8.
+    bucket_select_lo.fill(0);
+
+    if (!getShuftiMasks(look, hi_mask, lo_mask, bucket_select_hi.data(),
+                        bucket_select_lo.data(), neg_mask_64, bucket_idx, 32)) {
+        return false;
+    }
+    u32 neg_mask = (u32)neg_mask_64;
+
+    DEBUG_PRINTF("hi_mask %s\n",
+                 convertMaskstoString(hi_mask.data(), 32).c_str());
+    DEBUG_PRINTF("lo_mask %s\n",
+                 convertMaskstoString(lo_mask.data(), 32).c_str());
+    DEBUG_PRINTF("bucket_select_hi %s\n",
+                 convertMaskstoString(bucket_select_hi.data(), 32).c_str());
+    DEBUG_PRINTF("bucket_select_lo %s\n",
+                 convertMaskstoString(bucket_select_lo.data(), 32).c_str());
+
+    const auto *end_inst = program.end_instruction();
+    s32 offset_range = look.back().offset - base_offset + 1;
+
+    auto ri = makeCheckShufti16x8(offset_range, bucket_idx, hi_mask, lo_mask,
+                                  bucket_select_lo, neg_mask, base_offset,
+                                  end_inst);
+    if (!ri) {
+        ri = makeCheckShufti32x8(offset_range, bucket_idx, hi_mask, lo_mask,
+                                 bucket_select_lo, neg_mask, base_offset,
+                                 end_inst);
+    }
+    if (!ri) {
+        ri = makeCheckShufti16x16(offset_range, bucket_idx, hi_mask, lo_mask,
+                                  bucket_select_lo, bucket_select_hi,
+                                  neg_mask, base_offset, end_inst);
+    }
+    if (!ri) {
+        ri = makeCheckShufti32x16(offset_range, bucket_idx, hi_mask, lo_mask,
+                                  bucket_select_lo, bucket_select_hi,
+                                  neg_mask, base_offset, end_inst);
+    }
+    assert(ri);
+    program.add_before_end(move(ri));
+
+    return true;
+}
+
+/**
+ * Builds a lookaround instruction, or an appropriate specialization if one is
+ * available.
+ */
+static
+void makeLookaroundInstruction(const vector<LookEntry> &look,
+                               RoseProgram &program) {
+    assert(!look.empty());
+
+    if (makeRoleByte(look, program)) {
+        return;
+    }
+
+    if (look.size() == 1) {
+        s8 offset = look.begin()->offset;
+        const CharReach &reach = look.begin()->reach;
+        auto ri = make_unique<RoseInstrCheckSingleLookaround>(
+            offset, reach, program.end_instruction());
+        program.add_before_end(move(ri));
+        return;
+    }
+
+    if (makeRoleMask(look, program)) {
+        return;
+    }
+
+    if (makeRoleMask32(look, program)) {
+        return;
+    }
+
+    if (makeRoleShufti(look, program)) {
+        return;
+    }
+
+    auto ri = make_unique<RoseInstrCheckLookaround>(look,
+                                                    program.end_instruction());
+    program.add_before_end(move(ri));
+}
+
+static
+void makeCheckLitMaskInstruction(const RoseBuildImpl &build, u32 lit_id,
+                                 RoseProgram &program) {
+    const auto &info = build.literal_info.at(lit_id);
+    if (!info.requires_benefits) {
+        return;
+    }
+
+    vector<LookEntry> look;
+
+    const ue2_literal &s = build.literals.at(lit_id).s;
+    DEBUG_PRINTF("building mask for lit %u: %s\n", lit_id,
+                 dumpString(s).c_str());
+    assert(s.length() <= MAX_MASK2_WIDTH);
+    s32 i = 0 - s.length();
+    for (const auto &e : s) {
+        if (!e.nocase) {
+            look.emplace_back(verify_s8(i), e);
+        }
+        i++;
+    }
+
+    assert(!look.empty());
+    makeLookaroundInstruction(look, program);
+}
+
+static
+void makeCheckLitEarlyInstruction(const RoseBuildImpl &build, u32 lit_id,
+                                  const vector<RoseEdge> &lit_edges,
+                                  u32 floatingMinLiteralMatchOffset,
+                                  RoseProgram &prog) {
+    if (lit_edges.empty()) {
+        return;
+    }
+
+    if (floatingMinLiteralMatchOffset == 0) {
+        return;
+    }
+
+    RoseVertex v = target(lit_edges.front(), build.g);
+    if (!build.isFloating(v)) {
+        return;
+    }
+
+    const auto &lit = build.literals.at(lit_id);
+    size_t min_len = lit.elength();
+    u32 min_offset = findMinOffset(build, lit_id);
+    DEBUG_PRINTF("has min_len=%zu, min_offset=%u, global min is %u\n", min_len,
+                 min_offset, floatingMinLiteralMatchOffset);
+
+    // If we can't match before the min offset, we don't need the check.
+    if (min_len >= floatingMinLiteralMatchOffset) {
+        DEBUG_PRINTF("no need for check, min is %u\n",
+                     floatingMinLiteralMatchOffset);
+        return;
+    }
+
+    assert(min_offset >= floatingMinLiteralMatchOffset);
+    assert(min_offset < UINT32_MAX);
+
+    DEBUG_PRINTF("adding lit early check, min_offset=%u\n", min_offset);
+    const auto *end = prog.end_instruction();
+    prog.add_before_end(make_unique<RoseInstrCheckLitEarly>(min_offset, end));
+}
+
+static
+void makeGroupCheckInstruction(const RoseBuildImpl &build, u32 lit_id,
+                               RoseProgram &prog) {
+    const auto &info = build.literal_info.at(lit_id);
+
+    if (!info.group_mask) {
+        return;
+    }
+    prog.add_before_end(make_unique<RoseInstrCheckGroups>(info.group_mask));
+}
+
+static
+bool hasDelayedLiteral(const RoseBuildImpl &build,
+                       const vector<RoseEdge> &lit_edges) {
+    auto is_delayed = [&build](u32 lit_id) { return build.isDelayed(lit_id); };
+    for (const auto &e : lit_edges) {
+        auto v = target(e, build.g);
+        const auto &lits = build.g[v].literals;
+        if (any_of(begin(lits), end(lits), is_delayed)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static
+RoseProgram makeLitInitialProgram(const RoseBuildImpl &build,
+                                  ProgramBuild &prog_build, u32 lit_id,
+                                  const vector<RoseEdge> &lit_edges,
+                                  bool is_anchored_replay_program) {
+    RoseProgram program;
+
+    // Check long literal info.
+    if (!build.isDelayed(lit_id)) {
+        makeCheckLiteralInstruction(build.literals.at(lit_id),
+                                    prog_build.longLitLengthThreshold,
+                                    program, build.cc);
+    }
+
+    // Check lit mask.
+    makeCheckLitMaskInstruction(build, lit_id, program);
+
+    // Check literal groups. This is an optimisation that we only perform for
+    // delayed literals, as their groups may be switched off; ordinarily, we
+    // can trust the HWLM matcher.
+    if (hasDelayedLiteral(build, lit_edges)) {
+        makeGroupCheckInstruction(build, lit_id, program);
+    }
+
+    // Add instructions for pushing delayed matches, if there are any.
+    makePushDelayedInstructions(build.literals, prog_build,
+                                build.literal_info.at(lit_id).delayed_ids,
+                                program);
+
+    // Add pre-check for early literals in the floating table.
+    makeCheckLitEarlyInstruction(build, lit_id, lit_edges,
+                                 prog_build.floatingMinLiteralMatchOffset,
+                                 program);
+
+    /* Check if we are able to deliver matches from the anchored table now */
+    if (!is_anchored_replay_program) {
+        makeAnchoredLiteralDelay(build, prog_build, lit_id, program);
+    }
+
+    return program;
+}
+
+static
+bool makeRoleMultipathShufti(const vector<vector<LookEntry>> &multi_look,
+                             RoseProgram &program) {
+    if (multi_look.empty()) {
+        return false;
+    }
+
+    // find the base offset
+    assert(!multi_look[0].empty());
+    s32 base_offset = multi_look[0].front().offset;
+    s32 last_start = base_offset;
+    s32 end_offset = multi_look[0].back().offset;
+    size_t multi_len = 0;
+
+    for (const auto &look : multi_look) {
+        assert(look.size() > 0);
+        multi_len += look.size();
+
+        LIMIT_TO_AT_MOST(&base_offset, look.front().offset);
+        ENSURE_AT_LEAST(&last_start, look.front().offset);
+        ENSURE_AT_LEAST(&end_offset, look.back().offset);
+    }
+
+    assert(last_start < 0);
+
+    if (end_offset - base_offset >= MULTIPATH_MAX_LEN) {
+        return false;
+    }
+
+    if (multi_len <= 16) {
+        multi_len = 16;
+    } else if (multi_len <= 32) {
+        multi_len = 32;
+    } else if (multi_len <= 64) {
+        multi_len = 64;
+    } else {
+        DEBUG_PRINTF("too long for multi-path\n");
+        return false;
+    }
+
+    vector<LookEntry> linear_look;
+    array<u8, MULTIPATH_MAX_LEN> data_select_mask;
+    data_select_mask.fill(0);
+    u64a hi_bits_mask = 0;
+    u64a lo_bits_mask = 0;
+
+    for (const auto &look : multi_look) {
+        assert(linear_look.size() < 64);
+        lo_bits_mask |= 1LLU << linear_look.size();
+        for (const auto &entry : look) {
+            assert(entry.offset - base_offset < MULTIPATH_MAX_LEN);
+            data_select_mask[linear_look.size()] =
+                verify_u8(entry.offset - base_offset);
+            linear_look.emplace_back(verify_s8(linear_look.size()),
+                                     entry.reach);
+        }
+        hi_bits_mask |= 1LLU << (linear_look.size() - 1);
+    }
+
+    u8 bit_index = 0; // number of buckets
+    u64a neg_mask;
+    array<u8, 32> hi_mask;
+    array<u8, 32> lo_mask;
+    array<u8, 64> bucket_select_hi;
+    array<u8, 64> bucket_select_lo;
+    hi_mask.fill(0);
+    lo_mask.fill(0);
+    bucket_select_hi.fill(0);
+    bucket_select_lo.fill(0);
+
+    if (!getShuftiMasks(linear_look, hi_mask, lo_mask, bucket_select_hi.data(),
+                        bucket_select_lo.data(), neg_mask, bit_index,
+                        multi_len)) {
+        return false;
+    }
+
+    DEBUG_PRINTF("hi_mask %s\n",
+                 convertMaskstoString(hi_mask.data(), 16).c_str());
+    DEBUG_PRINTF("lo_mask %s\n",
+                 convertMaskstoString(lo_mask.data(), 16).c_str());
+    DEBUG_PRINTF("bucket_select_hi %s\n",
+                 convertMaskstoString(bucket_select_hi.data(), 64).c_str());
+    DEBUG_PRINTF("bucket_select_lo %s\n",
+                 convertMaskstoString(bucket_select_lo.data(), 64).c_str());
+    DEBUG_PRINTF("data_select_mask %s\n",
+                 convertMaskstoString(data_select_mask.data(), 64).c_str());
+    DEBUG_PRINTF("hi_bits_mask %llx\n", hi_bits_mask);
+    DEBUG_PRINTF("lo_bits_mask %llx\n", lo_bits_mask);
+    DEBUG_PRINTF("neg_mask %llx\n", neg_mask);
+    DEBUG_PRINTF("base_offset %d\n", base_offset);
+    DEBUG_PRINTF("last_start %d\n", last_start);
+
+    // Since we don't have 16x16 now, just call 32x16 instead.
+
+static
+void makeRoleMultipathLookaround(const vector<vector<LookEntry>> &multi_look,
+                                 RoseProgram &program) {
+    assert(!multi_look.empty());
+    assert(multi_look.size() <= MAX_LOOKAROUND_PATHS);
+    vector<vector<LookEntry>> ordered_look;
+    set<s32> look_offset;
+
+    assert(!multi_look[0].empty());
+    s32 last_start = multi_look[0][0].offset;
+
+    // Build the offset table.
+    for (const auto &look : multi_look) {
+        assert(look.size() > 0);
+        last_start = max(last_start, (s32)look.begin()->offset);
+
+        for (const auto &t : look) {
+            look_offset.insert(t.offset);
+        }
+    }
+
+    array<u8, MULTIPATH_MAX_LEN> start_mask;
+    if (multi_look.size() < MAX_LOOKAROUND_PATHS) {
+        start_mask.fill((1 << multi_look.size()) - 1);
+    } else {
+        start_mask.fill(0xff);
+    }
+
+    u32 path_idx = 0;
+    for (const auto &look : multi_look) {
+        for (const auto &t : look) {
+            assert(t.offset >= (int)*look_offset.begin());
+            size_t update_offset = t.offset - *look_offset.begin() + 1;
+            if (update_offset < start_mask.size()) {
+                start_mask[update_offset] &= ~(1 << path_idx);
+            }
+        }
+        path_idx++;
+    }
+
+    for (u32 i = 1; i < MULTIPATH_MAX_LEN; i++) {
+        start_mask[i] &= start_mask[i - 1];
+        DEBUG_PRINTF("start_mask[%u] = %x\n", i, start_mask[i]);
+    }
+
+    assert(look_offset.size() <= MULTIPATH_MAX_LEN);
+
+    assert(last_start < 0);
+
+    for (const auto &offset : look_offset) {
+        vector<LookEntry> multi_entry;
+        multi_entry.resize(MAX_LOOKAROUND_PATHS);
+
+        for (size_t i = 0; i < multi_look.size(); i++) {
+            for (const auto &t : multi_look[i]) {
+                if (t.offset == offset) {
+                    multi_entry[i] = t;
+                }
+            }
+        }
+        ordered_look.emplace_back(multi_entry);
+    }
+
+    auto ri = make_unique<RoseInstrMultipathLookaround>(move(ordered_look),
+                                                        last_start, start_mask,
+                                                        program.end_instruction());
+    program.add_before_end(move(ri));
+}
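/* Illustrative sketch (not from this patch): what a multipath lookaround
 * check computes. Several alternative (offset, character-class) sequences are
 * examined around the match point, and the check passes if any one path
 * matches in full. Types are simplified stand-ins; bounds checks omitted. */
#include <bitset>
#include <cstddef>
#include <vector>

struct Look {
    int offset;             // position relative to the current match offset
    std::bitset<256> reach; // bytes accepted at that position
};

static bool anyPathMatches(const std::vector<std::vector<Look>> &paths,
                           const unsigned char *buf, size_t cur) {
    for (const auto &path : paths) { // try each alternative path
        bool ok = true;
        for (const auto &e : path) { // every entry on the path must accept
            if (!e.reach.test(buf[cur + e.offset])) {
                ok = false;
                break;
            }
        }
        if (ok) {
            return true; // one fully-matching path is enough
        }
    }
    return false;
}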
+
+static
+void makeRoleLookaround(const RoseBuildImpl &build,
+                        const map<RoseVertex, left_build_info> &leftfix_info,
+                        RoseVertex v, RoseProgram &program) {
+    if (!build.cc.grey.roseLookaroundMasks) {
+        return;
+    }
+
+    vector<vector<LookEntry>> looks;
+
+    // Lookaround from leftfix (mandatory).
+    if (contains(leftfix_info, v) && leftfix_info.at(v).has_lookaround) {
+        DEBUG_PRINTF("using leftfix lookaround\n");
+        looks = leftfix_info.at(v).lookaround;
+    }
+
+    // We may be able to find more lookaround info (advisory) and merge it
+    // in.
+    if (looks.size() <= 1) {
+        vector<LookEntry> look;
+        vector<LookEntry> look_more;
+        if (!looks.empty()) {
+            look = move(looks.front());
+        }
+        findLookaroundMasks(build, v, look_more);
+        mergeLookaround(look, look_more);
+        if (!look.empty()) {
+            makeLookaroundInstruction(look, program);
+        }
+        return;
+    }
+
+    if (!makeRoleMultipathShufti(looks, program)) {
+        assert(looks.size() <= 8);
+        makeRoleMultipathLookaround(looks, program);
+    }
+}
+
+static
+void makeRoleSuffix(const RoseBuildImpl &build,
+                    const map<suffix_id, u32> &suffixes,
+                    const map<u32, engine_info> &engine_info_by_queue,
+                    RoseVertex v, RoseProgram &prog) {
+    const auto &g = build.g;
+    if (!g[v].suffix) {
+        return;
+    }
+    assert(contains(suffixes, g[v].suffix));
+    u32 queue = suffixes.at(g[v].suffix);
+    u32 event;
+    assert(contains(engine_info_by_queue, queue));
+    const auto eng_info = engine_info_by_queue.at(queue);
+    if (isContainerType(eng_info.type)) {
+        auto tamaProto = g[v].suffix.tamarama.get();
+        assert(tamaProto);
+        event = (u32)MQE_TOP_FIRST +
+                tamaProto->top_remap.at(make_pair(g[v].index,
+                                                  g[v].suffix.top));
+        assert(event < MQE_INVALID);
+    } else if (isMultiTopType(eng_info.type)) {
+        assert(!g[v].suffix.haig);
+        event = (u32)MQE_TOP_FIRST + g[v].suffix.top;
+        assert(event < MQE_INVALID);
+    } else {
+        // DFAs/Puffs have no MQE_TOP_N support, so they get a classic TOP
+        // event.
+        assert(!g[v].suffix.graph || onlyOneTop(*g[v].suffix.graph));
+        event = MQE_TOP;
+    }
+
+    prog.add_before_end(make_unique<RoseInstrTriggerSuffix>(queue, event));
+}
+
+static
+void addInfixTriggerInstructions(vector<TriggerInfo> triggers,
+                                 RoseProgram &prog) {
+    // Order, de-dupe and add instructions to the end of the program.
+    sort_and_unique(triggers, [](const TriggerInfo &a, const TriggerInfo &b) {
+        return tie(a.cancel, a.queue, a.event) <
+               tie(b.cancel, b.queue, b.event);
+    });
+    for (const auto &ti : triggers) {
+        prog.add_before_end(
+            make_unique<RoseInstrTriggerInfix>(ti.cancel, ti.queue, ti.event));
+    }
+}
+
+static
+void makeRoleInfixTriggers(const RoseBuildImpl &build,
+                           const map<RoseVertex, left_build_info> &leftfix_info,
+                           const map<u32, engine_info> &engine_info_by_queue,
+                           RoseVertex u, RoseProgram &program) {
+    const auto &g = build.g;
+
+    vector<TriggerInfo> triggers;
+
+    for (const auto &e : out_edges_range(u, g)) {
+        RoseVertex v = target(e, g);
+        if (!g[v].left) {
+            continue;
+        }
+
+        assert(contains(leftfix_info, v));
+        const left_build_info &lbi = leftfix_info.at(v);
+        if (lbi.has_lookaround) {
+            continue;
+        }
+
+        assert(contains(engine_info_by_queue, lbi.queue));
+        const auto &eng_info = engine_info_by_queue.at(lbi.queue);
+
+        // DFAs have no TOP_N support, so they get a classic MQE_TOP event.
+        u32 top;
+        if (isContainerType(eng_info.type)) {
+            auto tamaProto = g[v].left.tamarama.get();
+            assert(tamaProto);
+            top = MQE_TOP_FIRST + tamaProto->top_remap.at(
+                      make_pair(g[v].index, g[e].rose_top));
+            assert(top < MQE_INVALID);
+        } else if (!isMultiTopType(eng_info.type)) {
+            assert(num_tops(g[v].left) == 1);
+            top = MQE_TOP;
+        } else {
+            top = MQE_TOP_FIRST + g[e].rose_top;
+            assert(top < MQE_INVALID);
+        }
+
+        triggers.emplace_back(g[e].rose_cancel_prev_top, lbi.queue, top);
+    }
+
+    addInfixTriggerInstructions(move(triggers), program);
+}
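/* Illustrative sketch (not from this patch): the sort-then-unique pattern
 * used by addInfixTriggerInstructions, with std::tie providing the ordering.
 * TriggerRec stands in for TriggerInfo. */
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

struct TriggerRec {
    uint8_t cancel;
    uint32_t queue;
    uint32_t event;
};

static void sortAndUnique(std::vector<TriggerRec> &v) {
    std::sort(v.begin(), v.end(), [](const TriggerRec &a, const TriggerRec &b) {
        return std::tie(a.cancel, a.queue, a.event) <
               std::tie(b.cancel, b.queue, b.event);
    });
    v.erase(std::unique(v.begin(), v.end(),
                        [](const TriggerRec &a, const TriggerRec &b) {
                            return std::tie(a.cancel, a.queue, a.event) ==
                                   std::tie(b.cancel, b.queue, b.event);
                        }),
            v.end());
}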
+
+/**
+ * \brief True if the given vertex is a role that can only be switched on at
+ * EOD.
+ */
+static
+bool onlyAtEod(const RoseBuildImpl &tbi, RoseVertex v) {
+    const RoseGraph &g = tbi.g;
+
+    // All such roles have only (0,0) edges to vertices with the eod_accept
+    // property, and no other effects (suffixes, ordinary reports, etc).
+
+    if (isLeafNode(v, g) || !g[v].reports.empty() || g[v].suffix) {
+        return false;
+    }
+
+    for (const auto &e : out_edges_range(v, g)) {
+        RoseVertex w = target(e, g);
+        if (!g[w].eod_accept) {
+            return false;
+        }
+        assert(!g[w].reports.empty());
+        assert(g[w].literals.empty());
+
+        if (g[e].minBound || g[e].maxBound) {
+            return false;
+        }
+    }
+
+    /* There is no point in enforcing this check at runtime if this role is
+     * only fired by the EOD event literal. */
+    if (tbi.eod_event_literal_id != MO_INVALID_IDX &&
+        g[v].literals.size() == 1 &&
+        *g[v].literals.begin() == tbi.eod_event_literal_id) {
+        return false;
+    }
+
+    return true;
+}
+
+static
+void addCheckOnlyEodInstruction(RoseProgram &prog) {
+    DEBUG_PRINTF("only at eod\n");
+    const auto *end_inst = prog.end_instruction();
+    prog.add_before_end(make_unique<RoseInstrCheckOnlyEod>(end_inst));
+}
+
+static
+void makeRoleEagerEodReports(const RoseBuildImpl &build,
+                             const map<RoseVertex, left_build_info> &leftfix_info,
+                             bool needs_catchup, RoseVertex v,
+                             RoseProgram &program) {
+    RoseProgram eod_program;
+
+    for (const auto &e : out_edges_range(v, build.g)) {
+        if (canEagerlyReportAtEod(build, e)) {
+            RoseProgram block;
+            makeRoleReports(build, leftfix_info, needs_catchup,
+                            target(e, build.g), block);
+            eod_program.add_block(move(block));
+        }
+    }
+
+    if (eod_program.empty()) {
+        return;
+    }
+
+    if (!onlyAtEod(build, v)) {
+        // The rest of our program wasn't EOD anchored, so we need to guard
+        // these reports with a check.
+        addCheckOnlyEodInstruction(program);
+    }
+
+    program.add_before_end(move(eod_program));
+}
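/* Illustrative sketch (not from this patch): the effect of a CHECK_ONLY_EOD
 * guard is to skip a block of report work unless the scan has reached end of
 * data. EodContext and the callback type are hypothetical stand-ins. */
#include <cstddef>
#include <functional>
#include <vector>

struct EodContext {
    size_t offset; // offset currently being processed
    size_t eod;    // total stream length, known once we are at EOD
};

static void runEodBlock(const EodContext &ctx,
                        const std::vector<std::function<void()>> &reports) {
    if (ctx.offset != ctx.eod) {
        return; // guard failed: jump past the eager EOD report block
    }
    for (const auto &r : reports) {
        r(); // deliver the eagerly-compiled EOD reports
    }
}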
+
+/* Makes a program for a role/vertex given a specific pred/in_edge. */
+static
+RoseProgram makeRoleProgram(const RoseBuildImpl &build,
+                            const map<RoseVertex, left_build_info> &leftfix_info,
+                            const map<suffix_id, u32> &suffixes,
+                            const map<u32, engine_info> &engine_info_by_queue,
+                            const unordered_map<RoseVertex, u32> &roleStateIndices,
+                            ProgramBuild &prog_build, const RoseEdge &e) {
+    const RoseGraph &g = build.g;
+    auto v = target(e, g);
+
+    RoseProgram program;
+
+    // First, add program instructions that enforce preconditions without
+    // effects.
+
+    if (onlyAtEod(build, v)) {
+        addCheckOnlyEodInstruction(program);
+    }
+
+    if (g[e].history == ROSE_ROLE_HISTORY_ANCH) {
+        makeRoleCheckBounds(build, v, e, program);
+    }
+
+    // This role program may be triggered by different predecessors, with
+    // different offset bounds. We must ensure we put this check/set operation
+    // after the bounds check to deal with this case.
+    if (in_degree(v, g) > 1) {
+        assert(!build.isRootSuccessor(v));
+        makeRoleCheckNotHandled(prog_build, v, program);
+    }
+
+    makeRoleLookaround(build, leftfix_info, v, program);
+    makeRoleCheckLeftfix(build, leftfix_info, v, program);
+
+    // Next, we can add program instructions that have effects. This must be
+    // done as a series of blocks, as some of them (like reports) are
+    // escapable.
+
+    RoseProgram effects_block;
+
+    RoseProgram reports_block;
+    makeRoleReports(build, leftfix_info, prog_build.needs_catchup, v,
+                    reports_block);
+    effects_block.add_block(move(reports_block));
+
+    RoseProgram infix_block;
+    makeRoleInfixTriggers(build, leftfix_info, engine_info_by_queue, v,
+                          infix_block);
+    effects_block.add_block(move(infix_block));
+
+    // Note: the SET_GROUPS instruction must come after the infix triggers,
+    // as an infix going dead may switch off groups.
+    RoseProgram groups_block;
+    makeRoleGroups(build.g, prog_build, v, groups_block);
+    effects_block.add_block(move(groups_block));
+
+    RoseProgram suffix_block;
+    makeRoleSuffix(build, suffixes, engine_info_by_queue, v, suffix_block);
+    effects_block.add_block(move(suffix_block));
+
+    RoseProgram state_block;
+    makeRoleSetState(roleStateIndices, v, state_block);
+    effects_block.add_block(move(state_block));
+
+    // Note: EOD eager reports may generate a CHECK_ONLY_EOD instruction (if
+    // the program doesn't have one already).
+    RoseProgram eod_block;
+    makeRoleEagerEodReports(build, leftfix_info, prog_build.needs_catchup, v,
+                            eod_block);
+    effects_block.add_block(move(eod_block));
+
+    /* A 'ghost role' may do nothing if we know that its groups are already
+     * set - in this case we can avoid producing a program at all. */
+    if (effects_block.empty()) {
+        return {};
+    }
+
+    program.add_before_end(move(effects_block));
+    return program;
+}
+
+static
+void makeGroupSquashInstruction(const RoseBuildImpl &build, u32 lit_id,
+                                RoseProgram &prog) {
+    const auto &info = build.literal_info.at(lit_id);
+    if (!info.squash_group) {
+        return;
+    }
+
+    DEBUG_PRINTF("squashes 0x%llx\n", info.group_mask);
+    assert(info.group_mask);
+    /* Note: group_mask is negated. */
+    prog.add_before_end(make_unique<RoseInstrSquashGroups>(~info.group_mask));
+}
+
+namespace {
+struct ProgKey {
+    ProgKey(const RoseProgram &p) : prog(&p) {}
+
+    bool operator==(const ProgKey &b) const {
+        return RoseProgramEquivalence()(*prog, *b.prog);
+    }
+
+    friend size_t hash_value(const ProgKey &a) {
+        return RoseProgramHash()(*a.prog);
+    }
+private:
+    const RoseProgram *prog;
+};
+}
+
+RoseProgram assembleProgramBlocks(vector<RoseProgram> &&blocks_in) {
+    DEBUG_PRINTF("%zu blocks before dedupe\n", blocks_in.size());
+
+    vector<RoseProgram> blocks;
+    blocks.reserve(blocks_in.size()); /* to ensure stable reference for seen */
+
+    unordered_set<ProgKey> seen;
+    for (auto &block : blocks_in) {
+        if (contains(seen, block)) {
+            continue;
+        }
+
+        blocks.push_back(move(block));
+        seen.emplace(blocks.back());
+    }
+
+    DEBUG_PRINTF("%zu blocks after dedupe\n", blocks.size());
+
+    RoseProgram prog;
+    for (auto &block : blocks) {
+        /* If we have multiple blocks from different literals and any of them
+         * squash groups, we will have to add a CLEAR_WORK_DONE instruction to
+         * each literal program block to clear the work_done flag, so that it
+         * is only set if work has actually been done by this block. */
+        if (!prog.empty() && reads_work_done_flag(block)) {
+            RoseProgram clear_block;
+            clear_block.add_before_end(make_unique<RoseInstrClearWorkDone>());
+            prog.add_block(move(clear_block));
+        }
+
+        prog.add_block(move(block));
+    }
+
+    return prog;
+}
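/* Illustrative sketch (not from this patch): de-duplicating heavyweight
 * blocks by storing lightweight pointer keys in a hash set, the same pattern
 * ProgKey enables for RoseProgram. Block and BlockKey are hypothetical. */
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_set>
#include <vector>

struct Block {
    std::string body; // stands in for a sequence of instructions
};

struct BlockKey {
    const Block *b;
    bool operator==(const BlockKey &o) const { return b->body == o.b->body; }
};

struct BlockKeyHash {
    size_t operator()(const BlockKey &k) const {
        return std::hash<std::string>()(k.b->body);
    }
};

// Keep the first copy of each distinct block. References into `kept` stay
// stable because it is reserved up front, mirroring the reserve() above.
static std::vector<Block> dedupe(std::vector<Block> &&in) {
    std::vector<Block> kept;
    kept.reserve(in.size());
    std::unordered_set<BlockKey, BlockKeyHash> seen;
    for (auto &blk : in) {
        if (seen.count(BlockKey{&blk})) {
            continue;
        }
        kept.push_back(std::move(blk));
        seen.insert(BlockKey{&kept.back()});
    }
    return kept;
}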
+
+RoseProgram makeLiteralProgram(const RoseBuildImpl &build,
+                               const map<RoseVertex, left_build_info> &leftfix_info,
+                               const map<suffix_id, u32> &suffixes,
+                               const map<u32, engine_info> &engine_info_by_queue,
+                               const unordered_map<RoseVertex, u32> &roleStateIndices,
+                               ProgramBuild &prog_build, u32 lit_id,
+                               const vector<RoseEdge> &lit_edges,
+                               bool is_anchored_replay_program) {
+    const auto &g = build.g;
+
+    DEBUG_PRINTF("lit id=%u, %zu lit edges\n", lit_id, lit_edges.size());
+
+    // Construct the initial program up front, as its early checks must be
+    // able to jump to end and terminate processing for this literal.
+    auto lit_program = makeLitInitialProgram(build, prog_build, lit_id,
+                                             lit_edges,
+                                             is_anchored_replay_program);
+
+    RoseProgram role_programs;
+
+    // Predecessor state id -> program block.
+    map<u32, RoseProgram> pred_blocks;
+
+    // Construct sparse iter sub-programs.
+    for (const auto &e : lit_edges) {
+        const auto &u = source(e, g);
+        if (build.isAnyStart(u)) {
+            continue; // Root roles are not handled with the sparse iterator.
+        }
+        DEBUG_PRINTF("sparse iter edge (%zu,%zu)\n", g[u].index,
+                     g[target(e, g)].index);
+        assert(contains(roleStateIndices, u));
+        u32 pred_state = roleStateIndices.at(u);
+        auto role_prog = makeRoleProgram(build, leftfix_info, suffixes,
+                                         engine_info_by_queue, roleStateIndices,
+                                         prog_build, e);
+        if (!role_prog.empty()) {
+            pred_blocks[pred_state].add_block(move(role_prog));
+        }
+    }
+
+    // Add blocks to deal with non-root edges (triggered by sparse iterator or
+    // mmbit_isset checks).
+    addPredBlocks(pred_blocks, roleStateIndices.size(), role_programs);
+
+    // Add blocks to handle root roles.
+    for (const auto &e : lit_edges) {
+        const auto &u = source(e, g);
+        if (!build.isAnyStart(u)) {
+            continue;
+        }
+        DEBUG_PRINTF("root edge (%zu,%zu)\n", g[u].index,
+                     g[target(e, g)].index);
+        auto role_prog = makeRoleProgram(build, leftfix_info, suffixes,
+                                         engine_info_by_queue, roleStateIndices,
+                                         prog_build, e);
+        role_programs.add_block(move(role_prog));
+    }
+
+    if (lit_id == build.eod_event_literal_id) {
+        /* Note: does not require the lit initial program. */
+        assert(build.eod_event_literal_id != MO_INVALID_IDX);
+        return role_programs;
+    }
+
+    /* Instructions to run even if a role program bails out. */
+    RoseProgram unconditional_block;
+
+    // The literal may squash groups.
+    makeGroupSquashInstruction(build, lit_id, unconditional_block);
+
+    role_programs.add_block(move(unconditional_block));
+    lit_program.add_before_end(move(role_programs));
+
+    return lit_program;
+}
+
+RoseProgram makeDelayRebuildProgram(const RoseBuildImpl &build,
+                                    ProgramBuild &prog_build,
+                                    const vector<u32> &lit_ids) {
+    assert(!lit_ids.empty());
+    assert(build.cc.streaming);
+
+    vector<RoseProgram> blocks;
+
+    for (const auto &lit_id : lit_ids) {
+        DEBUG_PRINTF("lit_id=%u\n", lit_id);
+        const auto &info = build.literal_info.at(lit_id);
+        if (info.delayed_ids.empty()) {
+            continue; // No delayed IDs, no work to do.
+        }
+
+        RoseProgram prog;
+        if (!build.isDelayed(lit_id)) {
+            makeCheckLiteralInstruction(build.literals.at(lit_id),
+                                        prog_build.longLitLengthThreshold, prog,
+                                        build.cc);
+        }
+
+        makeCheckLitMaskInstruction(build, lit_id, prog);
+        makePushDelayedInstructions(build.literals, prog_build,
+                                    build.literal_info.at(lit_id).delayed_ids,
+                                    prog);
+        blocks.push_back(move(prog));
+    }
+
+    return assembleProgramBlocks(move(blocks));
+}
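/* Illustrative sketch (not from this patch): a simplified model of what
 * PUSH_DELAYED arranges; the real engine manages delayed matches with
 * per-delay slots, but the replay idea is the same. Names are hypothetical. */
#include <cstdint>
#include <functional>
#include <queue>
#include <utility>
#include <vector>

using Delayed = std::pair<uint64_t, uint32_t>; // (due offset, literal id)

struct DelayQueue {
    // Min-heap ordered by due offset, so matches replay in offset order.
    std::priority_queue<Delayed, std::vector<Delayed>,
                        std::greater<Delayed>> q;

    void push(uint64_t match_offset, uint32_t delay, uint32_t id) {
        q.emplace(match_offset + delay, id);
    }

    // Replay all delayed matches that have come due at or before `offset`.
    template <class Fn>
    void drain(uint64_t offset, Fn deliver) {
        while (!q.empty() && q.top().first <= offset) {
            deliver(q.top().first, q.top().second);
            q.pop();
        }
    }
};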
+
+RoseProgram makeEodAnchorProgram(const RoseBuildImpl &build,
+                                 ProgramBuild &prog_build, const RoseEdge &e,
+                                 const bool multiple_preds) {
+    const RoseGraph &g = build.g;
+    const RoseVertex v = target(e, g);
+
+    RoseProgram program;
+
+    if (g[e].history == ROSE_ROLE_HISTORY_ANCH) {
+        makeRoleCheckBounds(build, v, e, program);
+    }
+
+    if (multiple_preds) {
+        // Only necessary when there is more than one pred.
+        makeRoleCheckNotHandled(prog_build, v, program);
+    }
+
+    makeCatchup(build.rm, prog_build.needs_catchup, g[v].reports, program);
+
+    const bool has_som = false;
+    RoseProgram report_block;
+    for (const auto &id : g[v].reports) {
+        makeReport(build, id, has_som, report_block);
+    }
+    program.add_before_end(move(report_block));
+
+    return program;
+}
+
+static
+void makeCatchupMpv(const ReportManager &rm, bool needs_mpv_catchup,
+                    ReportID id, RoseProgram &program) {
+    if (!needs_mpv_catchup) {
+        return;
+    }
+
+    const Report &report = rm.getReport(id);
+    if (report.type == INTERNAL_ROSE_CHAIN) {
+        return;
+    }
+
+    program.add_before_end(make_unique<RoseInstrCatchUpMpv>());
+}
+
+RoseProgram makeReportProgram(const RoseBuildImpl &build,
+                              bool needs_mpv_catchup, ReportID id) {
+    RoseProgram prog;
+
+    makeCatchupMpv(build.rm, needs_mpv_catchup, id, prog);
+
+    const bool has_som = false;
+    makeReport(build, id, has_som, prog);
+
+    return prog;
+}
+
+RoseProgram makeBoundaryProgram(const RoseBuildImpl &build,
+                                const set<ReportID> &reports) {
+    // Note: no CATCHUP instruction is necessary in the boundary case, as we
+    // should always be caught up (and may not even have the resources in
+    // scratch to support it).
+
+    const bool has_som = false;
+    RoseProgram prog;
+    for (const auto &id : reports) {
+        makeReport(build, id, has_som, prog);
+    }
+
+    return prog;
+}
+
+static
+void addPredBlockSingle(u32 pred_state, RoseProgram &pred_block,
+                        RoseProgram &program) {
+    // Prepend an instruction to check that the pred state is on.
+    const auto *end_inst = pred_block.end_instruction();
+    pred_block.insert(begin(pred_block),
+                      make_unique<RoseInstrCheckState>(pred_state, end_inst));
+    program.add_block(move(pred_block));
+}
+
+static
+void addPredBlocksAny(map<u32, RoseProgram> &pred_blocks, u32 num_states,
+                      RoseProgram &program) {
+    RoseProgram sparse_program;
+
+    vector<u32> keys;
+    for (const u32 &key : pred_blocks | map_keys) {
+        keys.push_back(key);
+    }
+
+    const RoseInstruction *end_inst = sparse_program.end_instruction();
+    auto ri = make_unique<RoseInstrSparseIterAny>(num_states, keys, end_inst);
+    sparse_program.add_before_end(move(ri));
+
+    RoseProgram &block = pred_blocks.begin()->second;
+
+    /* We no longer need the check-handled instruction, as all the pred-role
+     * blocks are being collapsed together. */
+    stripCheckHandledInstruction(block);
+
+    sparse_program.add_before_end(move(block));
+    program.add_block(move(sparse_program));
+}
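/* Illustrative sketch (not from this patch): when all predecessor blocks are
 * equivalent, SPARSE_ITER_ANY reduces to "is any of these state bits on?".
 * A simplified model over a plain bitset; names are hypothetical. */
#include <bitset>
#include <cstddef>
#include <cstdint>
#include <vector>

static const size_t kNumStates = 256; // assumed size of the state universe

static bool anyPredOn(const std::bitset<kNumStates> &states,
                      const std::vector<uint32_t> &keys) {
    for (uint32_t k : keys) { // keys: the predecessor state indices
        if (states.test(k)) {
            return true; // any live predecessor lets the collapsed block run
        }
    }
    return false;
}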
+
+static
+void addPredBlocksMulti(map<u32, RoseProgram> &pred_blocks,
+                        u32 num_states, RoseProgram &program) {
+    assert(!pred_blocks.empty());
+
+    RoseProgram sparse_program;
+    const RoseInstruction *end_inst = sparse_program.end_instruction();
+    vector<pair<u32, const RoseInstruction *>> jump_table;
+
+    // BEGIN instruction.
+    auto ri_begin = make_unique<RoseInstrSparseIterBegin>(num_states, end_inst);
+    RoseInstrSparseIterBegin *begin_inst = ri_begin.get();
+    sparse_program.add_before_end(move(ri_begin));
+
+    // NEXT instructions, one per pred program.
+    u32 prev_key = pred_blocks.begin()->first;
+    for (auto it = next(begin(pred_blocks)); it != end(pred_blocks); ++it) {
+        auto ri = make_unique<RoseInstrSparseIterNext>(prev_key, begin_inst,
+                                                       end_inst);
+        sparse_program.add_before_end(move(ri));
+        prev_key = it->first;
+    }
+
+    // Splice in each pred program after its BEGIN/NEXT.
+    auto out_it = begin(sparse_program);
+    for (auto &m : pred_blocks) {
+        u32 key = m.first;
+        RoseProgram &flat_prog = m.second;
+        assert(!flat_prog.empty());
+        const size_t block_len = flat_prog.size() - 1; // without INSTR_END.
+
+        assert(dynamic_cast<const RoseInstrSparseIterBegin *>(out_it->get()) ||
+               dynamic_cast<const RoseInstrSparseIterNext *>(out_it->get()));
+        out_it = sparse_program.insert(++out_it, move(flat_prog));
+
+        // The jump table target for this key is the beginning of the block
+        // we just spliced in.
+        jump_table.emplace_back(key, out_it->get());
+
+        assert(distance(begin(sparse_program), out_it) + block_len <=
+               sparse_program.size());
+        advance(out_it, block_len);
+    }
+
+    // Write the jump table back into the SPARSE_ITER_BEGIN instruction.
+    begin_inst->jump_table = move(jump_table);
+
+    program.add_block(move(sparse_program));
+}
+
+void addPredBlocks(map<u32, RoseProgram> &pred_blocks, u32 num_states,
+                   RoseProgram &program) {
+    // Trim empty blocks, if any exist.
+    for (auto it = pred_blocks.begin(); it != pred_blocks.end();) {
+        if (it->second.empty()) {
+            it = pred_blocks.erase(it);
+        } else {
+            ++it;
+        }
+    }
+
+    const size_t num_preds = pred_blocks.size();
+    if (num_preds == 0) {
+        return;
+    }
+
+    if (num_preds == 1) {
+        const auto head = pred_blocks.begin();
+        addPredBlockSingle(head->first, head->second, program);
+        return;
+    }
+
+    // First, see if all our blocks are equivalent, in which case we can
+    // collapse them down into one.
+    const auto &blocks = pred_blocks | map_values;
+    if (all_of(begin(blocks), end(blocks), [&](const RoseProgram &block) {
+            return RoseProgramEquivalence()(*begin(blocks), block);
+        })) {
+        DEBUG_PRINTF("all blocks equiv\n");
+        addPredBlocksAny(pred_blocks, num_states, program);
+        return;
+    }
+
+    addPredBlocksMulti(pred_blocks, num_states, program);
+}
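/* Illustrative sketch (not from this patch): the multi-pred case behaves like
 * a jump table from live predecessor state to its spliced-in handler block.
 * Handler and the map layout are hypothetical stand-ins. */
#include <cstdint>
#include <functional>
#include <map>
#include <vector>

using Handler = std::function<void()>;

// Run the handler of every live key in ascending order; the sparse iterator
// walks the live states and jumps through the table in the same way.
static void dispatchLive(const std::map<uint32_t, Handler> &jump_table,
                         const std::vector<uint32_t> &live_keys) {
    for (uint32_t k : live_keys) {
        auto it = jump_table.find(k);
        if (it != jump_table.end()) {
            it->second();
        }
    }
}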
+
+void applyFinalSpecialisation(RoseProgram &program) {
+    assert(!program.empty());
+    assert(program.back().code() == ROSE_INSTR_END);
+    if (program.size() < 2) {
+        return;
+    }
+
+    /* Replace the second-to-last instruction (before END) with a one-shot
+     * specialisation if available. */
+    auto it = next(program.rbegin());
+    if (auto *ri = dynamic_cast<const RoseInstrReport *>(it->get())) {
+        DEBUG_PRINTF("replacing REPORT with FINAL_REPORT\n");
+        program.replace(it, make_unique<RoseInstrFinalReport>(
+                                ri->onmatch, ri->offset_adjust));
+    }
+}
+
+void recordLongLiterals(vector<ue2_case_string> &longLiterals,
+                        const RoseProgram &program) {
+    for (const auto &ri : program) {
+        if (const auto *ri_check =
+                dynamic_cast<const RoseInstrCheckLongLit *>(ri.get())) {
+            DEBUG_PRINTF("found CHECK_LONG_LIT for string '%s'\n",
+                         escapeString(ri_check->literal).c_str());
+            longLiterals.emplace_back(ri_check->literal, false);
+            continue;
+        }
+        if (const auto *ri_check =
+                dynamic_cast<const RoseInstrCheckLongLitNocase *>(ri.get())) {
+            DEBUG_PRINTF("found CHECK_LONG_LIT_NOCASE for string '%s'\n",
+                         escapeString(ri_check->literal).c_str());
+            longLiterals.emplace_back(ri_check->literal, true);
+        }
+    }
+}
+
+void recordResources(RoseResources &resources, const RoseProgram &program) {
+    for (const auto &ri : program) {
+        switch (ri->code()) {
+        case ROSE_INSTR_TRIGGER_SUFFIX:
+            resources.has_suffixes = true;
+            break;
+        case ROSE_INSTR_TRIGGER_INFIX:
+        case ROSE_INSTR_CHECK_INFIX:
+        case ROSE_INSTR_CHECK_PREFIX:
+        case ROSE_INSTR_SOM_LEFTFIX:
+            resources.has_leftfixes = true;
+            break;
+        case ROSE_INSTR_SET_STATE:
+        case ROSE_INSTR_CHECK_STATE:
+        case ROSE_INSTR_SPARSE_ITER_BEGIN:
+        case ROSE_INSTR_SPARSE_ITER_NEXT:
+            resources.has_states = true;
+            break;
+        case ROSE_INSTR_CHECK_GROUPS:
+            resources.checks_groups = true;
+            break;
+        case ROSE_INSTR_PUSH_DELAYED:
+            resources.has_lit_delay = true;
+            break;
+        case ROSE_INSTR_CHECK_LONG_LIT:
+        case ROSE_INSTR_CHECK_LONG_LIT_NOCASE:
+            resources.has_lit_check = true;
+            break;
+        default:
+            break;
+        }
+    }
+}
+
 } // namespace ue2
diff --git a/src/rose/rose_build_program.h b/src/rose/rose_build_program.h
index 0c725b46d..8758ef64a 100644
--- a/src/rose/rose_build_program.h
+++ b/src/rose/rose_build_program.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -31,1759 +31,21 @@
 #include "rose_build_impl.h"
 #include "rose_program.h"
-#include "som/som_operation.h"
-#include "util/alloc.h"
-#include "util/container.h"
+#include "util/bytecode_ptr.h"
 #include "util/hash.h"
 #include "util/make_unique.h"
 #include "util/ue2_containers.h"
-#include "util/ue2string.h"
-#include <array>
-#include <cstring>
 #include <vector>
-#include <boost/functional/hash/hash_fwd.hpp>
 #include <boost/range/adaptor/map.hpp>
 
 namespace ue2 {
 
+struct LookEntry;
 class RoseEngineBlob;
-
-/**
- * \brief Abstract base class representing a single Rose instruction.
- */
-class RoseInstruction {
-public:
-    virtual ~RoseInstruction();
-
-    /** \brief Opcode used for the instruction in the bytecode. */
-    virtual RoseInstructionCode code() const = 0;
-
-    /**
-     * \brief Simple hash used for program equivalence.
-     *
-     * Note that pointers (jumps, for example) should not be used when
-     * calculating the hash: they will be converted to instruction offsets when
-     * compared later.
-     */
-    virtual size_t hash() const = 0;
-
-    /** \brief Length of the bytecode instruction in bytes. */
-    virtual size_t byte_length() const = 0;
-
-    using OffsetMap = unordered_map<const RoseInstruction *, u32>;
-
-    /**
-     * \brief Writes a concrete implementation of this instruction.
-     *
-     * Other data that this instruction depends on is written directly into the
-     * blob, while the instruction structure itself (of size given by
-     * the byte_length() function) is written to dest.
-     */
-    virtual void write(void *dest, RoseEngineBlob &blob,
-                       const OffsetMap &offset_map) const = 0;
-
-    /**
-     * \brief Update a target pointer.
-     *
-     * If this instruction contains any reference to the old target, replace it
-     * with the new one.
-     */
-    virtual void update_target(const RoseInstruction *old_target,
-                               const RoseInstruction *new_target) = 0;
-
-    /**
-     * \brief True if these instructions are equivalent within their own
-     * programs.
-     *
-     * Checks that any pointers to other instructions point to the same
-     * offsets.
-     */
-    bool equiv(const RoseInstruction &other, const OffsetMap &offsets,
-               const OffsetMap &other_offsets) const {
-        return equiv_impl(other, offsets, other_offsets);
-    }
-
-private:
-    virtual bool equiv_impl(const RoseInstruction &other,
-                            const OffsetMap &offsets,
-                            const OffsetMap &other_offsets) const = 0;
-};
-
-/**
- * \brief Templated implementation class to handle boring boilerplate code.
- */
-template<RoseInstructionCode Opcode, class ImplType, class RoseInstrType>
-class RoseInstrBase : public RoseInstruction {
-protected:
-    static constexpr RoseInstructionCode opcode = Opcode;
-    using impl_type = ImplType;
-
-public:
-    RoseInstructionCode code() const override { return opcode; }
-
-    size_t byte_length() const override {
-        return sizeof(impl_type);
-    }
-
-    /**
-     * Note: this implementation simply zeroes the destination region and
-     * writes in the correct opcode. This is sufficient for trivial
-     * instructions, but instructions with data members will want to override
-     * it.
-     */
-    void write(void *dest, RoseEngineBlob &,
-               const RoseInstruction::OffsetMap &) const override {
-        assert(dest != nullptr);
-        assert(ISALIGNED_N(dest, ROSE_INSTR_MIN_ALIGN));
-
-        impl_type *inst = static_cast<impl_type *>(dest);
-        memset(inst, 0, sizeof(impl_type));
-        inst->code = verify_u8(opcode);
-    }
-
-private:
-    bool equiv_impl(const RoseInstruction &other, const OffsetMap &offsets,
-                    const OffsetMap &other_offsets) const override {
-        const auto *ri_that = dynamic_cast<const RoseInstrType *>(&other);
-        if (!ri_that) {
-            return false;
-        }
-        const auto *ri_this = dynamic_cast<const RoseInstrType *>(this);
-        assert(ri_this);
-        return ri_this->equiv_to(*ri_that, offsets, other_offsets);
-    }
-};
-
-/**
- * \brief Refinement of RoseInstrBase to use for instructions that have
- * just a single target member, called "target".
- */
-template<RoseInstructionCode Opcode, class ImplType, class RoseInstrType>
-class RoseInstrBaseOneTarget
-    : public RoseInstrBase<Opcode, ImplType, RoseInstrType> {
-public:
-    void update_target(const RoseInstruction *old_target,
-                       const RoseInstruction *new_target) override {
-        RoseInstrType *ri = dynamic_cast<RoseInstrType *>(this);
-        assert(ri);
-        if (ri->target == old_target) {
-            ri->target = new_target;
-        }
-    }
-};
-
-/**
- * \brief Refinement of RoseInstrBase to use for instructions that have no
- * targets.
- */
-template<RoseInstructionCode Opcode, class ImplType, class RoseInstrType>
-class RoseInstrBaseNoTargets
-    : public RoseInstrBase<Opcode, ImplType, RoseInstrType> {
-public:
-    void update_target(const RoseInstruction *,
-                       const RoseInstruction *) override {}
-};
-
-/**
- * \brief Refinement of RoseInstrBaseNoTargets to use for instructions that
- * have no members at all, just an opcode.
- */
-template<RoseInstructionCode Opcode, class ImplType, class RoseInstrType>
-class RoseInstrBaseTrivial
-    : public RoseInstrBaseNoTargets<Opcode, ImplType, RoseInstrType> {
-public:
-    virtual bool operator==(const RoseInstrType &) const { return true; }
-
-    size_t hash() const override {
-        return boost::hash_value(static_cast<int>(Opcode));
-    }
-
-    bool equiv_to(const RoseInstrType &, const RoseInstruction::OffsetMap &,
-                  const RoseInstruction::OffsetMap &) const {
-        return true;
-    }
-};
-
-////
-//// Concrete implementation classes start here.
-//// - -class RoseInstrAnchoredDelay - : public RoseInstrBaseOneTarget { -public: - rose_group groups; - const RoseInstruction *target; - - RoseInstrAnchoredDelay(rose_group groups_in, - const RoseInstruction *target_in) - : groups(groups_in), target(target_in) {} - - bool operator==(const RoseInstrAnchoredDelay &ri) const { - return groups == ri.groups && target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), groups); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrAnchoredDelay &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return groups == ri.groups && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckLitEarly - : public RoseInstrBaseNoTargets { -public: - u32 min_offset; - - explicit RoseInstrCheckLitEarly(u32 min) : min_offset(min) {} - - bool operator==(const RoseInstrCheckLitEarly &ri) const { - return min_offset == ri.min_offset; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), min_offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckLitEarly &ri, const OffsetMap &, - const OffsetMap &) const { - return min_offset == ri.min_offset; - } -}; - -class RoseInstrCheckGroups - : public RoseInstrBaseNoTargets { -public: - rose_group groups; - - explicit RoseInstrCheckGroups(rose_group groups_in) : groups(groups_in) {} - - bool operator==(const RoseInstrCheckGroups &ri) const { - return groups == ri.groups; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), groups); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckGroups &ri, const OffsetMap &, - const OffsetMap &) const { - return groups == ri.groups; - } -}; - -class RoseInstrCheckOnlyEod - : public RoseInstrBaseOneTarget { -public: - const RoseInstruction *target; - - explicit RoseInstrCheckOnlyEod(const RoseInstruction *target_in) - : target(target_in) {} - - bool operator==(const RoseInstrCheckOnlyEod &ri) const { - return target == ri.target; - } - - size_t hash() const override { - return boost::hash_value(static_cast(opcode)); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckOnlyEod &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckBounds - : public RoseInstrBaseOneTarget { -public: - u64a min_bound; - u64a max_bound; - const RoseInstruction *target; - - RoseInstrCheckBounds(u64a min, u64a max, const RoseInstruction *target_in) - : min_bound(min), max_bound(max), target(target_in) {} - - bool operator==(const RoseInstrCheckBounds &ri) const { - return min_bound == ri.min_bound && max_bound == ri.max_bound && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), min_bound, max_bound); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckBounds &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return min_bound == ri.min_bound && max_bound == ri.max_bound && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckNotHandled - : 
public RoseInstrBaseOneTarget { -public: - u32 key; - const RoseInstruction *target; - - RoseInstrCheckNotHandled(u32 key_in, const RoseInstruction *target_in) - : key(key_in), target(target_in) {} - - bool operator==(const RoseInstrCheckNotHandled &ri) const { - return key == ri.key && target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), key); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckNotHandled &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return key == ri.key && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckSingleLookaround - : public RoseInstrBaseOneTarget { -public: - s8 offset; - u32 reach_index; - const RoseInstruction *target; - - RoseInstrCheckSingleLookaround(s8 offset_in, u32 reach_index_in, - const RoseInstruction *target_in) - : offset(offset_in), reach_index(reach_index_in), target(target_in) {} - - bool operator==(const RoseInstrCheckSingleLookaround &ri) const { - return offset == ri.offset && reach_index == ri.reach_index && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), offset, reach_index); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckSingleLookaround &ri, - const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return offset == ri.offset && reach_index == ri.reach_index && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckLookaround - : public RoseInstrBaseOneTarget { -public: - u32 index; - u32 count; - const RoseInstruction *target; - - RoseInstrCheckLookaround(u32 index_in, u32 count_in, - const RoseInstruction *target_in) - : index(index_in), count(count_in), target(target_in) {} - - bool operator==(const RoseInstrCheckLookaround &ri) const { - return index == ri.index && count == ri.count && target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), index, count); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckLookaround &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return index == ri.index && count == ri.count && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckMask - : public RoseInstrBaseOneTarget { -public: - u64a and_mask; - u64a cmp_mask; - u64a neg_mask; - s32 offset; - const RoseInstruction *target; - - RoseInstrCheckMask(u64a and_mask_in, u64a cmp_mask_in, u64a neg_mask_in, - s32 offset_in, const RoseInstruction *target_in) - : and_mask(and_mask_in), cmp_mask(cmp_mask_in), neg_mask(neg_mask_in), - offset(offset_in), target(target_in) {} - - bool operator==(const RoseInstrCheckMask &ri) const { - return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), and_mask, cmp_mask, neg_mask, - offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckMask &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && - neg_mask == ri.neg_mask && offset == ri.offset 
&& - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckMask32 - : public RoseInstrBaseOneTarget { -public: - std::array and_mask; - std::array cmp_mask; - u32 neg_mask; - s32 offset; - const RoseInstruction *target; - - RoseInstrCheckMask32(std::array and_mask_in, - std::array cmp_mask_in, u32 neg_mask_in, - s32 offset_in, const RoseInstruction *target_in) - : and_mask(move(and_mask_in)), cmp_mask(move(cmp_mask_in)), - neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} - - bool operator==(const RoseInstrCheckMask32 &ri) const { - return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), and_mask, cmp_mask, neg_mask, - offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckMask32 &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckByte - : public RoseInstrBaseOneTarget { -public: - u8 and_mask; - u8 cmp_mask; - u8 negation; - s32 offset; - const RoseInstruction *target; - - RoseInstrCheckByte(u8 and_mask_in, u8 cmp_mask_in, u8 negation_in, - s32 offset_in, const RoseInstruction *target_in) - : and_mask(and_mask_in), cmp_mask(cmp_mask_in), negation(negation_in), - offset(offset_in), target(target_in) {} - - bool operator==(const RoseInstrCheckByte &ri) const { - return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && - negation == ri.negation && offset == ri.offset && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), and_mask, cmp_mask, negation, - offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckByte &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return and_mask == ri.and_mask && cmp_mask == ri.cmp_mask && - negation == ri.negation && offset == ri.offset && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckShufti16x8 - : public RoseInstrBaseOneTarget { -public: - std::array nib_mask; - std::array bucket_select_mask; - u32 neg_mask; - s32 offset; - const RoseInstruction *target; - - RoseInstrCheckShufti16x8(std::array nib_mask_in, - std::array bucket_select_mask_in, - u32 neg_mask_in, s32 offset_in, - const RoseInstruction *target_in) - : nib_mask(move(nib_mask_in)), - bucket_select_mask(move(bucket_select_mask_in)), - neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} - - bool operator==(const RoseInstrCheckShufti16x8 &ri) const { - return nib_mask == ri.nib_mask && - bucket_select_mask == ri.bucket_select_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), nib_mask, - bucket_select_mask, neg_mask, offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckShufti16x8 &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return nib_mask == ri.nib_mask && - bucket_select_mask == ri.bucket_select_mask && - neg_mask == ri.neg_mask && offset == ri.offset 
&& - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckShufti32x8 - : public RoseInstrBaseOneTarget { -public: - std::array hi_mask; - std::array lo_mask; - std::array bucket_select_mask; - u32 neg_mask; - s32 offset; - const RoseInstruction *target; - - RoseInstrCheckShufti32x8(std::array hi_mask_in, - std::array lo_mask_in, - std::array bucket_select_mask_in, - u32 neg_mask_in, s32 offset_in, - const RoseInstruction *target_in) - : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), - bucket_select_mask(move(bucket_select_mask_in)), - neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} - - bool operator==(const RoseInstrCheckShufti32x8 &ri) const { - return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && - bucket_select_mask == ri.bucket_select_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask, neg_mask, offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckShufti32x8 &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && - bucket_select_mask == ri.bucket_select_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckShufti16x16 - : public RoseInstrBaseOneTarget { -public: - std::array hi_mask; - std::array lo_mask; - std::array bucket_select_mask; - u32 neg_mask; - s32 offset; - const RoseInstruction *target; - - RoseInstrCheckShufti16x16(std::array hi_mask_in, - std::array lo_mask_in, - std::array bucket_select_mask_in, - u32 neg_mask_in, s32 offset_in, - const RoseInstruction *target_in) - : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), - bucket_select_mask(move(bucket_select_mask_in)), - neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} - - bool operator==(const RoseInstrCheckShufti16x16 &ri) const { - return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && - bucket_select_mask == ri.bucket_select_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask, neg_mask, offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckShufti16x16 &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && - bucket_select_mask == ri.bucket_select_mask && - neg_mask == ri.neg_mask && offset == ri.offset && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckShufti32x16 - : public RoseInstrBaseOneTarget { -public: - std::array hi_mask; - std::array lo_mask; - std::array bucket_select_mask_hi; - std::array bucket_select_mask_lo; - u32 neg_mask; - s32 offset; - const RoseInstruction *target; - - RoseInstrCheckShufti32x16(std::array hi_mask_in, - std::array lo_mask_in, - std::array bucket_select_mask_hi_in, - std::array bucket_select_mask_lo_in, - u32 neg_mask_in, s32 offset_in, - const RoseInstruction *target_in) - : hi_mask(move(hi_mask_in)), lo_mask(move(lo_mask_in)), - bucket_select_mask_hi(move(bucket_select_mask_hi_in)), - bucket_select_mask_lo(move(bucket_select_mask_lo_in)), - 
neg_mask(neg_mask_in), offset(offset_in), target(target_in) {} - - bool operator==(const RoseInstrCheckShufti32x16 &ri) const { - return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && - bucket_select_mask_hi == ri.bucket_select_mask_hi && - bucket_select_mask_lo == ri.bucket_select_mask_lo && - neg_mask == ri.neg_mask && offset == ri.offset && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), hi_mask, lo_mask, - bucket_select_mask_hi, bucket_select_mask_lo, - neg_mask, offset); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckShufti32x16 &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return hi_mask == ri.hi_mask && lo_mask == ri.lo_mask && - bucket_select_mask_hi == ri.bucket_select_mask_hi && - bucket_select_mask_lo == ri.bucket_select_mask_lo && - neg_mask == ri.neg_mask && offset == ri.offset && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckInfix - : public RoseInstrBaseOneTarget { -public: - u32 queue; - u32 lag; - ReportID report; - const RoseInstruction *target; - - RoseInstrCheckInfix(u32 queue_in, u32 lag_in, ReportID report_in, - const RoseInstruction *target_in) - : queue(queue_in), lag(lag_in), report(report_in), target(target_in) {} - - bool operator==(const RoseInstrCheckInfix &ri) const { - return queue == ri.queue && lag == ri.lag && report == ri.report && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), queue, lag, report); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckInfix &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return queue == ri.queue && lag == ri.lag && report == ri.report && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrCheckPrefix - : public RoseInstrBaseOneTarget { -public: - u32 queue; - u32 lag; - ReportID report; - const RoseInstruction *target; - - RoseInstrCheckPrefix(u32 queue_in, u32 lag_in, ReportID report_in, - const RoseInstruction *target_in) - : queue(queue_in), lag(lag_in), report(report_in), target(target_in) {} - - bool operator==(const RoseInstrCheckPrefix &ri) const { - return queue == ri.queue && lag == ri.lag && report == ri.report && - target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), queue, lag, report); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrCheckPrefix &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return queue == ri.queue && lag == ri.lag && report == ri.report && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrPushDelayed - : public RoseInstrBaseNoTargets { -public: - u8 delay; - u32 index; - - RoseInstrPushDelayed(u8 delay_in, u32 index_in) - : delay(delay_in), index(index_in) {} - - bool operator==(const RoseInstrPushDelayed &ri) const { - return delay == ri.delay && index == ri.index; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), delay, index); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrPushDelayed &ri, const OffsetMap &, - const OffsetMap &) const { - return delay == ri.delay && index == ri.index; - } -}; - 
-class RoseInstrRecordAnchored - : public RoseInstrBaseNoTargets { -public: - u32 id; - - explicit RoseInstrRecordAnchored(u32 id_in) : id(id_in) {} - - bool operator==(const RoseInstrRecordAnchored &ri) const { - return id == ri.id; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), id); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrRecordAnchored &ri, const OffsetMap &, - const OffsetMap &) const { - return id == ri.id; - } -}; - -class RoseInstrCatchUp - : public RoseInstrBaseTrivial { -public: - ~RoseInstrCatchUp() override; -}; - -class RoseInstrCatchUpMpv - : public RoseInstrBaseTrivial { -public: - ~RoseInstrCatchUpMpv() override; -}; - -class RoseInstrSomAdjust - : public RoseInstrBaseNoTargets { -public: - u32 distance; - - explicit RoseInstrSomAdjust(u32 distance_in) : distance(distance_in) {} - - bool operator==(const RoseInstrSomAdjust &ri) const { - return distance == ri.distance; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), distance); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrSomAdjust &ri, const OffsetMap &, - const OffsetMap &) const { - return distance == ri.distance; - } -}; - -class RoseInstrSomLeftfix - : public RoseInstrBaseNoTargets { -public: - u32 queue; - u32 lag; - - RoseInstrSomLeftfix(u32 queue_in, u32 lag_in) - : queue(queue_in), lag(lag_in) {} - - bool operator==(const RoseInstrSomLeftfix &ri) const { - return queue == ri.queue && lag == ri.lag; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), queue, lag); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrSomLeftfix &ri, const OffsetMap &, - const OffsetMap &) const { - return queue == ri.queue && lag == ri.lag; - } -}; - -class RoseInstrSomFromReport - : public RoseInstrBaseNoTargets { -public: - som_operation som; - - RoseInstrSomFromReport() { - std::memset(&som, 0, sizeof(som)); - } - - bool operator==(const RoseInstrSomFromReport &ri) const { - return std::memcmp(&som, &ri.som, sizeof(som)) == 0; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), som.type, som.onmatch); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrSomFromReport &ri, const OffsetMap &, - const OffsetMap &) const { - return std::memcmp(&som, &ri.som, sizeof(som)) == 0; - } -}; - -class RoseInstrSomZero - : public RoseInstrBaseTrivial { -public: - ~RoseInstrSomZero() override; -}; - -class RoseInstrTriggerInfix - : public RoseInstrBaseNoTargets { -public: - u8 cancel; - u32 queue; - u32 event; - - RoseInstrTriggerInfix(u8 cancel_in, u32 queue_in, u32 event_in) - : cancel(cancel_in), queue(queue_in), event(event_in) {} - - bool operator==(const RoseInstrTriggerInfix &ri) const { - return cancel == ri.cancel && queue == ri.queue && event == ri.event; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), cancel, queue, event); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrTriggerInfix &ri, const OffsetMap &, - const OffsetMap &) const { - return cancel == ri.cancel && queue == ri.queue && event == ri.event; - } -}; - -class RoseInstrTriggerSuffix - : public RoseInstrBaseNoTargets 
{ -public: - u32 queue; - u32 event; - - RoseInstrTriggerSuffix(u32 queue_in, u32 event_in) - : queue(queue_in), event(event_in) {} - - bool operator==(const RoseInstrTriggerSuffix &ri) const { - return queue == ri.queue && event == ri.event; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), queue, event); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrTriggerSuffix &ri, const OffsetMap &, - const OffsetMap &) const { - return queue == ri.queue && event == ri.event; - } -}; - -class RoseInstrDedupe - : public RoseInstrBaseOneTarget { -public: - u8 quash_som; - u32 dkey; - s32 offset_adjust; - const RoseInstruction *target; - - RoseInstrDedupe(u8 quash_som_in, u32 dkey_in, s32 offset_adjust_in, - const RoseInstruction *target_in) - : quash_som(quash_som_in), dkey(dkey_in), - offset_adjust(offset_adjust_in), target(target_in) {} - - bool operator==(const RoseInstrDedupe &ri) const { - return quash_som == ri.quash_som && dkey == ri.dkey && - offset_adjust == ri.offset_adjust && target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), quash_som, dkey, - offset_adjust); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrDedupe &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return quash_som == ri.quash_som && dkey == ri.dkey && - offset_adjust == ri.offset_adjust && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrDedupeSom - : public RoseInstrBaseOneTarget { -public: - u8 quash_som; - u32 dkey; - s32 offset_adjust; - const RoseInstruction *target; - - RoseInstrDedupeSom(u8 quash_som_in, u32 dkey_in, s32 offset_adjust_in, - const RoseInstruction *target_in) - : quash_som(quash_som_in), dkey(dkey_in), - offset_adjust(offset_adjust_in), target(target_in) {} - - bool operator==(const RoseInstrDedupeSom &ri) const { - return quash_som == ri.quash_som && dkey == ri.dkey && - offset_adjust == ri.offset_adjust && target == ri.target; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), quash_som, dkey, - offset_adjust); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrDedupeSom &ri, const OffsetMap &offsets, - const OffsetMap &other_offsets) const { - return quash_som == ri.quash_som && dkey == ri.dkey && - offset_adjust == ri.offset_adjust && - offsets.at(target) == other_offsets.at(ri.target); - } -}; - -class RoseInstrReportChain - : public RoseInstrBaseNoTargets { -public: - u32 event; - u64a top_squash_distance; - - RoseInstrReportChain(u32 event_in, u32 top_squash_distance_in) - : event(event_in), top_squash_distance(top_squash_distance_in) {} - - bool operator==(const RoseInstrReportChain &ri) const { - return event == ri.event && - top_squash_distance == ri.top_squash_distance; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), event, top_squash_distance); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrReportChain &ri, const OffsetMap &, - const OffsetMap &) const { - return event == ri.event && - top_squash_distance == ri.top_squash_distance; - } -}; - -class RoseInstrReportSomInt - : public RoseInstrBaseNoTargets { -public: - som_operation som; - - RoseInstrReportSomInt() { - 
std::memset(&som, 0, sizeof(som)); - } - - bool operator==(const RoseInstrReportSomInt &ri) const { - return std::memcmp(&som, &ri.som, sizeof(som)) == 0; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), som.type, som.onmatch); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrReportSomInt &ri, const OffsetMap &, - const OffsetMap &) const { - return std::memcmp(&som, &ri.som, sizeof(som)) == 0; - } -}; - -class RoseInstrReportSomAware - : public RoseInstrBaseNoTargets { -public: - som_operation som; - - RoseInstrReportSomAware() { - std::memset(&som, 0, sizeof(som)); - } - - bool operator==(const RoseInstrReportSomAware &ri) const { - return std::memcmp(&som, &ri.som, sizeof(som)) == 0; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), som.type, som.onmatch); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrReportSomAware &ri, const OffsetMap &, - const OffsetMap &) const { - return std::memcmp(&som, &ri.som, sizeof(som)) == 0; - } -}; - -class RoseInstrReport - : public RoseInstrBaseNoTargets { -public: - ReportID onmatch; - s32 offset_adjust; - - RoseInstrReport(ReportID onmatch_in, s32 offset_adjust_in) - : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {} - - bool operator==(const RoseInstrReport &ri) const { - return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrReport &ri, const OffsetMap &, - const OffsetMap &) const { - return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust; - } -}; - -class RoseInstrReportExhaust - : public RoseInstrBaseNoTargets { -public: - ReportID onmatch; - s32 offset_adjust; - u32 ekey; - - RoseInstrReportExhaust(ReportID onmatch_in, s32 offset_adjust_in, - u32 ekey_in) - : onmatch(onmatch_in), offset_adjust(offset_adjust_in), ekey(ekey_in) {} - - bool operator==(const RoseInstrReportExhaust &ri) const { - return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust && - ekey == ri.ekey; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust, ekey); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrReportExhaust &ri, const OffsetMap &, - const OffsetMap &) const { - return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust && - ekey == ri.ekey; - } -}; - -class RoseInstrReportSom - : public RoseInstrBaseNoTargets { -public: - ReportID onmatch; - s32 offset_adjust; - - RoseInstrReportSom(ReportID onmatch_in, s32 offset_adjust_in) - : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {} - - bool operator==(const RoseInstrReportSom &ri) const { - return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust; - } - - size_t hash() const override { - return hash_all(static_cast(opcode), onmatch, offset_adjust); - } - - void write(void *dest, RoseEngineBlob &blob, - const OffsetMap &offset_map) const override; - - bool equiv_to(const RoseInstrReportSom &ri, const OffsetMap &, - const OffsetMap &) const { - return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust; - } -}; - -class RoseInstrReportSomExhaust - : public 
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_REPORT_SOM_EXHAUST,
-                                    ROSE_STRUCT_REPORT_SOM_EXHAUST,
-                                    RoseInstrReportSomExhaust> {
-public:
-    ReportID onmatch;
-    s32 offset_adjust;
-    u32 ekey;
-
-    RoseInstrReportSomExhaust(ReportID onmatch_in, s32 offset_adjust_in,
-                              u32 ekey_in)
-        : onmatch(onmatch_in), offset_adjust(offset_adjust_in), ekey(ekey_in) {}
-
-    bool operator==(const RoseInstrReportSomExhaust &ri) const {
-        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
-               ekey == ri.ekey;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), onmatch, offset_adjust, ekey);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrReportSomExhaust &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
-               ekey == ri.ekey;
-    }
-};
-
-class RoseInstrDedupeAndReport
-    : public RoseInstrBaseOneTarget<ROSE_INSTR_DEDUPE_AND_REPORT,
-                                    ROSE_STRUCT_DEDUPE_AND_REPORT,
-                                    RoseInstrDedupeAndReport> {
-public:
-    u8 quash_som;
-    u32 dkey;
-    ReportID onmatch;
-    s32 offset_adjust;
-    const RoseInstruction *target;
-
-    RoseInstrDedupeAndReport(u8 quash_som_in, u32 dkey_in, ReportID onmatch_in,
-                             s32 offset_adjust_in,
-                             const RoseInstruction *target_in)
-        : quash_som(quash_som_in), dkey(dkey_in), onmatch(onmatch_in),
-          offset_adjust(offset_adjust_in), target(target_in) {}
-
-    bool operator==(const RoseInstrDedupeAndReport &ri) const {
-        return quash_som == ri.quash_som && dkey == ri.dkey &&
-               onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
-               target == ri.target;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), quash_som, dkey, onmatch,
-                        offset_adjust);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrDedupeAndReport &ri, const OffsetMap &offsets,
-                  const OffsetMap &other_offsets) const {
-        return quash_som == ri.quash_som && dkey == ri.dkey &&
-               onmatch == ri.onmatch && offset_adjust == ri.offset_adjust &&
-               offsets.at(target) == other_offsets.at(ri.target);
-    }
-};
-
-class RoseInstrFinalReport
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_FINAL_REPORT,
-                                    ROSE_STRUCT_FINAL_REPORT,
-                                    RoseInstrFinalReport> {
-public:
-    ReportID onmatch;
-    s32 offset_adjust;
-
-    RoseInstrFinalReport(ReportID onmatch_in, s32 offset_adjust_in)
-        : onmatch(onmatch_in), offset_adjust(offset_adjust_in) {}
-
-    bool operator==(const RoseInstrFinalReport &ri) const {
-        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), onmatch, offset_adjust);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrFinalReport &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return onmatch == ri.onmatch && offset_adjust == ri.offset_adjust;
-    }
-};
-
-class RoseInstrCheckExhausted
-    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_EXHAUSTED,
-                                    ROSE_STRUCT_CHECK_EXHAUSTED,
-                                    RoseInstrCheckExhausted> {
-public:
-    u32 ekey;
-    const RoseInstruction *target;
-
-    RoseInstrCheckExhausted(u32 ekey_in, const RoseInstruction *target_in)
-        : ekey(ekey_in), target(target_in) {}
-
-    bool operator==(const RoseInstrCheckExhausted &ri) const {
-        return ekey == ri.ekey && target == ri.target;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), ekey);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrCheckExhausted &ri, const OffsetMap &offsets,
-                  const OffsetMap &other_offsets) const {
-        return ekey == ri.ekey &&
-               offsets.at(target) == other_offsets.at(ri.target);
-    }
-};
-
-class RoseInstrCheckMinLength
-    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_MIN_LENGTH,
-                                    ROSE_STRUCT_CHECK_MIN_LENGTH,
-                                    RoseInstrCheckMinLength> {
-public:
-    s32 end_adj;
-    u64a min_length;
-    const RoseInstruction *target;
-
-    RoseInstrCheckMinLength(s32 end_adj_in, u64a min_length_in,
-                            const RoseInstruction *target_in)
-        : end_adj(end_adj_in), min_length(min_length_in), target(target_in) {}
-
-    bool operator==(const RoseInstrCheckMinLength &ri) const {
-        return end_adj == ri.end_adj && min_length == ri.min_length &&
-               target == ri.target;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), end_adj, min_length);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrCheckMinLength &ri, const OffsetMap &offsets,
-                  const OffsetMap &other_offsets) const {
-        return end_adj == ri.end_adj && min_length == ri.min_length &&
-               offsets.at(target) == other_offsets.at(ri.target);
-    }
-};
-
-class RoseInstrSetState
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_SET_STATE,
-                                    ROSE_STRUCT_SET_STATE,
-                                    RoseInstrSetState> {
-public:
-    u32 index;
-
-    explicit RoseInstrSetState(u32 index_in) : index(index_in) {}
-
-    bool operator==(const RoseInstrSetState &ri) const {
-        return index == ri.index;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), index);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrSetState &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return index == ri.index;
-    }
-};
-
-class RoseInstrSetGroups
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_SET_GROUPS,
-                                    ROSE_STRUCT_SET_GROUPS,
-                                    RoseInstrSetGroups> {
-public:
-    rose_group groups;
-
-    explicit RoseInstrSetGroups(rose_group groups_in) : groups(groups_in) {}
-
-    bool operator==(const RoseInstrSetGroups &ri) const {
-        return groups == ri.groups;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), groups);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrSetGroups &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return groups == ri.groups;
-    }
-};
-
-class RoseInstrSquashGroups
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_SQUASH_GROUPS,
-                                    ROSE_STRUCT_SQUASH_GROUPS,
-                                    RoseInstrSquashGroups> {
-public:
-    rose_group groups;
-
-    explicit RoseInstrSquashGroups(rose_group groups_in) : groups(groups_in) {}
-
-    bool operator==(const RoseInstrSquashGroups &ri) const {
-        return groups == ri.groups;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), groups);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrSquashGroups &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return groups == ri.groups;
-    }
-};
-
-class RoseInstrCheckState
-    : public RoseInstrBaseOneTarget<ROSE_INSTR_CHECK_STATE,
-                                    ROSE_STRUCT_CHECK_STATE,
-                                    RoseInstrCheckState> {
-public:
-    u32 index;
-    const RoseInstruction *target;
-
-    RoseInstrCheckState(u32 index_in, const RoseInstruction *target_in)
-        : index(index_in), target(target_in) {}
-
-    bool operator==(const RoseInstrCheckState &ri) const {
-        return index == ri.index && target == ri.target;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), index);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrCheckState &ri, const OffsetMap &offsets,
-                  const OffsetMap &other_offsets) const {
-        return index == ri.index &&
-               offsets.at(target) == other_offsets.at(ri.target);
-    }
-};
-
-class RoseInstrSparseIterBegin
-    : public RoseInstrBase<ROSE_INSTR_SPARSE_ITER_BEGIN,
-                           ROSE_STRUCT_SPARSE_ITER_BEGIN,
-                           RoseInstrSparseIterBegin> {
-public:
-    u32 num_keys; // total number of multibit keys
-    std::vector<std::pair<u32, const RoseInstruction *>> jump_table;
-    const RoseInstruction *target;
-
-    RoseInstrSparseIterBegin(u32 num_keys_in,
-                             const RoseInstruction *target_in)
-        : num_keys(num_keys_in), target(target_in) {}
-
-    bool operator==(const RoseInstrSparseIterBegin &ri) const {
-        return num_keys == ri.num_keys && jump_table == ri.jump_table &&
-               target == ri.target;
-    }
-
-    size_t hash() const override {
-        size_t v = hash_all(static_cast<int>(opcode), num_keys);
-        for (const u32 &key : jump_table | boost::adaptors::map_keys) {
-            boost::hash_combine(v, key);
-        }
-        return v;
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    void update_target(const RoseInstruction *old_target,
-                       const RoseInstruction *new_target) override {
-        if (target == old_target) {
-            target = new_target;
-        }
-        for (auto &jump : jump_table) {
-            if (jump.second == old_target) {
-                jump.second = new_target;
-            }
-        }
-    }
-
-    bool equiv_to(const RoseInstrSparseIterBegin &ri, const OffsetMap &offsets,
-                  const OffsetMap &other_offsets) const {
-        if (iter_offset != ri.iter_offset ||
-            offsets.at(target) != other_offsets.at(ri.target)) {
-            return false;
-        }
-        if (jump_table.size() != ri.jump_table.size()) {
-            return false;
-        }
-        auto it1 = jump_table.begin(), it2 = ri.jump_table.begin();
-        for (; it1 != jump_table.end(); ++it1, ++it2) {
-            if (it1->first != it2->first) {
-                return false;
-            }
-            if (offsets.at(it1->second) != other_offsets.at(it2->second)) {
-                return false;
-            }
-        }
-        return true;
-    }
-
-private:
-    friend class RoseInstrSparseIterNext;
-
-    // These variables allow us to use the same multibit iterator and jump
-    // table in subsequent SPARSE_ITER_NEXT write() operations.
-    mutable bool is_written = false;
-    mutable u32 iter_offset = 0;
-    mutable u32 jump_table_offset = 0;
-};
-
-class RoseInstrSparseIterNext
-    : public RoseInstrBase<ROSE_INSTR_SPARSE_ITER_NEXT,
-                           ROSE_STRUCT_SPARSE_ITER_NEXT,
-                           RoseInstrSparseIterNext> {
-public:
-    u32 state;
-    const RoseInstrSparseIterBegin *begin;
-    const RoseInstruction *target;
-
-    RoseInstrSparseIterNext(u32 state_in,
-                            const RoseInstrSparseIterBegin *begin_in,
-                            const RoseInstruction *target_in)
-        : state(state_in), begin(begin_in), target(target_in) {}
-
-    bool operator==(const RoseInstrSparseIterNext &ri) const {
-        return state == ri.state && begin == ri.begin && target == ri.target;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), state);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    void update_target(const RoseInstruction *old_target,
-                       const RoseInstruction *new_target) override {
-        if (target == old_target) {
-            target = new_target;
-        }
-        if (begin == old_target) {
-            assert(new_target->code() == ROSE_INSTR_SPARSE_ITER_BEGIN);
-            begin = static_cast<const RoseInstrSparseIterBegin *>(new_target);
-        }
-    }
-
-    bool equiv_to(const RoseInstrSparseIterNext &ri, const OffsetMap &offsets,
-                  const OffsetMap &other_offsets) const {
-        return state == ri.state &&
-               offsets.at(begin) == other_offsets.at(ri.begin) &&
-               offsets.at(target) == other_offsets.at(ri.target);
-    }
-};
-
-class RoseInstrSparseIterAny
-    : public RoseInstrBaseOneTarget<ROSE_INSTR_SPARSE_ITER_ANY,
-                                    ROSE_STRUCT_SPARSE_ITER_ANY,
-                                    RoseInstrSparseIterAny> {
-public:
-    u32 num_keys; // total number of multibit keys
-    std::vector<u32> keys;
-    const RoseInstruction *target;
-
-    RoseInstrSparseIterAny(u32 num_keys_in, std::vector<u32> keys_in,
-                           const RoseInstruction *target_in)
-        : num_keys(num_keys_in), keys(std::move(keys_in)), target(target_in) {}
-
-    bool operator==(const RoseInstrSparseIterAny &ri) const {
-        return num_keys == ri.num_keys && keys == ri.keys &&
-               target == ri.target;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), num_keys, keys);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrSparseIterAny &ri, const OffsetMap &offsets,
-                  const OffsetMap &other_offsets) const {
-        return num_keys == ri.num_keys && keys == ri.keys &&
-               offsets.at(target) == other_offsets.at(ri.target);
-    }
-};
-
-class RoseInstrEnginesEod
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_ENGINES_EOD,
-                                    ROSE_STRUCT_ENGINES_EOD,
-                                    RoseInstrEnginesEod> {
-public:
-    u32 iter_offset;
-
-    explicit RoseInstrEnginesEod(u32 iter_in) : iter_offset(iter_in) {}
-
-    bool operator==(const RoseInstrEnginesEod &ri) const {
-        return iter_offset == ri.iter_offset;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), iter_offset);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrEnginesEod &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return iter_offset == ri.iter_offset;
-    }
-};
-
-class RoseInstrSuffixesEod
-    : public RoseInstrBaseTrivial<ROSE_INSTR_SUFFIXES_EOD,
-                                  ROSE_STRUCT_SUFFIXES_EOD,
-                                  RoseInstrSuffixesEod> {
-public:
-    ~RoseInstrSuffixesEod() override;
-};
-
-class RoseInstrMatcherEod : public RoseInstrBaseTrivial<ROSE_INSTR_MATCHER_EOD,
-                                                        ROSE_STRUCT_MATCHER_EOD,
-                                                        RoseInstrMatcherEod> {
-public:
-    ~RoseInstrMatcherEod() override;
-};
-
-class RoseInstrCheckLongLit
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_CHECK_LONG_LIT,
-                                    ROSE_STRUCT_CHECK_LONG_LIT,
-                                    RoseInstrCheckLongLit> {
-public:
-    std::string literal;
-
-    RoseInstrCheckLongLit(std::string literal_in)
-        : literal(std::move(literal_in)) {}
-
-    bool operator==(const RoseInstrCheckLongLit &ri) const {
-        return literal == ri.literal;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), literal);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrCheckLongLit &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return literal == ri.literal;
-    }
-};
-
-class RoseInstrCheckLongLitNocase
-    : public RoseInstrBaseNoTargets<ROSE_INSTR_CHECK_LONG_LIT_NOCASE,
-                                    ROSE_STRUCT_CHECK_LONG_LIT_NOCASE,
-                                    RoseInstrCheckLongLitNocase> {
-public:
-    std::string literal;
-
-    RoseInstrCheckLongLitNocase(std::string literal_in)
-        : literal(std::move(literal_in)) {
-        upperString(literal);
-    }
-
-    bool operator==(const RoseInstrCheckLongLitNocase &ri) const {
-        return literal == ri.literal;
-    }
-
-    size_t hash() const override {
-        return hash_all(static_cast<int>(opcode), literal);
-    }
-
-    void write(void *dest, RoseEngineBlob &blob,
-               const OffsetMap &offset_map) const override;
-
-    bool equiv_to(const RoseInstrCheckLongLitNocase &ri, const OffsetMap &,
-                  const OffsetMap &) const {
-        return literal == ri.literal;
-    }
-};
-
-class RoseInstrEnd
-    : public RoseInstrBaseTrivial<ROSE_INSTR_END,
-                                  ROSE_STRUCT_END,
-                                  RoseInstrEnd> {
-public:
-    ~RoseInstrEnd() override;
-};
+class RoseInstruction;
+struct RoseResources;
 
 /**
  * \brief Container for a list of program instructions.
@@ -1793,16 +55,14 @@ class RoseProgram {
     std::vector<std::unique_ptr<RoseInstruction>> prog;
 
 public:
-    RoseProgram() {
-        prog.push_back(make_unique<RoseInstrEnd>());
-    }
+    RoseProgram();
+    ~RoseProgram();
+    RoseProgram(const RoseProgram &) = delete;
+    RoseProgram(RoseProgram &&);
+    RoseProgram &operator=(const RoseProgram &) = delete;
+    RoseProgram &operator=(RoseProgram &&);
 
-    bool empty() const {
-        assert(!prog.empty());
-        assert(prog.back()->code() == ROSE_INSTR_END);
-        // Empty if we only have one element, the END instruction.
-        return std::next(prog.begin()) == prog.end();
-    }
+    bool empty() const;
 
     size_t size() const { return prog.size(); }
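// All of the instruction classes removed above follow one CRTP pattern: each
// passes its opcode, its bytecode layout struct and itself as template
// arguments to a RoseInstrBase* helper, which supplies the shared plumbing.
// A rough sketch of the shape (simplified and illustrative only; the member
// names below are not the real base-class API):

template <RoseInstructionCode Opcode, class ImplStruct, class Derived>
class RoseInstrBaseSketch : public RoseInstruction {
public:
    // Each concrete instruction reports its opcode and the size of the
    // bytecode struct it serialises into.
    RoseInstructionCode code() const override { return Opcode; }
    size_t byte_length() const override { return sizeof(ImplStruct); }
};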
@@ -1826,105 +86,40 @@ class RoseProgram {
     const_reverse_iterator rend() const { return prog.rend(); }
 
     /** \brief Retrieve a pointer to the terminating ROSE_INSTR_END. */
-    const RoseInstruction *end_instruction() const {
-        assert(!prog.empty());
-        assert(prog.back()->code() == ROSE_INSTR_END);
+    const RoseInstruction *end_instruction() const;
 
-        return prog.back().get();
-    }
-
-private:
     static void update_targets(iterator it, iterator it_end,
                                const RoseInstruction *old_target,
-                               const RoseInstruction *new_target) {
-        assert(old_target && new_target && old_target != new_target);
-        for (; it != it_end; ++it) {
-            std::unique_ptr<RoseInstruction> &ri = *it;
-            assert(ri);
-            ri->update_target(old_target, new_target);
-        }
-    }
-
-public:
-    iterator insert(iterator it, std::unique_ptr<RoseInstruction> ri) {
-        assert(!prog.empty());
-        assert(it != end());
-        assert(prog.back()->code() == ROSE_INSTR_END);
-
-        return prog.insert(it, std::move(ri));
-    }
-
-    iterator insert(iterator it, RoseProgram &&block) {
-        assert(!prog.empty());
-        assert(it != end());
-        assert(prog.back()->code() == ROSE_INSTR_END);
-
-        if (block.empty()) {
-            return it;
-        }
+                               const RoseInstruction *new_target);
 
-        const RoseInstruction *end_ptr = block.end_instruction();
-        assert(end_ptr->code() == ROSE_INSTR_END);
-        block.prog.pop_back();
+    iterator insert(iterator it, std::unique_ptr<RoseInstruction> ri);
 
-        const RoseInstruction *new_target = it->get();
-        update_targets(block.prog.begin(), block.prog.end(), end_ptr,
-                       new_target);
+    iterator insert(iterator it, RoseProgram &&block);
 
-        // Workaround: container insert() for ranges doesn't return an iterator
-        // in the version of the STL distributed with gcc 4.8.
-        auto dist = distance(prog.begin(), it);
-        prog.insert(it, std::make_move_iterator(block.prog.begin()),
-                    std::make_move_iterator(block.prog.end()));
-        it = prog.begin();
-        std::advance(it, dist);
-        return it;
-    }
+    /* Note: takes iterator rather than const_iterator to support toolchains
+     * with pre-C++11 standard libraries (i.e., gcc-4.8). */
+    iterator erase(iterator first, iterator last);
 
     /**
     * \brief Adds this instruction to the program just before the terminating
     * ROSE_INSTR_END.
     */
-    void add_before_end(std::unique_ptr<RoseInstruction> ri) {
-        assert(!prog.empty());
-        insert(std::prev(prog.end()), std::move(ri));
-    }
+    void add_before_end(std::unique_ptr<RoseInstruction> ri);
 
     /**
     * \brief Adds this block to the program just before the terminating
    * ROSE_INSTR_END.
+     *
+     * Any existing instruction that was jumping to end continues to do so.
     */
-    void add_before_end(RoseProgram &&block) {
-        assert(!prog.empty());
-        assert(prog.back()->code() == ROSE_INSTR_END);
-
-        if (block.empty()) {
-            return;
-        }
-
-        insert(std::prev(prog.end()), std::move(block));
-    }
-
+    void add_before_end(RoseProgram &&block);
     /**
     * \brief Append this program block, replacing our current ROSE_INSTR_END.
+     *
+     * Any existing instruction that was jumping to end, now leads to the newly
+     * added block.
     */
-    void add_block(RoseProgram &&block) {
-        assert(!prog.empty());
-        assert(prog.back()->code() == ROSE_INSTR_END);
-
-        if (block.empty()) {
-            return;
-        }
-
-        // Replace pointers to the current END with pointers to the first
-        // instruction in the new sequence.
-        const RoseInstruction *end_ptr = end_instruction();
-        prog.pop_back();
-        update_targets(prog.begin(), prog.end(), end_ptr,
-                       block.prog.front().get());
-        prog.insert(prog.end(), std::make_move_iterator(block.prog.begin()),
-                    std::make_move_iterator(block.prog.end()));
-    }
+    void add_block(RoseProgram &&block);
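// The doc comments above pin down the linking semantics: add_before_end()
// keeps existing jumps to END pointing at END, while add_block() redirects
// them into the appended block. A usage sketch under those assumptions
// (hypothetical instruction values; std::make_unique used for brevity):

RoseProgram prog;                  // starts as a lone ROSE_INSTR_END
prog.add_before_end(std::make_unique<RoseInstrCatchUp>());
RoseProgram tail;                  // hypothetical follow-on block
tail.add_before_end(std::make_unique<RoseInstrCatchUp>());
prog.add_block(std::move(tail));   // jumps to END now land in the tail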
 
     /**
     * \brief Replace the instruction pointed to by the given iterator.
@@ -1932,29 +127,19 @@ class RoseProgram {
     template<class Iter>
     void replace(Iter it, std::unique_ptr<RoseInstruction> ri) {
         assert(!prog.empty());
-        assert(prog.back()->code() == ROSE_INSTR_END);
 
         const RoseInstruction *old_ptr = it->get();
         *it = move(ri);
         update_targets(prog.begin(), prog.end(), old_ptr, it->get());
-
-        assert(prog.back()->code() == ROSE_INSTR_END);
     }
 };
 
-aligned_unique_ptr<char>
-writeProgram(RoseEngineBlob &blob, const RoseProgram &program, u32 *total_len);
+bytecode_ptr<char> writeProgram(RoseEngineBlob &blob,
+                                const RoseProgram &program);
 
 class RoseProgramHash {
 public:
-    size_t operator()(const RoseProgram &program) const {
-        size_t v = 0;
-        for (const auto &ri : program) {
-            assert(ri);
-            boost::hash_combine(v, ri->hash());
-        }
-        return v;
-    }
+    size_t operator()(const RoseProgram &program) const;
 };
 
 class RoseProgramEquivalence {
@@ -1962,6 +147,141 @@ class RoseProgramEquivalence {
     bool operator()(const RoseProgram &prog1, const RoseProgram &prog2) const;
 };
 
+/** \brief Data only used during construction of various programs (literal,
+ * anchored, delay, etc). */
+struct ProgramBuild : noncopyable {
+    explicit ProgramBuild(u32 fMinLitOffset, size_t longLitThresh,
+                          bool catchup)
+        : floatingMinLiteralMatchOffset(fMinLitOffset),
+          longLitLengthThreshold(longLitThresh), needs_catchup(catchup) {
+    }
+
+    /** \brief Minimum offset of a match from the floating table. */
+    const u32 floatingMinLiteralMatchOffset;
+
+    /** \brief Long literal length threshold, used in streaming mode. */
+    const size_t longLitLengthThreshold;
+
+    /** \brief True if reports need CATCH_UP instructions to catch up suffixes,
+     * outfixes etc. */
+    const bool needs_catchup;
+
+    /** \brief Mapping from vertex to key, for vertices with a
+     * CHECK_NOT_HANDLED instruction. */
+    ue2::unordered_map<RoseVertex, u32> handledKeys;
+
+    /** \brief Mapping from Rose literal ID to anchored program index. */
+    std::map<u32, u32> anchored_programs;
+
+    /** \brief Mapping from Rose literal ID to delayed program index. */
+    std::map<u32, u32> delay_programs;
+
+    /** \brief Mapping from every vertex to the groups that must be on for that
+     * vertex to be reached. */
+    ue2::unordered_map<RoseVertex, rose_group> vertex_group_map;
+
+    /** \brief Global bitmap of groups that can be squashed. */
+    rose_group squashable_groups = 0;
+};
+
+void addEnginesEodProgram(u32 eodNfaIterOffset, RoseProgram &program);
+void addSuffixesEodProgram(RoseProgram &program);
+void addMatcherEodProgram(RoseProgram &program);
+
+static constexpr u32 INVALID_QUEUE = ~0U;
+
+struct left_build_info {
+    // Constructor for an engine implementation.
+    left_build_info(u32 q, u32 l, u32 t, rose_group sm,
+                    const std::vector<u8> &stops, u32 max_ql, u8 cm_count,
+                    const CharReach &cm_cr);
+
+    // Constructor for a lookaround implementation.
+    explicit left_build_info(const std::vector<std::vector<LookEntry>> &looks);
+
+    u32 queue = INVALID_QUEUE; /* uniquely idents the left_build_info */
+    u32 lag = 0;
+    u32 transient = 0;
+    rose_group squash_mask = ~rose_group{0};
+    std::vector<u8> stopAlphabet;
+    u32 max_queuelen = 0;
+    u8 countingMiracleCount = 0;
+    CharReach countingMiracleReach;
+    u32 countingMiracleOffset = 0; /* populated later when laying out bytecode */
+    bool has_lookaround = false;
+
+    // alternative implementation to the NFA
+    std::vector<std::vector<LookEntry>> lookaround;
+};
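// RoseProgramHash and RoseProgramEquivalence, declared above, exist so that
// RoseProgram can key a hash container. A minimal deduplication sketch under
// that assumption (the cache type and helper below are hypothetical, for
// illustration only; the real dedupe lives in the Rose build code):

using ProgCache = std::unordered_map<RoseProgram, u32, RoseProgramHash,
                                     RoseProgramEquivalence>;

u32 dedupe_offset(ProgCache &cache, RoseProgram &&prog, u32 offset) {
    auto it = cache.find(prog);
    if (it != cache.end()) {
        return it->second; // identical program already written
    }
    cache.emplace(std::move(prog), offset);
    return offset;
}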
+
+/**
+ * \brief Provides a brief summary of properties of an NFA that has already
+ * been finalised and stored in the blob.
+ */
+struct engine_info {
+    engine_info(const NFA *nfa, bool trans);
+
+    enum NFAEngineType type;
+    bool accepts_eod;
+    u32 stream_size;
+    u32 scratch_size;
+    u32 scratch_align;
+    bool transient;
+};
+
+/**
+ * \brief Consumes list of program blocks corresponding to different literals,
+ * checks them for duplicates and then concatenates them into one program.
+ *
+ * Note: if a block will squash groups, a CLEAR_WORK_DONE instruction is
+ * inserted to prevent the work_done flag being contaminated by early blocks.
+ */
+RoseProgram assembleProgramBlocks(std::vector<RoseProgram> &&blocks);
+
+RoseProgram makeLiteralProgram(const RoseBuildImpl &build,
+                        const std::map<RoseVertex, left_build_info> &leftfix_info,
+                        const std::map<suffix_id, u32> &suffixes,
+                        const std::map<u32, engine_info> &engine_info_by_queue,
+                        const unordered_map<RoseVertex, u32> &roleStateIndices,
+                        ProgramBuild &prog_build, u32 lit_id,
+                        const std::vector<RoseEdge> &lit_edges,
+                        bool is_anchored_replay_program);
+
+RoseProgram makeDelayRebuildProgram(const RoseBuildImpl &build,
+                                    ProgramBuild &prog_build,
+                                    const std::vector<u32> &lit_ids);
+
+RoseProgram makeEodAnchorProgram(const RoseBuildImpl &build,
+                                 ProgramBuild &prog_build, const RoseEdge &e,
+                                 const bool multiple_preds);
+
+RoseProgram makeReportProgram(const RoseBuildImpl &build,
+                              bool needs_mpv_catchup, ReportID id);
+
+RoseProgram makeBoundaryProgram(const RoseBuildImpl &build,
+                                const std::set<ReportID> &reports);
+
+struct TriggerInfo {
+    TriggerInfo(bool c, u32 q, u32 e) : cancel(c), queue(q), event(e) {}
+    bool cancel;
+    u32 queue;
+    u32 event;
+
+    bool operator==(const TriggerInfo &b) const {
+        return cancel == b.cancel && queue == b.queue && event == b.event;
+    }
+};
+
+void addPredBlocks(std::map<u32, RoseProgram> &pred_blocks, u32 num_states,
+                   RoseProgram &program);
+
+void applyFinalSpecialisation(RoseProgram &program);
+
+void recordLongLiterals(std::vector<ue2_case_string> &longLiterals,
+                        const RoseProgram &program);
+
+void recordResources(RoseResources &resources, const RoseProgram &program);
+
 } // namespace ue2
 
 #endif // ROSE_BUILD_PROGRAM_H
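// A rough sketch of how the builders declared in this header compose
// (hypothetical driver code; `fragments` and makeFragmentBlock() are
// illustrative names only, not part of the real API):

std::vector<RoseProgram> blocks;
for (const auto &frag : fragments) {            // hypothetical inputs
    blocks.push_back(makeFragmentBlock(frag));  // hypothetical builder
}
// assembleProgramBlocks() drops duplicate blocks and inserts
// CLEAR_WORK_DONE between group-squashing blocks, as documented above.
RoseProgram lit_prog = assembleProgramBlocks(std::move(blocks));
applyFinalSpecialisation(lit_prog);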
diff --git a/src/rose/rose_build_resources.h b/src/rose/rose_build_resources.h
new file mode 100644
index 000000000..3edb81b96
--- /dev/null
+++ b/src/rose/rose_build_resources.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ROSE_BUILD_RESOURCES_H
+#define ROSE_BUILD_RESOURCES_H
+
+namespace ue2 {
+
+/**
+ * \brief Structure tracking which resources are used by this Rose instance at
+ * runtime.
+ *
+ * We use this to control how much initialisation we need to do at the
+ * beginning of a stream/block at runtime.
+ */
+struct RoseResources {
+    bool has_outfixes = false;
+    bool has_suffixes = false;
+    bool has_leftfixes = false;
+    bool has_literals = false;
+    bool has_states = false;
+    bool checks_groups = false;
+    bool has_lit_delay = false;
+    bool has_lit_check = false; // long literal support
+    bool has_anchored = false;
+    bool has_floating = false;
+    bool has_eod = false;
+};
+
+}
+
+#endif
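// A sketch of how such flags can be consumed (hypothetical helper; the real
// gating logic lives in the Rose runtime): databases with no stateful
// components can skip per-stream initialisation entirely.

static bool needsStreamInit(const RoseResources &res) {
    // Only stateful components require work at stream start.
    return res.has_states || res.has_leftfixes || res.has_suffixes ||
           res.has_outfixes || res.has_lit_delay || res.has_lit_check;
}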
diff --git a/src/rose/rose_build_role_aliasing.cpp b/src/rose/rose_build_role_aliasing.cpp
index c6139097e..0e78ec7db 100644
--- a/src/rose/rose_build_role_aliasing.cpp
+++ b/src/rose/rose_build_role_aliasing.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,7 @@
 #include "util/container.h"
 #include "util/graph.h"
 #include "util/graph_range.h"
+#include "util/hash.h"
 #include "util/order_check.h"
 #include "util/ue2_containers.h"
 
@@ -111,11 +112,14 @@ struct AliasInEdge : EdgeAndVertex {
 
 class CandidateSet {
 public:
-    typedef set<RoseVertex>::iterator iterator;
-    typedef RoseVertex key_type;
+    using key_type = RoseVertex;
+    using iterator = set<RoseVertex>::iterator;
+    using const_iterator = set<RoseVertex>::const_iterator;
 
     iterator begin() { return main_cont.begin(); }
     iterator end() { return main_cont.end(); }
+    const_iterator begin() const { return main_cont.begin(); }
+    const_iterator end() const { return main_cont.end(); }
 
     bool contains(RoseVertex a) const {
         return hash_cont.find(a) != hash_cont.end();
@@ -324,9 +328,9 @@ bool canMergeLiterals(RoseVertex a, RoseVertex b, const RoseBuildImpl &build) {
 
     // Otherwise, all the literals involved must have the same length.
     for (u32 a_id : lits_a) {
-        const rose_literal_id &la = build.literals.right.at(a_id);
+        const rose_literal_id &la = build.literals.at(a_id);
        for (u32 b_id : lits_b) {
-            const rose_literal_id &lb = build.literals.right.at(b_id);
+            const rose_literal_id &lb = build.literals.at(b_id);
 
             if (la.elength() != lb.elength()) {
                 DEBUG_PRINTF("bad merge %zu!=%zu '%s', '%s'\n", la.elength(),
@@ -451,37 +455,6 @@ bool sameRightRoleProperties(const RoseBuildImpl &build, RoseVertex a,
     return true;
 }
 
-/**
- * Hash on some deterministic props checked in sameRoleProperties + properties
- * required for right equivalence.
- */
-static
-size_t hashRightRoleProperties(RoseVertex v, const RoseGraph &g) {
-    using boost::hash_combine;
-    using boost::hash_range;
-
-    const RoseVertexProps &props = g[v];
-
-    size_t val = 0;
-    hash_combine(val, hash_range(begin(props.reports), end(props.reports)));
-
-    if (props.suffix) {
-        const auto &suffix = props.suffix;
-        if (suffix.castle) {
-            hash_combine(val, suffix.castle->reach());
-            hash_combine(val, suffix.castle->repeats.size());
-        }
-        if (suffix.graph) {
-            hash_combine(val, num_vertices(*suffix.graph));
-        }
-        if (suffix.haig) {
-            hash_combine(val, hash_dfa(*suffix.haig));
-        }
-    }
-
-    return val;
-}
-
 static
 void mergeEdgeAdd(RoseVertex u, RoseVertex v, const RoseEdge &from_edge,
                   const RoseEdge *to_edge, RoseGraph &g) {
@@ -684,16 +657,6 @@ void findCandidates(const RoseBuildImpl &build, CandidateSet *candidates) {
                 num_vertices(build.g));
 }
 
-static
-RoseVertex pickSucc(const RoseVertex v, const RoseGraph &g) {
-    RoseGraph::adjacency_iterator ai, ae;
-    tie(ai, ae) = adjacent_vertices(v, g);
-    if (ai == ae) {
-        return RoseGraph::null_vertex();
-    }
-    return *ai;
-}
-
 static
 RoseVertex pickPred(const RoseVertex v, const RoseGraph &g,
                     const RoseBuildImpl &build) {
@@ -854,7 +817,7 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g,
         return;
     }
     assert(isCorrectlyTopped(h));
-    DEBUG_PRINTF("prunning unused tops\n");
+    DEBUG_PRINTF("pruning unused tops\n");
     ue2::flat_set<u32> used_tops;
     for (auto v : verts) {
         assert(g[v].left.graph.get() == &h);
@@ -875,7 +838,7 @@ void pruneUnusedTops(NGHolder &h, const RoseGraph &g,
        auto pt_inserter = inserter(pruned_tops, pruned_tops.end());
         set_intersection(h[e].tops.begin(), h[e].tops.end(),
                          used_tops.begin(), used_tops.end(), pt_inserter);
-        h[e].tops = move(pruned_tops);
+        h[e].tops = std::move(pruned_tops);
         if (h[e].tops.empty()) {
             DEBUG_PRINTF("edge (start,%zu) has only unused tops\n", h[v].index);
             dead.push_back(e);
@@ -1162,6 +1125,8 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
     shared_ptr<NGHolder> a_h = a_left.graph;
     shared_ptr<NGHolder> b_h = b_left.graph;
     assert(a_h && b_h);
+    assert(isImplementableNFA(*a_h, nullptr, build.cc));
+    assert(isImplementableNFA(*b_h, nullptr, build.cc));
 
     // If we only differ in reports, this is a very easy merge. Just use b's
     // report for both.
@@ -1252,6 +1217,11 @@ bool attemptRoseGraphMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
     duplicateReport(*new_graph, b_left.leftfix_report, new_report);
     pruneAllOtherReports(*new_graph, new_report);
 
+    if (!isImplementableNFA(*new_graph, nullptr, build.cc)) {
+        DEBUG_PRINTF("new graph not implementable\n");
+        return false;
+    }
+
     rai.rev_leftfix[a_left_id].erase(a);
     rai.rev_leftfix[b_left_id].erase(b);
     pruneUnusedTops(*a_h, g, rai.rev_leftfix[a_left_id]);
@@ -1427,62 +1397,95 @@ bool attemptRoseMerge(RoseBuildImpl &build, bool preds_same, RoseVertex a,
     return false;
 }
 
+/**
+ * \brief Buckets that only contain one vertex are never going to lead to a
+ * merge.
+ */
 static
-void splitByReportSuffixBehaviour(const RoseGraph &g,
-                                  vector<vector<RoseVertex>> &buckets,
-                                  ue2::unordered_map<RoseVertex, size_t> &inv) {
-    /* vertices with different report/suffixes can never be considered for right
-     * merge. */
-    vector<vector<RoseVertex>> out;
-    for (const vector<RoseVertex> &b : buckets) {
-        assert(!b.empty());
-        map<pair<flat_set<ReportID>, RoseSuffixInfo>, size_t> dest_map;
-        for (RoseVertex v : b) {
-            auto key = decltype(dest_map)::key_type(g[v].reports, g[v].suffix);
-            size_t out_bucket;
-            if (contains(dest_map, key)) {
-                out_bucket = dest_map[key];
-            } else {
-                out_bucket = out.size();
-                out.push_back(vector<RoseVertex>());
-                dest_map[key] = out_bucket;
-            }
-            out[out_bucket].push_back(v);
-            inv[v] = out_bucket;
-        }
-
+void removeSingletonBuckets(vector<vector<RoseVertex>> &buckets) {
+    auto it = remove_if(
+        begin(buckets), end(buckets),
+        [](const vector<RoseVertex> &bucket) { return bucket.size() < 2; });
+    if (it != end(buckets)) {
+        DEBUG_PRINTF("deleting %zu singleton buckets\n",
+                     distance(it, end(buckets)));
+        buckets.erase(it, end(buckets));
     }
-
-    buckets.swap(out);
 }
 
 static
-void splitByLiteralTable(const RoseBuildImpl &build,
-                         vector<vector<RoseVertex>> &buckets,
-                         ue2::unordered_map<RoseVertex, size_t> &inv) {
-    const RoseGraph &g = build.g;
+void buildInvBucketMap(const vector<vector<RoseVertex>> &buckets,
+                       ue2::unordered_map<RoseVertex, size_t> &inv) {
+    inv.clear();
+    for (size_t i = 0; i < buckets.size(); i++) {
+        for (auto v : buckets[i]) {
+            assert(!contains(inv, v));
+            inv.emplace(v, i);
+        }
+    }
+}
+
+/**
+ * \brief Generic splitter that will use the given split function to partition
+ * the vector of buckets, then remove buckets with <= 1 entry.
+ */
+template <class SplitFunction>
+void splitAndFilterBuckets(vector<vector<RoseVertex>> &buckets,
+                           const SplitFunction &make_split_key) {
+    if (buckets.empty()) {
+        return;
+    }
 
     vector<vector<RoseVertex>> out;
 
+    // Mapping from split key value to new bucket index.
+    using key_type = decltype(make_split_key(RoseGraph::null_vertex()));
+    unordered_map<key_type, size_t> dest_map;
+    dest_map.reserve(buckets.front().size());
+
     for (const auto &bucket : buckets) {
         assert(!bucket.empty());
-        map<rose_literal_table, size_t> dest_map;
+        dest_map.clear();
         for (RoseVertex v : bucket) {
-            auto table = build.literals.right.at(*g[v].literals.begin()).table;
-            size_t out_bucket;
-            if (contains(dest_map, table)) {
-                out_bucket = dest_map[table];
-            } else {
-                out_bucket = out.size();
-                out.push_back(vector<RoseVertex>());
-                dest_map[table] = out_bucket;
+            auto p = dest_map.emplace(make_split_key(v), out.size());
+            if (p.second) { // New key, add a bucket.
+                out.emplace_back();
             }
+            auto out_bucket = p.first->second;
             out[out_bucket].push_back(v);
-            inv[v] = out_bucket;
         }
     }
 
-    buckets.swap(out);
+    if (out.size() == buckets.size()) {
+        return; // No new buckets created.
+    }
+
+    buckets = std::move(out);
+    removeSingletonBuckets(buckets);
+}
+
+static
+void splitByReportSuffixBehaviour(const RoseGraph &g,
+                                  vector<vector<RoseVertex>> &buckets) {
+    // Split by report set and suffix info.
+    auto make_split_key = [&g](RoseVertex v) {
+        return hash_all(g[v].reports, g[v].suffix);
+    };
+    splitAndFilterBuckets(buckets, make_split_key);
+}
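// The splitter above accepts any callable that maps a vertex to a hashable
// key. For illustration only, the same idiom with a made-up criterion
// (out-degree is not something the real passes split on):

static
void splitByOutDegreeSketch(const RoseGraph &g,
                            vector<vector<RoseVertex>> &buckets) {
    auto make_split_key = [&g](RoseVertex v) {
        return out_degree(v, g);
    };
    splitAndFilterBuckets(buckets, make_split_key);
}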
+static
+void splitByLiteralTable(const RoseBuildImpl &build,
+                         vector<vector<RoseVertex>> &buckets) {
+    const RoseGraph &g = build.g;
+
+    // Split by literal table.
+    auto make_split_key = [&](RoseVertex v) {
+        const auto &lits = g[v].literals;
+        assert(!lits.empty());
+        return build.literals.at(*lits.begin()).table;
+    };
+    splitAndFilterBuckets(buckets, make_split_key);
 }
 
 static
@@ -1543,6 +1546,9 @@ void splitByNeighbour(const RoseGraph &g, vector<vector<RoseVertex>> &buckets,
         }
         insert(&buckets, buckets.end(), extras);
     }
+
+    removeSingletonBuckets(buckets);
+    buildInvBucketMap(buckets, inv);
 }
 
 static
@@ -1551,16 +1557,35 @@ splitDiamondMergeBuckets(CandidateSet &candidates, const RoseBuildImpl &build)
     const RoseGraph &g = build.g;
 
     vector<vector<RoseVertex>> buckets(1);
-    ue2::unordered_map<RoseVertex, size_t> inv;
-    for (RoseVertex v : candidates) {
-        buckets[0].push_back(v);
-        inv[v] = 0;
+    buckets[0].reserve(candidates.size());
+    insert(&buckets[0], buckets[0].end(), candidates);
+
+    DEBUG_PRINTF("at start, %zu candidates in 1 bucket\n", candidates.size());
+
+    splitByReportSuffixBehaviour(g, buckets);
+    DEBUG_PRINTF("split by report/suffix, %zu buckets\n", buckets.size());
+    if (buckets.empty()) {
+        return buckets;
     }
-    splitByReportSuffixBehaviour(g, buckets, inv);
-    splitByLiteralTable(build, buckets, inv);
+    splitByLiteralTable(build, buckets);
+    DEBUG_PRINTF("split by lit table, %zu buckets\n", buckets.size());
+    if (buckets.empty()) {
+        return buckets;
+    }
+
+    // Neighbour splits require inverse map.
+    ue2::unordered_map<RoseVertex, size_t> inv;
+    buildInvBucketMap(buckets, inv);
+    splitByNeighbour(g, buckets, inv, true);
+    DEBUG_PRINTF("split by successor, %zu buckets\n", buckets.size());
+    if (buckets.empty()) {
+        return buckets;
+    }
+    splitByNeighbour(g, buckets, inv, false);
+    DEBUG_PRINTF("split by predecessor, %zu buckets\n", buckets.size());
 
     return buckets;
 }
 
@@ -1677,55 +1702,62 @@ vector<RoseVertex>::iterator findLeftMergeSibling(
     return end;
 }
 
+static
+void getLeftMergeSiblings(const RoseBuildImpl &build, RoseVertex a,
+                          vector<RoseVertex> &siblings) {
+    // We have to find a sibling to merge `a' with, and we select between
+    // two approaches to minimize the number of vertices we have to
+    // examine; which we use depends on the shape of the graph.
+
+    const RoseGraph &g = build.g;
+    assert(!g[a].literals.empty());
+    u32 lit_id = *g[a].literals.begin();
+    const auto &verts = build.literal_info.at(lit_id).vertices;
+    RoseVertex pred = pickPred(a, g, build);
+
+    siblings.clear();
+
+    if (pred == RoseGraph::null_vertex() || build.isAnyStart(pred) ||
+        out_degree(pred, g) > verts.size()) {
+        // Select sibling from amongst the vertices that share a literal.
+        insert(&siblings, siblings.end(), verts);
+    } else {
+        // Select sibling from amongst the vertices that share a
+        // predecessor.
+        insert(&siblings, siblings.end(), adjacent_vertices(pred, g));
+    }
+}
+
 static never_inline
 void leftMergePass(CandidateSet &candidates, RoseBuildImpl &build,
                    vector<RoseVertex> *dead, RoseAliasingInfo &rai) {
     DEBUG_PRINTF("begin (%zu)\n", candidates.size());
-    RoseGraph &g = build.g;
     vector<RoseVertex> siblings;
 
-    CandidateSet::iterator it = candidates.begin();
+    auto it = candidates.begin();
     while (it != candidates.end()) {
         RoseVertex a = *it;
         CandidateSet::iterator ait = it;
         ++it;
 
-        // We have to find a sibling to merge `a' with, and we select between
-        // two approaches to minimize the number of vertices we have to
-        // examine; which we use depends on the shape of the graph.
-
-        assert(!g[a].literals.empty());
-        u32 lit_id = *g[a].literals.begin();
-        const auto &verts = build.literal_info.at(lit_id).vertices;
-        RoseVertex pred = pickPred(a, g, build);
-
-        siblings.clear();
-        if (pred == RoseGraph::null_vertex() || build.isAnyStart(pred)
-            || out_degree(pred, g) > verts.size()) {
-            // Select sibling from amongst the vertices that share a literal.
-            siblings.insert(siblings.end(), verts.begin(), verts.end());
-        } else {
-            // Select sibling from amongst the vertices that share a
-            // predecessor.
-            insert(&siblings, siblings.end(), adjacent_vertices(pred, g));
-        }
-
-        auto jt = findLeftMergeSibling(siblings.begin(), siblings.end(), a,
-                                       build, rai, candidates);
-        if (jt == siblings.end()) {
-            continue;
-        }
-
-        RoseVertex b = *jt;
+        getLeftMergeSiblings(build, a, siblings);
 
-        if (!attemptRoseMerge(build, true, a, b, 0, rai)) {
-            DEBUG_PRINTF("rose fail\n");
-            continue;
+        auto jt = siblings.begin();
+        while (jt != siblings.end()) {
+            jt = findLeftMergeSibling(jt, siblings.end(), a, build, rai,
+                                      candidates);
+            if (jt == siblings.end()) {
+                break;
+            }
+            RoseVertex b = *jt;
+            if (attemptRoseMerge(build, true, a, b, false, rai)) {
+                mergeVerticesLeft(a, b, build, rai);
+                dead->push_back(a);
+                candidates.erase(ait);
+                break; // consider next a
+            }
+            ++jt;
        }
-
-        mergeVerticesLeft(a, b, build, rai);
-        dead->push_back(a);
-        candidates.erase(ait);
     }
 
     DEBUG_PRINTF("%zu candidates remaining\n", candidates.size());
@@ -1810,91 +1842,49 @@ vector<RoseVertex>::const_iterator findRightMergeSibling(
     return end;
 }
 
-template<typename Iter>
 static
-void split(map<RoseVertex, size_t> &keys, size_t *next_key, Iter it,
-           const Iter end) {
-    map<RoseVertex, size_t> new_keys;
-
-    for (; it != end; ++it) {
-        RoseVertex v = *it;
-        size_t ok = keys[v];
-        size_t nk;
-        if (contains(new_keys, ok)) {
-            nk = new_keys[ok];
-        } else {
-            nk = (*next_key)++;
-            new_keys[ok] = nk;
-        }
-        keys[v] = nk;
-    }
+void splitByRightProps(const RoseGraph &g,
+                       vector<vector<RoseVertex>> &buckets) {
+    // Successor vector used in make_split_key. We declare it here so we can
+    // reuse storage.
+    vector<RoseVertex> succ;
+
+    // Split by {successors, literals, reports}.
+    auto make_split_key = [&](RoseVertex v) {
+        succ.clear();
+        insert(&succ, succ.end(), adjacent_vertices(v, g));
+        sort(succ.begin(), succ.end());
+        return hash_all(g[v].literals, g[v].reports, succ);
+    };
+    splitAndFilterBuckets(buckets, make_split_key);
 }
 
 static never_inline
-void buildCandidateRightSiblings(CandidateSet &candidates, RoseBuildImpl &build,
-                                 map<size_t, vector<RoseVertex>> &sibling_cache,
-                                 map<RoseVertex, size_t> &keys_ext) {
-    RoseGraph &g = build.g;
-
-    size_t next_key = 1;
-    map<RoseVertex, size_t> keys;
-
-    for (const auto &c : candidates) {
-        keys[c] = 0;
-    }
-
-    set<RoseVertex> done_succ;
-    set<u32> done_lit;
-
-    for (auto a : candidates) {
-        assert(!g[a].literals.empty());
-        u32 lit_id = *g[a].literals.begin();
-        RoseVertex succ = pickSucc(a, g);
-        const auto &verts = build.literal_info.at(lit_id).vertices;
-        if (succ != RoseGraph::null_vertex()
-            && in_degree(succ, g) < verts.size()) {
-            if (!done_succ.insert(succ).second) {
-                continue; // succ already in done_succ.
-            }
-            RoseGraph::inv_adjacency_iterator ai, ae;
-            tie (ai, ae) = inv_adjacent_vertices(succ, g);
-            split(keys, &next_key, ai, ae);
-        } else {
-            if (!done_lit.insert(lit_id).second) {
-                continue; // lit_id already in done_lit.
-            }
-            split(keys, &next_key, verts.begin(), verts.end());
-        }
-    }
+vector<vector<RoseVertex>>
+splitRightMergeBuckets(const CandidateSet &candidates,
+                       const RoseBuildImpl &build) {
+    const RoseGraph &g = build.g;
 
-    map<size_t, map<size_t, u32>> int_to_ext;
+    vector<vector<RoseVertex>> buckets(1);
+    buckets[0].reserve(candidates.size());
+    insert(&buckets[0], buckets[0].end(), candidates);
 
-    for (const auto &key : keys) {
-        RoseVertex v = key.first;
-        u32 ext;
-        size_t rph = hashRightRoleProperties(v, g);
-        if (contains(int_to_ext[key.second], rph)) {
-            ext = int_to_ext[key.second][rph];
-        } else {
-            ext = keys_ext.size();
-            int_to_ext[key.second][rph] = ext;
-        }
+    DEBUG_PRINTF("at start, %zu candidates in 1 bucket\n", candidates.size());
 
-        keys_ext[v] = ext;
-        sibling_cache[ext].push_back(v);
+    splitByReportSuffixBehaviour(g, buckets);
+    DEBUG_PRINTF("split by report/suffix, %zu buckets\n", buckets.size());
+    if (buckets.empty()) {
+        return buckets;
     }
 
-    for (auto &siblings : sibling_cache | map_values) {
-        sort(siblings.begin(), siblings.end());
+    splitByRightProps(g, buckets);
+    DEBUG_PRINTF("split by right-merge properties, %zu buckets\n",
+                 buckets.size());
+    if (buckets.empty()) {
+        return buckets;
     }
-}
 
-static
-const vector<RoseVertex> &getCandidateRightSiblings(
-    const map<size_t, vector<RoseVertex>> &sibling_cache,
-    map<RoseVertex, size_t> &keys, RoseVertex a) {
-    size_t key = keys.at(a);
-    return sibling_cache.at(key);
+    return buckets;
 }
 
 static never_inline
@@ -1903,45 +1893,31 @@ void rightMergePass(CandidateSet &candidates, RoseBuildImpl &build,
                     RoseAliasingInfo &rai) {
     DEBUG_PRINTF("begin\n");
 
-    map<size_t, vector<RoseVertex>> sibling_cache;
-    map<RoseVertex, size_t> keys;
-
-    buildCandidateRightSiblings(candidates, build, sibling_cache, keys);
-
-    CandidateSet::iterator it = candidates.begin();
-    while (it != candidates.end()) {
-        RoseVertex a = *it;
-        CandidateSet::iterator ait = it;
-        ++it;
-
-        // We have to find a sibling to merge `a' with, and we select between
-        // two approaches to minimize the number of vertices we have to
-        // examine; which we use depends on the shape of the graph.
+    if (candidates.empty()) {
+        return;
+    }
 
-        const vector<RoseVertex> &siblings
-            = getCandidateRightSiblings(sibling_cache, keys, a);
+    auto buckets = splitRightMergeBuckets(candidates, build);
 
-        auto jt = siblings.begin();
-        while (jt != siblings.end()) {
-            jt = findRightMergeSibling(jt, siblings.end(), a, build, rai,
-                                       candidates);
-            if (jt == siblings.end()) {
-                break;
-            }
-            if (attemptRoseMerge(build, false, a, *jt, !mergeRoses, rai)) {
-                break;
+    for (const auto &bucket : buckets) {
+        assert(!bucket.empty());
+        for (auto it = bucket.begin(); it != bucket.end(); it++) {
+            RoseVertex a = *it;
+            for (auto jt = bucket.begin(); jt != bucket.end(); jt++) {
+                jt = findRightMergeSibling(jt, bucket.end(), a, build, rai,
+                                           candidates);
+                if (jt == bucket.end()) {
+                    break;
+                }
+                RoseVertex b = *jt;
+                if (attemptRoseMerge(build, false, a, b, !mergeRoses, rai)) {
+                    mergeVerticesRight(a, b, build, rai);
+                    dead->push_back(a);
+                    candidates.erase(a);
+                    break; // consider next a
+                }
             }
-            ++jt;
         }
-
-        if (jt == siblings.end()) {
-            continue;
-        }
-
-        RoseVertex b = *jt;
-        mergeVerticesRight(a, b, build, rai);
-        dead->push_back(a);
-        candidates.erase(ait);
     }
 
     DEBUG_PRINTF("%zu candidates remaining\n", candidates.size());
diff --git a/src/rose/rose_build_scatter.cpp b/src/rose/rose_build_scatter.cpp
index 8d30dd23c..87085ae9a 100644
--- a/src/rose/rose_build_scatter.cpp
+++ b/src/rose/rose_build_scatter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -63,24 +63,24 @@ void merge_in(scatter_plan_raw *out, const scatter_plan_raw &in) {
     insert(&out->p_u8, out->p_u8.end(), in.p_u8);
 }
 
-void buildStateScatterPlan(u32 role_state_offset, u32 role_state_count,
-                           u32 left_array_count, u32 left_prefix_count,
-                           const RoseStateOffsets &stateOffsets,
-                           bool streaming, u32 leaf_array_count,
-                           u32 outfix_begin, u32 outfix_end,
-                           scatter_plan_raw *out) {
+scatter_plan_raw buildStateScatterPlan(u32 role_state_offset,
+        u32 role_state_count, u32 left_array_count, u32 left_prefix_count,
+        const RoseStateOffsets &stateOffsets, bool streaming,
+        u32 leaf_array_count, u32 outfix_begin, u32 outfix_end) {
+    scatter_plan_raw out;
+
     /* init role array */
     scatter_plan_raw spr_role;
     mmbBuildClearPlan(role_state_count, &spr_role);
     rebase(&spr_role, role_state_offset);
-    merge_in(out, spr_role);
+    merge_in(&out, spr_role);
 
     /* init rose array: turn on prefixes */
     u32 rose_array_offset = stateOffsets.activeLeftArray;
     scatter_plan_raw spr_rose;
     mmbBuildInitRangePlan(left_array_count, 0, left_prefix_count, &spr_rose);
     rebase(&spr_rose, rose_array_offset);
-    merge_in(out, spr_rose);
+    merge_in(&out, spr_rose);
 
     /* suffix/outfix array */
     scatter_plan_raw spr_leaf;
@@ -91,7 +91,9 @@ void buildStateScatterPlan(u32 role_state_offset, u32 role_state_count,
         mmbBuildClearPlan(leaf_array_count, &spr_leaf);
     }
     rebase(&spr_leaf, stateOffsets.activeLeafArray);
-    merge_in(out, spr_leaf);
+    merge_in(&out, spr_leaf);
+
+    return out;
 }
 
 u32 aux_size(const scatter_plan_raw &raw) {
diff --git a/src/rose/rose_build_scatter.h b/src/rose/rose_build_scatter.h
index a159fe4e2..67a82b993 100644
--- a/src/rose/rose_build_scatter.h
+++ b/src/rose/rose_build_scatter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -45,12 +45,10 @@ struct scatter_plan_raw {
     std::vector<scatter_unit_u8> p_u8;
 };
 
-void buildStateScatterPlan(u32 role_state_offset, u32 role_state_count,
-                           u32 left_array_count, u32 left_prefix_count,
-                           const RoseStateOffsets &stateOffsets,
-                           bool streaming, u32 leaf_array_count,
-                           u32 outfix_begin, u32 outfix_end,
-                           scatter_plan_raw *out);
+scatter_plan_raw buildStateScatterPlan(u32 role_state_offset,
+        u32 role_state_count, u32 left_array_count, u32 left_prefix_count,
+        const RoseStateOffsets &stateOffsets, bool streaming,
+        u32 leaf_array_count, u32 outfix_begin, u32 outfix_end);
 
 u32 aux_size(const scatter_plan_raw &raw);
 
diff --git a/src/rose/rose_common.h b/src/rose/rose_common.h
index 3249f0b8b..34678b8fc 100644
--- a/src/rose/rose_common.h
+++ b/src/rose/rose_common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -41,4 +41,16 @@
 /** \brief Length in bytes of a reach bitvector, used by the lookaround code. */
 #define REACH_BITVECTOR_LEN 32
 
+/** \brief Length in bytes of a reach bitvector for multi-path lookaround. */
+#define MULTI_REACH_BITVECTOR_LEN 256
+
+/**
+ * \brief The max offset from the leftmost byte to the rightmost byte in
+ * multi-path lookaround.
+ */
+#define MULTIPATH_MAX_LEN 16
+
+/** \brief Value used to represent an invalid Rose program offset. */
+#define ROSE_INVALID_PROG_OFFSET 0
+
 #endif // ROSE_COMMON_H
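// For reference, a reach bitvector packs one bit per possible byte value, so
// the 32-byte REACH_BITVECTOR_LEN covers all 256 values. Membership is a
// shift-and-mask test, mirroring the indexing used by the dump code removed
// below (helper name here is illustrative):

static bool reachHas(const u8 *reach, u8 c) {
    // Bit i of the 256-bit vector is set iff byte value i is in the class.
    return (reach[c / 8] & (1U << (c % 8))) != 0;
}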
diff --git a/src/rose/rose_dump.cpp b/src/rose/rose_dump.cpp
deleted file mode 100644
index 1867be507..000000000
--- a/src/rose/rose_dump.cpp
+++ /dev/null
@@ -1,1386 +0,0 @@
-/*
- * Copyright (c) 2015-2016, Intel Corporation
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- *  * Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of Intel Corporation nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "config.h"
-
-#include "hwlm/hwlm_build.h"
-#include "hwlm/hwlm_dump.h"
-#include "rose_build.h"
-#include "rose_dump.h"
-#include "rose_common.h"
-#include "rose_internal.h"
-#include "rose_program.h"
-#include "hs_compile.h"
-#include "ue2common.h"
-#include "nfa/nfa_build_util.h"
-#include "nfa/nfa_dump_api.h"
-#include "nfa/nfa_internal.h"
-#include "nfa/nfa_kind.h"
-#include "util/dump_charclass.h"
-#include "util/multibit_build.h"
-#include "util/multibit.h"
-
-#include <algorithm>
-#include <fstream>
-#include <iomanip>
-#include <map>
-#include <ostream>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-#ifndef DUMP_SUPPORT
-#error No dump support!
-#endif
-
-using namespace std;
-
-namespace ue2 {
-
-namespace /* anonymous */ {
-
-struct rose_off {
-    explicit rose_off(u32 j) : i(j) {}
-    string str(void) const;
-    u32 i;
-};
-
-ostream &operator<< (ostream &o, const rose_off &to) {
-    if (to.i == ROSE_BOUND_INF) {
-        o << "inf";
-    } else {
-        o << to.i;
-    }
-    return o;
-}
-
-string rose_off::str(void) const {
-    ostringstream out;
-    out << *this;
-    return out.str();
-}
-
-}
-
-static
-const void *loadFromByteCodeOffset(const RoseEngine *t, u32 offset) {
-    if (!offset) {
-        return nullptr;
-    }
-
-    const char *lt = (const char *)t + offset;
-    return lt;
-}
-
-static
-const void *getAnchoredMatcher(const RoseEngine *t) {
-    return loadFromByteCodeOffset(t, t->amatcherOffset);
-}
-
-static
-const HWLM *getFloatingMatcher(const RoseEngine *t) {
-    return (const HWLM *)loadFromByteCodeOffset(t, t->fmatcherOffset);
-}
-
-static
-const HWLM *getEodMatcher(const RoseEngine *t) {
-    return (const HWLM *)loadFromByteCodeOffset(t, t->ematcherOffset);
-}
-
-static
-const HWLM *getSmallBlockMatcher(const RoseEngine *t) {
-    return (const HWLM *)loadFromByteCodeOffset(t, t->sbmatcherOffset);
-}
-
-static
-CharReach bitvectorToReach(const u8 *reach) {
-    CharReach cr;
-
-    for (size_t i = 0; i < 256; i++) {
-        if (reach[i / 8] & (1U << (i % 8))) {
-            cr.set(i);
-
-        }
-    }
-    return cr;
-}
-
-static
-void dumpLookaround(ofstream &os, const RoseEngine *t,
-                    const ROSE_STRUCT_CHECK_LOOKAROUND *ri) {
-    assert(ri);
-
-    const u8 *base = (const u8 *)t;
-    const s8 *look_base = (const s8 *)(base + t->lookaroundTableOffset);
-    const u8 *reach_base = base + t->lookaroundReachOffset;
-
-    const s8 *look = look_base + ri->index;
-    const s8 *look_end = look + ri->count;
-    const u8 *reach = reach_base + ri->index * REACH_BITVECTOR_LEN;
-
-    os << "    contents:" << endl;
-
-    for (; look < look_end; look++, reach += REACH_BITVECTOR_LEN) {
-        os << "      " << std::setw(4) << std::setfill(' ') << int{*look}
-           << ": ";
-        describeClass(os, bitvectorToReach(reach), 1000, CC_OUT_TEXT);
-        os << endl;
-    }
-}
-
-static
-vector<u32> sparseIterValues(const mmbit_sparse_iter *it, u32 num_bits) {
-    vector<u32> keys;
-
-    if (num_bits == 0) {
-        return keys;
-    }
-
-    vector<u8> bits(mmbit_size(num_bits), u8{0xff}); // All bits on.
-    vector<mmbit_sparse_state> state(MAX_SPARSE_ITER_STATES);
-
-    const u8 *b = bits.data();
-    mmbit_sparse_state *s = state.data();
-
-    u32 idx = 0;
-    u32 i = mmbit_sparse_iter_begin(b, num_bits, &idx, it, s);
-    while (i != MMB_INVALID) {
-        keys.push_back(i);
-        i = mmbit_sparse_iter_next(b, num_bits, i, &idx, it, s);
-    }
-
-    return keys;
-}
-
-static
-void dumpJumpTable(ofstream &os, const RoseEngine *t,
-                   const ROSE_STRUCT_SPARSE_ITER_BEGIN *ri) {
-    auto *it =
-        (const mmbit_sparse_iter *)loadFromByteCodeOffset(t, ri->iter_offset);
-    auto *jumps = (const u32 *)loadFromByteCodeOffset(t, ri->jump_table);
-
-    for (const auto &key : sparseIterValues(it, t->rolesWithStateCount)) {
-        os << "      " << std::setw(4) << std::setfill(' ') << key << " : +"
-           << *jumps << endl;
-        ++jumps;
-    }
-}
-
-static
-void dumpSomOperation(ofstream &os, const som_operation &op) {
-    os << "    som (type=" << u32{op.type} << ", onmatch=" << op.onmatch;
-    switch (op.type) {
-    case SOM_EXTERNAL_CALLBACK_REV_NFA:
-    case SOM_INTERNAL_LOC_SET_REV_NFA:
-    case SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET:
-    case SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE:
-        os << ", revNfaIndex=" << op.aux.revNfaIndex;
-        break;
-    default:
-        os << ", somDistance=" << op.aux.somDistance;
-        break;
-    }
-    os << ")" << endl;
-}
-
-static
-string dumpStrMask(const u8 *mask, size_t len) {
-    ostringstream oss;
-    for (size_t i = 0; i < len; i++) {
-        oss << std::hex << std::setw(2) << std::setfill('0') << u32{mask[i]}
-            << " ";
-    }
-    return oss.str();
-}
-
-#define PROGRAM_CASE(name)                                                     \
-    case ROSE_INSTR_##name: {                                                  \
-        os << "  " << std::setw(4) << std::setfill('0') << (pc - pc_base)      \
-           << ": " #name " (" << (int)ROSE_INSTR_##name << ")" << endl;        \
-        const auto *ri = (const struct ROSE_STRUCT_##name *)pc;
-
-#define PROGRAM_NEXT_INSTRUCTION                                               \
-    pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN);                        \
-    break;                                                                     \
-    }
-
-static
-void dumpProgram(ofstream &os, const RoseEngine *t, const char *pc) {
-    const char *pc_base = pc;
-    for (;;) {
-        u8 code = *(const u8 *)pc;
-        assert(code <= LAST_ROSE_INSTRUCTION);
-        const size_t offset = pc - pc_base;
-        switch (code) {
-            PROGRAM_CASE(END) { return; }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(ANCHORED_DELAY) {
-                os << "    groups 0x" << std::hex << ri->groups << std::dec
-                   << endl;
-                os << "    done_jump " << offset + ri->done_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_LIT_EARLY) {
-                os << "    min_offset " << ri->min_offset << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_GROUPS) {
-                os << "    groups 0x" << std::hex << ri->groups << std::dec
-                   << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_ONLY_EOD) {
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_BOUNDS) {
-                os << "    min_bound " << ri->min_bound << endl;
-                os << "    max_bound " << ri->max_bound << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_NOT_HANDLED) {
-                os << "    key " << ri->key << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_SINGLE_LOOKAROUND) {
-                os << "    offset " << int{ri->offset} << endl;
-                os << "    reach_index " << ri->reach_index << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-                const u8 *base = (const u8 *)t;
-                const u8 *reach_base = base + t->lookaroundReachOffset;
-                const u8 *reach = reach_base +
-                                  ri->reach_index * REACH_BITVECTOR_LEN;
-                os << "    contents ";
-                describeClass(os, bitvectorToReach(reach), 1000, CC_OUT_TEXT);
-                os << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_LOOKAROUND) {
-                os << "    index " << ri->index << endl;
-                os << "    count " << ri->count << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-                dumpLookaround(os, t, ri);
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_MASK) {
-                os << "    and_mask 0x" << std::hex << std::setw(16)
-                   << std::setfill('0') << ri->and_mask << std::dec << endl;
-                os << "    cmp_mask 0x" << std::hex << std::setw(16)
-                   << std::setfill('0') << ri->cmp_mask << std::dec << endl;
-                os << "    neg_mask 0x" << std::hex << std::setw(16)
-                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
-                os << "    offset " << ri->offset << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_MASK_32) {
-                os << "    and_mask "
-                   << dumpStrMask(ri->and_mask, sizeof(ri->and_mask))
-                   << endl;
-                os << "    cmp_mask "
-                   << dumpStrMask(ri->cmp_mask, sizeof(ri->cmp_mask))
-                   << endl;
-                os << "    neg_mask 0x" << std::hex << std::setw(8)
-                   << std::setfill('0') << ri->neg_mask << std::dec << endl;
-                os << "    offset " << ri->offset << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_BYTE) {
-                os << "    and_mask 0x" << std::hex << std::setw(2)
-                   << std::setfill('0') << u32{ri->and_mask} << std::dec
-                   << endl;
-                os << "    cmp_mask 0x" << std::hex << std::setw(2)
-                   << std::setfill('0') << u32{ri->cmp_mask} << std::dec
-                   << endl;
-                os << "    negation " << u32{ri->negation} << endl;
-                os << "    offset " << ri->offset << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_SHUFTI_16x8) {
-                os << "    nib_mask "
-                   << dumpStrMask(ri->nib_mask, sizeof(ri->nib_mask))
-                   << endl;
-                os << "    bucket_select_mask "
-                   << dumpStrMask(ri->bucket_select_mask,
-                                  sizeof(ri->bucket_select_mask))
-                   << endl;
-                os << "    offset " << ri->offset << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_SHUFTI_32x8) {
-                os << "    hi_mask "
-                   << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask))
-                   << endl;
-                os << "    lo_mask "
-                   << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask))
-                   << endl;
-                os << "    bucket_select_mask "
-                   << dumpStrMask(ri->bucket_select_mask,
-                                  sizeof(ri->bucket_select_mask))
-                   << endl;
-                os << "    offset " << ri->offset << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_SHUFTI_16x16) {
-                os << "    hi_mask "
-                   << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask))
-                   << endl;
-                os << "    lo_mask "
-                   << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask))
-                   << endl;
-                os << "    bucket_select_mask "
-                   << dumpStrMask(ri->bucket_select_mask,
-                                  sizeof(ri->bucket_select_mask))
-                   << endl;
-                os << "    offset " << ri->offset << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_SHUFTI_32x16) {
-                os << "    hi_mask "
-                   << dumpStrMask(ri->hi_mask, sizeof(ri->hi_mask))
-                   << endl;
-                os << "    lo_mask "
-                   << dumpStrMask(ri->lo_mask, sizeof(ri->lo_mask))
-                   << endl;
-                os << "    bucket_select_mask_hi "
-                   << dumpStrMask(ri->bucket_select_mask_hi,
-                                  sizeof(ri->bucket_select_mask_hi))
-                   << endl;
-                os << "    bucket_select_mask_lo "
-                   << dumpStrMask(ri->bucket_select_mask_lo,
-                                  sizeof(ri->bucket_select_mask_lo))
-                   << endl;
-                os << "    offset " << ri->offset << endl;
-                os << "    fail_jump " << offset + ri->fail_jump << endl;
-            }
-            PROGRAM_NEXT_INSTRUCTION
-
-            PROGRAM_CASE(CHECK_INFIX) {
os << " queue " << ri->queue << endl; - os << " lag " << ri->lag << endl; - os << " report " << ri->report << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_PREFIX) { - os << " queue " << ri->queue << endl; - os << " lag " << ri->lag << endl; - os << " report " << ri->report << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(PUSH_DELAYED) { - os << " delay " << u32{ri->delay} << endl; - os << " index " << ri->index << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(RECORD_ANCHORED) { - os << " id " << ri->id << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CATCH_UP) {} - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CATCH_UP_MPV) {} - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_ADJUST) { - os << " distance " << ri->distance << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_LEFTFIX) { - os << " queue " << ri->queue << endl; - os << " lag " << ri->lag << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_FROM_REPORT) { - dumpSomOperation(os, ri->som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SOM_ZERO) {} - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(TRIGGER_INFIX) { - os << " queue " << ri->queue << endl; - os << " event " << ri->event << endl; - os << " cancel " << u32{ri->cancel} << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(TRIGGER_SUFFIX) { - os << " queue " << ri->queue << endl; - os << " event " << ri->event << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE) { - os << " quash_som " << u32{ri->quash_som} << endl; - os << " dkey " << ri->dkey << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE_SOM) { - os << " quash_som " << u32{ri->quash_som} << endl; - os << " dkey " << ri->dkey << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_CHAIN) { - os << " event " << ri->event << endl; - os << " top_squash_distance " << ri->top_squash_distance - << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_INT) { - dumpSomOperation(os, ri->som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_AWARE) { - dumpSomOperation(os, ri->som); - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT) { - os << " onmatch " << ri->onmatch << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_EXHAUST) { - os << " onmatch " << ri->onmatch << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - os << " ekey " << ri->ekey << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM) { - os << " onmatch " << ri->onmatch << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(REPORT_SOM_EXHAUST) { - os << " onmatch " << ri->onmatch << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - os << " ekey " << ri->ekey << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(DEDUPE_AND_REPORT) { - os << " quash_som " << u32{ri->quash_som} << endl; - os << " dkey " << ri->dkey << endl; - os << " onmatch " << ri->onmatch << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(FINAL_REPORT) { - os << " onmatch 
" << ri->onmatch << endl; - os << " offset_adjust " << ri->offset_adjust << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_EXHAUSTED) { - os << " ekey " << ri->ekey << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_MIN_LENGTH) { - os << " end_adj " << ri->end_adj << endl; - os << " min_length " << ri->min_length << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SET_STATE) { - os << " index " << ri->index << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SET_GROUPS) { - os << " groups 0x" << std::hex << ri->groups << std::dec - << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SQUASH_GROUPS) { - os << " groups 0x" << std::hex << ri->groups << std::dec - << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_STATE) { - os << " index " << ri->index << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SPARSE_ITER_BEGIN) { - os << " iter_offset " << ri->iter_offset << endl; - os << " jump_table " << ri->jump_table << endl; - dumpJumpTable(os, t, ri); - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SPARSE_ITER_NEXT) { - os << " iter_offset " << ri->iter_offset << endl; - os << " jump_table " << ri->jump_table << endl; - os << " state " << ri->state << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SPARSE_ITER_ANY) { - os << " iter_offset " << ri->iter_offset << endl; - os << " fail_jump " << offset + ri->fail_jump << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(ENGINES_EOD) { - os << " iter_offset " << ri->iter_offset << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(SUFFIXES_EOD) {} - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(MATCHER_EOD) {} - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LONG_LIT) { - os << " lit_offset " << ri->lit_offset << endl; - os << " lit_length " << ri->lit_length << endl; - const char *lit = (const char *)t + ri->lit_offset; - os << " literal: \"" - << escapeString(string(lit, ri->lit_length)) << "\"" << endl; - } - PROGRAM_NEXT_INSTRUCTION - - PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { - os << " lit_offset " << ri->lit_offset << endl; - os << " lit_length " << ri->lit_length << endl; - const char *lit = (const char *)t + ri->lit_offset; - os << " literal: \"" - << escapeString(string(lit, ri->lit_length)) << "\"" << endl; - } - PROGRAM_NEXT_INSTRUCTION - - default: - os << " UNKNOWN (code " << int{code} << ")" << endl; - os << " " << endl; - return; - } - } -} - -#undef PROGRAM_CASE -#undef PROGRAM_NEXT_INSTRUCTION - -static -void dumpRoseLitPrograms(const RoseEngine *t, const string &filename) { - ofstream os(filename); - - const u32 *litPrograms = - (const u32 *)loadFromByteCodeOffset(t, t->litProgramOffset); - const u32 *delayRebuildPrograms = - (const u32 *)loadFromByteCodeOffset(t, t->litDelayRebuildProgramOffset); - - for (u32 i = 0; i < t->literalCount; i++) { - os << "Literal " << i << endl; - os << "---------------" << endl; - - if (litPrograms[i]) { - os << "Program @ " << litPrograms[i] << ":" << endl; - const char *prog = - (const char *)loadFromByteCodeOffset(t, litPrograms[i]); - dumpProgram(os, t, prog); - } else { - os << "" << endl; - } - - if (delayRebuildPrograms[i]) { - os << "Delay Rebuild Program @ " << delayRebuildPrograms[i] << ":" - << endl; - const char *prog = (const char 
-            const char *prog = (const char *)loadFromByteCodeOffset(
-                t, delayRebuildPrograms[i]);
-            dumpProgram(os, t, prog);
-        }
-
-        os << endl;
-    }
-
-    os.close();
-}
-
-static
-void dumpRoseEodPrograms(const RoseEngine *t, const string &filename) {
-    ofstream os(filename);
-    const char *base = (const char *)t;
-
-    if (t->eodProgramOffset) {
-        os << "EOD Program @ " << t->eodProgramOffset << ":" << endl;
-        dumpProgram(os, t, base + t->eodProgramOffset);
-        os << endl;
-    } else {
-        os << "<none>" << endl;
-    }
-
-    os.close();
-}
-
-static
-void dumpRoseReportPrograms(const RoseEngine *t, const string &filename) {
-    ofstream os(filename);
-
-    const u32 *programs =
-        (const u32 *)loadFromByteCodeOffset(t, t->reportProgramOffset);
-
-    for (u32 i = 0; i < t->reportProgramCount; i++) {
-        os << "Report " << i << endl;
-        os << "---------------" << endl;
-
-        if (programs[i]) {
-            os << "Program @ " << programs[i] << ":" << endl;
-            const char *prog =
-                (const char *)loadFromByteCodeOffset(t, programs[i]);
-            dumpProgram(os, t, prog);
-        } else {
-            os << "<none>" << endl;
-        }
-    }
-
-    os.close();
-}
-
-static
-void dumpNfaNotes(ofstream &fout, const RoseEngine *t, const NFA *n) {
-    const u32 qindex = n->queueIndex;
-
-    if (qindex < t->outfixBeginQueue) {
-        fout << "chained";
-        return;
-    }
-
-    if (qindex < t->outfixEndQueue) {
-        fout << "outfix";
-        return;
-    }
-
-    const NfaInfo *nfa_info = getNfaInfoByQueue(t, qindex);
-    const NFA *nfa = getNfaByInfo(t, nfa_info);
-
-    if (nfa_info->eod) {
-        fout << "eod ";
-    }
-
-    if (qindex < t->leftfixBeginQueue) {
-        fout << "suffix";
-        return;
-    }
-
-    const LeftNfaInfo *left = getLeftInfoByQueue(t, qindex);
-    if (left->eager) {
-        fout << "eager ";
-    }
-    if (left->transient) {
-        fout << "transient " << (u32)left->transient << " ";
-    }
-    if (left->infix) {
-        fout << "infix";
-        u32 maxQueueLen = left->maxQueueLen;
-        if (maxQueueLen != (u32)(-1)) {
-            fout << " maxqlen=" << maxQueueLen;
-        }
-    } else {
-        fout << "prefix";
-    }
-    fout << " maxlag=" << left->maxLag;
-    if (left->stopTable) {
-        fout << " miracles";
-    }
-    if (left->countingMiracleOffset) {
-        const RoseCountingMiracle *cm
-            = (const RoseCountingMiracle *)((const char *)t
-                                            + left->countingMiracleOffset);
-        fout << " counting_miracle:" << (int)cm->count
-             << (cm->shufti ? "s" : "v");
"s" : "v"); - } - if (nfaSupportsZombie(nfa)) { - fout << " zombie"; - } - if (left->eod_check) { - fout << " eod"; - } -} - -static -void dumpComponentInfo(const RoseEngine *t, const string &base) { - stringstream ss; - ss << base << "rose_components.txt"; - ofstream fout(ss.str().c_str()); - - fout << "Index Offset\tEngine \tStates S.State Bytes Notes\n"; - - for (u32 i = 0; i < t->queueCount; i++) { - const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); - const NFA *n = getNfaByInfo(t, nfa_info); - - fout << left << setw(6) << i << " "; - - fout << left << ((const char *)n - (const char *)t) << "\t"; /* offset */ - - fout << left << setw(16) << describe(*n) << "\t"; - - fout << left << setw(6) << n->nPositions << " "; - fout << left << setw(7) << n->streamStateSize << " "; - fout << left << setw(7) << n->length << " "; - - dumpNfaNotes(fout, t, n); - - fout << endl; - } -} - - -static -void dumpComponentInfoCsv(const RoseEngine *t, const string &base) { - FILE *f = fopen((base +"rose_components.csv").c_str(), "w"); - - fprintf(f, "Index, Offset,Engine Type,States,Stream State,Bytecode Size," - "Kind,Notes\n"); - - for (u32 i = 0; i < t->queueCount; i++) { - const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); - const NFA *n = getNfaByInfo(t, nfa_info); - nfa_kind kind; - stringstream notes; - - if (i < t->outfixBeginQueue) { - notes << "chained;"; - } - - if (nfa_info->eod) { - notes << "eod;"; - } - - if (i < t->outfixEndQueue) { - kind = NFA_OUTFIX; - } else if (i < t->leftfixBeginQueue) { - kind = NFA_SUFFIX; - } else { - const LeftNfaInfo *left = getLeftInfoByQueue(t, i); - if (left->eager) { - notes << "eager;"; - } - if (left->transient) { - notes << "transient " << (u32)left->transient << ";"; - } - if (left->infix) { - kind = NFA_INFIX; - u32 maxQueueLen = left->maxQueueLen; - if (maxQueueLen != (u32)(-1)) { - notes << "maxqlen=" << maxQueueLen << ";"; - } - } else { - kind = NFA_PREFIX; - } - notes << "maxlag=" << left->maxLag << ";"; - if (left->stopTable) { - notes << "miracles;"; - } - if (left->countingMiracleOffset) { - auto cm = (const RoseCountingMiracle *) - ((const char *)t + left->countingMiracleOffset); - notes << "counting_miracle:" << (int)cm->count - << (cm->shufti ? 
"s" : "v") << ";"; - } - if (nfaSupportsZombie(n)) { - notes << " zombie;"; - } - if (left->eod_check) { - notes << "left_eod;"; - } - } - - fprintf(f, "%u,%zd,\"%s\",%u,%u,%u,%s,%s\n", i, - (const char *)n - (const char *)t, describe(*n).c_str(), - n->nPositions, n->streamStateSize, n->length, - to_string(kind).c_str(), notes.str().c_str()); - } - fclose(f); -} - - -static -void dumpExhaust(const RoseEngine *t, const string &base) { - stringstream sstxt; - sstxt << base << "rose_exhaust.txt"; - FILE *f = fopen(sstxt.str().c_str(), "w"); - - const NfaInfo *infos - = (const NfaInfo *)((const char *)t + t->nfaInfoOffset); - - u32 queue_count = t->activeArrayCount; - - for (u32 i = 0; i < queue_count; ++i) { - u32 ekey_offset = infos[i].ekeyListOffset; - - fprintf(f, "%u (%u):", i, ekey_offset); - - if (ekey_offset) { - const u32 *ekeys = (const u32 *)((const char *)t + ekey_offset); - while (1) { - u32 e = *ekeys; - ++ekeys; - if (e == ~0U) { - break; - } - fprintf(f, " %u", e); - } - } - - fprintf(f, "\n"); - } - - fclose(f); -} - -static -void dumpNfas(const RoseEngine *t, bool dump_raw, const string &base) { - dumpExhaust(t, base); - - for (u32 i = 0; i < t->queueCount; i++) { - const NfaInfo *nfa_info = getNfaInfoByQueue(t, i); - const NFA *n = getNfaByInfo(t, nfa_info); - - stringstream ssbase; - ssbase << base << "rose_nfa_" << i; - nfaGenerateDumpFiles(n, ssbase.str()); - - if (dump_raw) { - stringstream ssraw; - ssraw << base << "rose_nfa_" << i << ".raw"; - FILE *f = fopen(ssraw.str().c_str(), "w"); - fwrite(n, 1, n->length, f); - fclose(f); - } - } -} - -static -void dumpRevComponentInfo(const RoseEngine *t, const string &base) { - stringstream ss; - ss << base << "som_rev_components.txt"; - ofstream fout(ss.str().c_str()); - - fout << "Index Offset\tEngine \tStates S.State Bytes\n"; - - const char *tp = (const char *)t; - const u32 *rev_offsets = (const u32 *)(tp + t->somRevOffsetOffset); - - for (u32 i = 0; i < t->somRevCount; i++) { - u32 offset = rev_offsets[i]; - const NFA *n = (const NFA *)(tp + offset); - - fout << left << setw(6) << i << " "; - - fout << left << offset << "\t"; /* offset */ - - fout << left << setw(16) << describe(*n) << "\t"; - - fout << left << setw(6) << n->nPositions << " "; - fout << left << setw(7) << n->streamStateSize << " "; - fout << left << setw(7) << n->length; - fout << endl; - } -} - -static -void dumpRevNfas(const RoseEngine *t, bool dump_raw, const string &base) { - const char *tp = (const char *)t; - const u32 *rev_offsets = (const u32 *)(tp + t->somRevOffsetOffset); - - for (u32 i = 0; i < t->somRevCount; i++) { - const NFA *n = (const NFA *)(tp + rev_offsets[i]); - - stringstream ssbase; - ssbase << base << "som_rev_nfa_" << i; - nfaGenerateDumpFiles(n, ssbase.str()); - - if (dump_raw) { - stringstream ssraw; - ssraw << base << "som_rev_nfa_" << i << ".raw"; - FILE *f = fopen(ssraw.str().c_str(), "w"); - fwrite(n, 1, n->length, f); - fclose(f); - } - } -} - -static -void dumpAnchored(const RoseEngine *t, const string &base) { - u32 i = 0; - const anchored_matcher_info *curr - = (const anchored_matcher_info *)getALiteralMatcher(t); - - while (curr) { - const NFA *n = (const NFA *)((const char *)curr + sizeof(*curr)); - - stringstream ssbase; - ssbase << base << "anchored_" << i; - nfaGenerateDumpFiles(n, ssbase.str()); - - curr = curr->next_offset ? 
-        curr = curr->next_offset ? (const anchored_matcher_info *)
-            ((const char *)curr + curr->next_offset) : nullptr;
-        i++;
-    };
-}
-
-static
-void dumpAnchoredStats(const void *atable, FILE *f) {
-    assert(atable);
-
-    u32 i = 0;
-    const anchored_matcher_info *curr = (const anchored_matcher_info *)atable;
-
-    while (curr) {
-        const NFA *n = (const NFA *)((const char *)curr + sizeof(*curr));
-
-        fprintf(f, " NFA %u: %s, %u states (%u bytes)\n", i,
-                describe(*n).c_str(), n->nPositions, n->length);
-
-        curr = curr->next_offset ? (const anchored_matcher_info *)
-            ((const char *)curr + curr->next_offset) : nullptr;
-        i++;
-    };
-
-}
-
-static
-void dumpLongLiteralSubtable(const RoseLongLitTable *ll_table,
-                             const RoseLongLitSubtable *ll_sub, FILE *f) {
-    if (!ll_sub->hashBits) {
-        fprintf(f, " <none>\n");
-        return;
-    }
-
-    const char *base = (const char *)ll_table;
-
-    u32 nbits = ll_sub->hashBits;
-    u32 num_entries = 1U << nbits;
-    const auto *tab = (const RoseLongLitHashEntry *)(base + ll_sub->hashOffset);
-    u32 hash_occ =
-        count_if(tab, tab + num_entries, [](const RoseLongLitHashEntry &ent) {
-            return ent.str_offset != 0;
-        });
-    float hash_occ_percent = ((float)hash_occ / (float)num_entries) * 100;
-
-    fprintf(f, " hash table : %u bits, occupancy %u/%u (%0.1f%%)\n",
-            nbits, hash_occ, num_entries, hash_occ_percent);
-
-    u32 bloom_bits = ll_sub->bloomBits;
-    u32 bloom_size = 1U << bloom_bits;
-    const u8 *bloom = (const u8 *)base + ll_sub->bloomOffset;
-    u32 bloom_occ = accumulate(bloom, bloom + bloom_size / 8, 0,
-        [](const u32 &sum, const u8 &elem) { return sum + popcount32(elem); });
-    float bloom_occ_percent = ((float)bloom_occ / (float)(bloom_size)) * 100;
-
-    fprintf(f, " bloom filter : %u bits, occupancy %u/%u (%0.1f%%)\n",
-            bloom_bits, bloom_occ, bloom_size, bloom_occ_percent);
-}
-
-static
-void dumpLongLiteralTable(const RoseEngine *t, FILE *f) {
-    if (!t->longLitTableOffset) {
-        return;
-    }
-
-    fprintf(f, "\n");
-    fprintf(f, "Long literal table (streaming):\n");
-
-    const auto *ll_table =
-        (const struct RoseLongLitTable *)loadFromByteCodeOffset(
-            t, t->longLitTableOffset);
-
-    fprintf(f, " total size : %u bytes\n", ll_table->size);
-    fprintf(f, " longest len : %u\n", ll_table->maxLen);
-    fprintf(f, " stream state : %u bytes\n", ll_table->streamStateBytes);
-
-    fprintf(f, " caseful:\n");
-    dumpLongLiteralSubtable(ll_table, &ll_table->caseful, f);
-
-    fprintf(f, " nocase:\n");
-    dumpLongLiteralSubtable(ll_table, &ll_table->nocase, f);
-}
-
-// Externally accessible functions
-
-void roseDumpText(const RoseEngine *t, FILE *f) {
-    if (!t) {
-        fprintf(f, "<< no rose >>\n");
-        return;
-    }
-
-    const void *atable = getAnchoredMatcher(t);
-    const HWLM *ftable = getFloatingMatcher(t);
-    const HWLM *etable = getEodMatcher(t);
-    const HWLM *sbtable = getSmallBlockMatcher(t);
-
-    fprintf(f, "Rose:\n\n");
-
-    fprintf(f, "mode: : ");
-    switch(t->mode) {
-    case HS_MODE_BLOCK:
-        fprintf(f, "block");
-        break;
-    case HS_MODE_STREAM:
-        fprintf(f, "streaming");
-        break;
-    case HS_MODE_VECTORED:
-        fprintf(f, "vectored");
-        break;
-    }
-    fprintf(f, "\n");
-
-    fprintf(f, "properties :");
-    if (t->canExhaust) {
-        fprintf(f, " canExhaust");
-    }
-    if (t->hasSom) {
-        fprintf(f, " hasSom");
-    }
-    fprintf(f, "\n");
-
-    fprintf(f, "dkey count : %u\n", t->dkeyCount);
-    fprintf(f, "som slot count : %u\n", t->somLocationCount);
-    fprintf(f, "som width : %u bytes\n", t->somHorizon);
-    fprintf(f, "rose count : %u\n", t->roseCount);
-    fprintf(f, "\n");
-
-    fprintf(f, "total engine size : %u bytes\n", t->size);
bytes over %u bytes\n", t->asize, - t->anchoredDistance); - fprintf(f, " - floating matcher : %zu bytes%s", - ftable ? hwlmSize(ftable) : 0, t->noFloatingRoots ? " (cond)":""); - if (t->floatingMinDistance) { - fprintf(f, " from %s bytes\n", - rose_off(t->floatingMinDistance).str().c_str()); - } - if (t->floatingDistance != ROSE_BOUND_INF && ftable) { - fprintf(f, " over %u bytes\n", t->floatingDistance); - } else { - fprintf(f, "\n"); - } - fprintf(f, " - eod-anch matcher : %zu bytes over last %u bytes\n", - etable ? hwlmSize(etable) : 0, t->ematcherRegionSize); - fprintf(f, " - small-blk matcher : %zu bytes over %u bytes\n", - sbtable ? hwlmSize(sbtable) : 0, t->smallBlockDistance); - fprintf(f, " - role state table : %zu bytes\n", - t->rolesWithStateCount * sizeof(u32)); - fprintf(f, " - nfa info table : %zu bytes\n", - t->queueCount * sizeof(NfaInfo)); - fprintf(f, " - lookaround table : %u bytes\n", - t->nfaInfoOffset - t->lookaroundTableOffset); - fprintf(f, " - lookaround reach : %u bytes\n", - t->lookaroundTableOffset - t->lookaroundReachOffset); - - fprintf(f, "state space required : %u bytes\n", t->stateOffsets.end); - fprintf(f, " - history buffer : %u bytes\n", t->historyRequired); - fprintf(f, " - exhaustion vector : %u bytes\n", (t->ekeyCount + 7) / 8); - fprintf(f, " - role state mmbit : %u bytes\n", t->stateSize); - fprintf(f, " - long lit matcher : %u bytes\n", t->longLitStreamState); - fprintf(f, " - active array : %u bytes\n", - mmbit_size(t->activeArrayCount)); - fprintf(f, " - active rose : %u bytes\n", - mmbit_size(t->activeLeftCount)); - fprintf(f, " - anchored state : %u bytes\n", t->anchorStateSize); - fprintf(f, " - nfa state : %u bytes\n", t->nfaStateSize); - fprintf(f, " - (trans. nfa state): %u bytes\n", t->tStateSize); - fprintf(f, " - one whole bytes : %u bytes\n", - t->stateOffsets.anchorState - t->stateOffsets.leftfixLagTable); - fprintf(f, " - groups : %u bytes\n", - t->stateOffsets.groups_size); - fprintf(f, "\n"); - - fprintf(f, "initial groups : 0x%016llx\n", t->initialGroups); - fprintf(f, "floating groups : 0x%016llx\n", t->floating_group_mask); - fprintf(f, "handled key count : %u\n", t->handledKeyCount); - fprintf(f, "\n"); - - fprintf(f, "total literal count : %u\n", t->totalNumLiterals); - fprintf(f, " prog table size : %u\n", t->literalCount); - fprintf(f, " delayed literals : %u\n", t->delay_count); - - fprintf(f, "\n"); - fprintf(f, " minWidth : %u\n", t->minWidth); - fprintf(f, " minWidthExcludingBoundaries : %u\n", - t->minWidthExcludingBoundaries); - fprintf(f, " maxBiAnchoredWidth : %s\n", - rose_off(t->maxBiAnchoredWidth).str().c_str()); - fprintf(f, " minFloatLitMatchOffset : %s\n", - rose_off(t->floatingMinLiteralMatchOffset).str().c_str()); - fprintf(f, " delay_base_id : %u\n", t->delay_base_id); - fprintf(f, " maxFloatingDelayedMatch : %s\n", - rose_off(t->maxFloatingDelayedMatch).str().c_str()); - - if (atable) { - fprintf(f, "\nAnchored literal matcher stats:\n\n"); - dumpAnchoredStats(atable, f); - } - - if (ftable) { - fprintf(f, "\nFloating literal matcher stats:\n\n"); - hwlmPrintStats(ftable, f); - } - - if (etable) { - fprintf(f, "\nEOD-anchored literal matcher stats:\n\n"); - hwlmPrintStats(etable, f); - } - - if (sbtable) { - fprintf(f, "\nSmall-block literal matcher stats:\n\n"); - hwlmPrintStats(sbtable, f); - } - - dumpLongLiteralTable(t, f); -} - -#define DUMP_U8(o, member) \ - fprintf(f, " %-32s: %hhu/%hhx\n", #member, o->member, o->member) -#define DUMP_U32(o, member) \ - fprintf(f, " %-32s: %u/%08x\n", #member, 
-#define DUMP_U32(o, member) \
-    fprintf(f, " %-32s: %u/%08x\n", #member, o->member, o->member)
-#define DUMP_U64(o, member) \
-    fprintf(f, " %-32s: %llu/%016llx\n", #member, o->member, o->member)
-
-void roseDumpStructRaw(const RoseEngine *t, FILE *f) {
-    fprintf(f, "struct RoseEngine {\n");
-    DUMP_U8(t, noFloatingRoots);
-    DUMP_U8(t, requiresEodCheck);
-    DUMP_U8(t, hasOutfixesInSmallBlock);
-    DUMP_U8(t, runtimeImpl);
-    DUMP_U8(t, mpvTriggeredByLeaf);
-    DUMP_U8(t, canExhaust);
-    DUMP_U8(t, hasSom);
-    DUMP_U8(t, somHorizon);
-    DUMP_U8(t, needsCatchup);
-    DUMP_U32(t, mode);
-    DUMP_U32(t, historyRequired);
-    DUMP_U32(t, ekeyCount);
-    DUMP_U32(t, dkeyCount);
-    DUMP_U32(t, dkeyLogSize);
-    DUMP_U32(t, invDkeyOffset);
-    DUMP_U32(t, somLocationCount);
-    DUMP_U32(t, somLocationFatbitSize);
-    DUMP_U32(t, rolesWithStateCount);
-    DUMP_U32(t, stateSize);
-    DUMP_U32(t, anchorStateSize);
-    DUMP_U32(t, nfaStateSize);
-    DUMP_U32(t, tStateSize);
-    DUMP_U32(t, smallWriteOffset);
-    DUMP_U32(t, amatcherOffset);
-    DUMP_U32(t, ematcherOffset);
-    DUMP_U32(t, fmatcherOffset);
-    DUMP_U32(t, sbmatcherOffset);
-    DUMP_U32(t, longLitTableOffset);
-    DUMP_U32(t, amatcherMinWidth);
-    DUMP_U32(t, fmatcherMinWidth);
-    DUMP_U32(t, eodmatcherMinWidth);
-    DUMP_U32(t, amatcherMaxBiAnchoredWidth);
-    DUMP_U32(t, fmatcherMaxBiAnchoredWidth);
-    DUMP_U32(t, litProgramOffset);
-    DUMP_U32(t, litDelayRebuildProgramOffset);
-    DUMP_U32(t, reportProgramOffset);
-    DUMP_U32(t, reportProgramCount);
-    DUMP_U32(t, literalCount);
-    DUMP_U32(t, activeArrayCount);
-    DUMP_U32(t, activeLeftCount);
-    DUMP_U32(t, queueCount);
-    DUMP_U32(t, activeQueueArraySize);
-    DUMP_U32(t, eagerIterOffset);
-    DUMP_U32(t, handledKeyCount);
-    DUMP_U32(t, handledKeyFatbitSize);
-    DUMP_U32(t, leftOffset);
-    DUMP_U32(t, roseCount);
-    DUMP_U32(t, lookaroundTableOffset);
-    DUMP_U32(t, lookaroundReachOffset);
-    DUMP_U32(t, eodProgramOffset);
-    DUMP_U32(t, lastByteHistoryIterOffset);
-    DUMP_U32(t, minWidth);
-    DUMP_U32(t, minWidthExcludingBoundaries);
-    DUMP_U32(t, maxBiAnchoredWidth);
-    DUMP_U32(t, anchoredDistance);
-    DUMP_U32(t, anchoredMinDistance);
-    DUMP_U32(t, floatingDistance);
-    DUMP_U32(t, floatingMinDistance);
-    DUMP_U32(t, smallBlockDistance);
-    DUMP_U32(t, floatingMinLiteralMatchOffset);
-    DUMP_U32(t, nfaInfoOffset);
-    DUMP_U64(t, initialGroups);
-    DUMP_U64(t, floating_group_mask);
-    DUMP_U32(t, size);
-    DUMP_U32(t, delay_count);
-    DUMP_U32(t, delay_fatbit_size);
-    DUMP_U32(t, delay_base_id);
-    DUMP_U32(t, anchored_count);
-    DUMP_U32(t, anchored_fatbit_size);
-    DUMP_U32(t, anchored_base_id);
-    DUMP_U32(t, maxFloatingDelayedMatch);
-    DUMP_U32(t, delayRebuildLength);
-    DUMP_U32(t, stateOffsets.history);
-    DUMP_U32(t, stateOffsets.exhausted);
-    DUMP_U32(t, stateOffsets.activeLeafArray);
-    DUMP_U32(t, stateOffsets.activeLeftArray);
-    DUMP_U32(t, stateOffsets.activeLeftArray_size);
-    DUMP_U32(t, stateOffsets.leftfixLagTable);
-    DUMP_U32(t, stateOffsets.anchorState);
-    DUMP_U32(t, stateOffsets.groups);
-    DUMP_U32(t, stateOffsets.groups_size);
-    DUMP_U32(t, stateOffsets.longLitState);
-    DUMP_U32(t, stateOffsets.somLocation);
-    DUMP_U32(t, stateOffsets.somValid);
-    DUMP_U32(t, stateOffsets.somWritable);
-    DUMP_U32(t, stateOffsets.end);
-    DUMP_U32(t, boundary.reportEodOffset);
-    DUMP_U32(t, boundary.reportZeroOffset);
-    DUMP_U32(t, boundary.reportZeroEodOffset);
-    DUMP_U32(t, totalNumLiterals);
-    DUMP_U32(t, asize);
-    DUMP_U32(t, outfixBeginQueue);
-    DUMP_U32(t, outfixEndQueue);
-    DUMP_U32(t, leftfixBeginQueue);
-    DUMP_U32(t, initMpvNfa);
-    DUMP_U32(t, rosePrefixCount);
-    DUMP_U32(t, activeLeftIterOffset);
-    DUMP_U32(t, ematcherRegionSize);
-    DUMP_U32(t, somRevCount);
-    DUMP_U32(t, somRevOffsetOffset);
-    DUMP_U32(t, longLitStreamState);
-    fprintf(f, "}\n");
-    fprintf(f, "sizeof(RoseEngine) = %zu\n", sizeof(RoseEngine));
-}
-
-void roseDumpComponents(const RoseEngine *t, bool dump_raw,
-                        const string &base) {
-    dumpComponentInfo(t, base);
-    dumpComponentInfoCsv(t, base);
-    dumpNfas(t, dump_raw, base);
-    dumpAnchored(t, base);
-    dumpRevComponentInfo(t, base);
-    dumpRevNfas(t, dump_raw, base);
-    dumpRoseLitPrograms(t, base + "/rose_lit_programs.txt");
-    dumpRoseEodPrograms(t, base + "/rose_eod_programs.txt");
-    dumpRoseReportPrograms(t, base + "/rose_report_programs.txt");
-}
-
-void roseDumpInternals(const RoseEngine *t, const string &base) {
-    if (!t) {
-        DEBUG_PRINTF("no rose\n");
-        return;
-    }
-
-    const void *atable = getAnchoredMatcher(t);
-    const HWLM *ftable = getFloatingMatcher(t);
-    const HWLM *etable = getEodMatcher(t);
-
-    if (atable) {
-        FILE *f = fopen((base + "/anchored.raw").c_str(), "w");
-        if (f) {
-            fwrite(atable, 1, t->asize, f);
-            fclose(f);
-        }
-    }
-
-    if (ftable) {
-        FILE *f = fopen((base + "/floating.raw").c_str(), "w");
-        if (f) {
-            fwrite(ftable, 1, hwlmSize(ftable), f);
-            fclose(f);
-        }
-    }
-
-    if (etable) {
-        FILE *f = fopen((base + "/eod.raw").c_str(), "w");
-        if (f) {
-            fwrite(etable, 1, hwlmSize(etable), f);
-            fclose(f);
-        }
-    }
-
-    FILE *f = fopen((base + "/rose.raw").c_str(), "w");
-    assert(f);
-    fwrite(t, 1, roseSize(t), f);
-    fclose(f);
-
-    f = fopen((base + "/rose_struct.txt").c_str(), "w");
-    roseDumpStructRaw(t, f);
-    fclose(f);
-
-    roseDumpComponents(t, true, base);
-}
-
-} // namespace ue2
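
Aside (illustration only, not part of the patch): the dump code removed above relies on Hyperscan's serialised-bytecode convention, in which each table inside the engine blob is located by adding a byte offset to the engine's base pointer, with offset 0 meaning "absent"; that is all loadFromByteCodeOffset() does. A minimal sketch of the idiom, with hypothetical names:

    #include <stddef.h>
    #include <stdint.h>

    struct engine_header {
        uint32_t size;         /* total size of the blob in bytes */
        uint32_t table_offset; /* offset from blob start, 0 if absent */
    };

    /* Resolve an offset stored in the bytecode to a real pointer. */
    static const void *load_from_offset(const struct engine_header *base,
                                        uint32_t offset) {
        return offset ? (const void *)((const char *)base + offset) : NULL;
    }
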
diff --git a/src/rose/rose_graph.h b/src/rose/rose_graph.h
index c3af749fb..b7e092bbd 100644
--- a/src/rose/rose_graph.h
+++ b/src/rose/rose_graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -85,7 +85,7 @@ struct LeftEngInfo {
     std::shared_ptr<TamaProto> tamarama;
     u32 lag = 0U;
     ReportID leftfix_report = MO_INVALID_IDX;
-    depth dfa_min_width = 0;
+    depth dfa_min_width{0};
     depth dfa_max_width = depth::infinity();
 
     bool operator==(const LeftEngInfo &other) const {
@@ -125,7 +125,7 @@ struct RoseSuffixInfo {
     std::shared_ptr<raw_som_dfa> haig;
     std::shared_ptr<raw_dfa> rdfa;
     std::shared_ptr<TamaProto> tamarama;
-    depth dfa_min_width = 0;
+    depth dfa_min_width{0};
     depth dfa_max_width = depth::infinity();
 
     bool operator==(const RoseSuffixInfo &b) const;
diff --git a/src/rose/rose_in_graph.h b/src/rose/rose_in_graph.h
index 0e2185768..42c59932d 100644
--- a/src/rose/rose_in_graph.h
+++ b/src/rose/rose_in_graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -55,6 +55,7 @@ namespace ue2 {
 
 class NGHolder;
 struct raw_som_dfa;
+struct raw_dfa;
 
 enum RoseInVertexType {
     RIV_LITERAL,
@@ -166,9 +167,12 @@ struct RoseInEdgeProps {
     /** \brief Maximum bound on 'dot' repeat between literals. */
     u32 maxBound;
 
-    /** \brief Prefix graph. Graph is end to (end - lag). */
+    /** \brief Graph on edge. Graph is end to (end - lag). */
     std::shared_ptr<NGHolder> graph;
 
+    /** \brief DFA version of graph, if we have already determinised. */
+    std::shared_ptr<raw_dfa> dfa;
+
     /** \brief Haig version of graph, if required. */
     std::shared_ptr<raw_som_dfa> haig;
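
Aside (illustration only, not part of the patch): the change from `depth dfa_min_width = 0;` to `depth dfa_min_width{0};` above is the usual fix when a converting constructor is explicit: copy-initialisation with `=` needs an implicit conversion, while direct-list-initialisation may call an explicit constructor. A standalone sketch with a hypothetical Depth class (not Hyperscan's own):

    #include <cstdint>

    class Depth {
    public:
        explicit Depth(uint32_t v) : val(v) {}
    private:
        uint32_t val;
    };

    struct Info {
        // Depth d = 0;  // ill-formed: copy-init cannot use an explicit ctor
        Depth d{0};      // OK: direct-list-initialisation
    };
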
diff --git a/src/rose/rose_internal.h b/src/rose/rose_internal.h
index 411ce03f6..57395c9dc 100644
--- a/src/rose/rose_internal.h
+++ b/src/rose/rose_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -304,7 +304,6 @@ struct RoseEngine {
     u8 hasSom; /**< has at least one pattern which tracks SOM. */
     u8 somHorizon; /**< width in bytes of SOM offset storage (governed by SOM
                     * precision) */
-    u8 needsCatchup; /** catch up needs to be run on every report. */
     u32 mode; /**< scanning mode, one of HS_MODE_{BLOCK,STREAM,VECTORED} */
     u32 historyRequired; /**< max amount of history required for streaming */
     u32 ekeyCount; /**< number of exhaustion keys */
@@ -326,6 +325,7 @@ struct RoseEngine {
     u32 amatcherOffset; // offset of the anchored literal matcher (bytes)
     u32 ematcherOffset; // offset of the eod-anchored literal matcher (bytes)
     u32 fmatcherOffset; // offset of the floating literal matcher (bytes)
+    u32 drmatcherOffset; // offset of the delayed rebuild table (bytes)
     u32 sbmatcherOffset; // offset of the small-block literal matcher (bytes)
     u32 longLitTableOffset; // offset of the long literal table
     u32 amatcherMinWidth; /**< minimum number of bytes required for a pattern
@@ -343,12 +343,6 @@ struct RoseEngine {
     u32 fmatcherMaxBiAnchoredWidth; /**< maximum number of bytes that can still
                                      * produce a match for a pattern involved
                                      * with the anchored table. */
-    /** \brief Offset of u32 array of program offsets for literals. */
-    u32 litProgramOffset;
-
-    /** \brief Offset of u32 array of delay rebuild program offsets for
-     * literals. */
-    u32 litDelayRebuildProgramOffset;
 
     /**
      * \brief Offset of u32 array of program offsets for reports used by
@@ -362,12 +356,15 @@ struct RoseEngine {
     u32 reportProgramCount;
 
     /**
-     * \brief Number of entries in the arrays pointed to by litProgramOffset,
-     * litDelayRebuildProgramOffset.
-     *
-     * Note: NOT the total number of literals.
+     * \brief Offset of u32 array of program offsets for delayed replay of
+     * literals.
+     */
+    u32 delayProgramOffset;
+
+    /**
+     * \brief Offset of u32 array of program offsets for anchored literals.
      */
-    u32 literalCount;
+    u32 anchoredProgramOffset;
 
     u32 activeArrayCount; //number of nfas tracked in the active array
     u32 activeLeftCount; //number of nfas tracked in the active rose array
@@ -386,9 +383,6 @@ struct RoseEngine {
     u32 leftOffset;
     u32 roseCount;
 
-    u32 lookaroundTableOffset; //!< base of lookaround offset list (of s8 values)
-    u32 lookaroundReachOffset; /**< base of lookaround reach bitvectors (32
-                                * bytes each) */
 
     u32 eodProgramOffset; //!< EOD program, otherwise 0.
 
@@ -419,12 +413,8 @@ struct RoseEngine {
     u32 size; // (bytes)
     u32 delay_count; /* number of delayed literal ids. */
     u32 delay_fatbit_size; //!< size of each delay fatbit in scratch (bytes)
-    u32 delay_base_id; /* literal id of the first delayed literal.
-                        * delayed literal ids are contiguous */
     u32 anchored_count; /* number of anchored literal ids */
     u32 anchored_fatbit_size; //!< size of each anch fatbit in scratch (bytes)
-    u32 anchored_base_id; /* literal id of the first literal in the A table.
-                           * anchored literal ids are contiguous */
     u32 maxFloatingDelayedMatch; /* max offset that a delayed literal can
                                   * usefully be reported */
     u32 delayRebuildLength; /* length of the history region which needs to be
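
Aside (illustration only, not part of the patch): delayProgramOffset and anchoredProgramOffset follow the same two-hop layout that litProgramOffset used: the engine stores the offset of a u32 array, and each entry of that array is itself the byte offset of a program. A simplified sketch of the lookup (field names as above, error handling elided):

    /* Fetch the program for delayed-literal id `id`: engine base ->
     * u32 offset array -> program bytes. */
    static const char *get_delay_program(const struct RoseEngine *t, u32 id) {
        const u32 *programs = (const u32 *)((const char *)t
                                            + t->delayProgramOffset);
        return (const char *)t + programs[id];
    }
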
diff --git a/src/rose/rose_program.h b/src/rose/rose_program.h
index ed9133162..78b123d5c 100644
--- a/src/rose/rose_program.h
+++ b/src/rose/rose_program.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,6 +36,7 @@
 #include "som/som_operation.h"
 #include "rose_internal.h"
 #include "ue2common.h"
+#include "util/simd_types.h"
 
 /** \brief Minimum alignment for each instruction in memory. */
 #define ROSE_INSTR_MIN_ALIGN 8U
@@ -61,7 +62,7 @@ enum RoseInstructionCode {
     ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state.
     ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state.
     ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches.
-    ROSE_INSTR_RECORD_ANCHORED, //!< Record an anchored literal match.
+    ROSE_INSTR_DUMMY_NOP, //!< NOP. Should not exist in build programs.
     ROSE_INSTR_CATCH_UP, //!< Catch up engines, anchored matches.
     ROSE_INSTR_CATCH_UP_MPV, //!< Catch up the MPV.
     ROSE_INSTR_SOM_ADJUST, //!< Set SOM from a distance to EOM.
@@ -129,7 +130,55 @@ enum RoseInstructionCode {
      */
     ROSE_INSTR_CHECK_LONG_LIT_NOCASE,
 
-    LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_LONG_LIT_NOCASE //!< Sentinel.
+    /**
+     * \brief Confirm a case-sensitive "medium length" literal at the current
+     * offset. In streaming mode, this will check history if needed.
+     */
+    ROSE_INSTR_CHECK_MED_LIT,
+
+    /**
+     * \brief Confirm a case-insensitive "medium length" literal at the current
+     * offset. In streaming mode, this will check history if needed.
+     */
+    ROSE_INSTR_CHECK_MED_LIT_NOCASE,
+
+    /**
+     * \brief Clear the "work done" flag used by the SQUASH_GROUPS instruction.
+     */
+    ROSE_INSTR_CLEAR_WORK_DONE,
+
+    /** \brief Check lookaround if it has multiple paths. */
+    ROSE_INSTR_MULTIPATH_LOOKAROUND,
+
+    /**
+     * \brief Use shufti to check lookaround with multiple paths. The total
+     * length of the paths is 16 bytes at most and shufti has 8 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8,
+
+    /**
+     * \brief Use shufti to check lookaround with multiple paths. The total
+     * length of the paths is 32 bytes at most and shufti has 8 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8,
+
+    /**
+     * \brief Use shufti to check lookaround with multiple paths. The total
+     * length of the paths is 32 bytes at most and shufti has 16 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16,
+
+    /**
+     * \brief Use shufti to check multiple paths lookaround. The total
+     * length of the paths is 64 bytes at most and shufti has 8 buckets.
+     * All paths can be at most 16 bytes long.
+     */
+    ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64,
+
+    LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64 //!< Sentinel.
 };
 
 struct ROSE_STRUCT_END {
@@ -139,13 +188,14 @@ struct ROSE_STRUCT_END {
 struct ROSE_STRUCT_ANCHORED_DELAY {
     u8 code; //!< From enum RoseInstructionCode.
     rose_group groups; //!< Bitmask.
-    u32 done_jump; //!< Jump forward this many bytes if successful.
+    u32 anch_id; //!< Program to restart after the delay.
+    u32 done_jump; //!< Jump forward this many bytes if we have to delay.
 };
 
-/** Note: check failure will halt program. */
 struct ROSE_STRUCT_CHECK_LIT_EARLY {
     u8 code; //!< From enum RoseInstructionCode.
     u32 min_offset; //!< Minimum offset for this literal.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
 /** Note: check failure will halt program. */
@@ -175,14 +225,15 @@ struct ROSE_STRUCT_CHECK_NOT_HANDLED {
 struct ROSE_STRUCT_CHECK_SINGLE_LOOKAROUND {
     u8 code; //!< From enum RoseInstructionCode.
     s8 offset; //!< The offset of the byte to examine.
-    u32 reach_index; //!< The index of the reach table entry to use.
+    u32 reach_index; //!< Index for lookaround reach bitvectors.
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
 struct ROSE_STRUCT_CHECK_LOOKAROUND {
     u8 code; //!< From enum RoseInstructionCode.
-    u32 index;
-    u32 count;
+    u32 look_index; //!< Offset in bytecode of lookaround offset list.
+    u32 reach_index; //!< Offset in bytecode of lookaround reach bitvectors.
+    u32 count; //!< The count of lookaround entries in one instruction.
     u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
@@ -277,9 +328,8 @@ struct ROSE_STRUCT_PUSH_DELAYED {
     u32 index; // Delay literal index (relative to first delay lit).
 };
 
-struct ROSE_STRUCT_RECORD_ANCHORED {
+struct ROSE_STRUCT_DUMMY_NOP {
     u8 code; //!< From enum RoseInstructionCode.
-    u32 id; //!< Literal ID.
 };
 
 struct ROSE_STRUCT_CATCH_UP {
@@ -477,18 +527,102 @@ struct ROSE_STRUCT_MATCHER_EOD {
     u8 code; //!< From enum RoseInstructionCode.
 };
 
-/** Note: check failure will halt program. */
 struct ROSE_STRUCT_CHECK_LONG_LIT {
     u8 code; //!< From enum RoseInstructionCode.
     u32 lit_offset; //!< Offset of literal string.
     u32 lit_length; //!< Length of literal string.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
-/** Note: check failure will halt program. */
 struct ROSE_STRUCT_CHECK_LONG_LIT_NOCASE {
     u8 code; //!< From enum RoseInstructionCode.
     u32 lit_offset; //!< Offset of literal string.
     u32 lit_length; //!< Length of literal string.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MED_LIT {
+    u8 code; //!< From enum RoseInstructionCode.
+    u32 lit_offset; //!< Offset of literal string.
+    u32 lit_length; //!< Length of literal string.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MED_LIT_NOCASE {
+    u8 code; //!< From enum RoseInstructionCode.
+    u32 lit_offset; //!< Offset of literal string.
+    u32 lit_length; //!< Length of literal string.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CLEAR_WORK_DONE {
+    u8 code; //!< From enum RoseInstructionCode.
+};
+
+struct ROSE_STRUCT_MULTIPATH_LOOKAROUND {
+    u8 code; //!< From enum RoseInstructionCode.
+    u32 look_index; //!< Offset in bytecode of lookaround offset list.
+    u32 reach_index; //!< Offset in bytecode of lookaround reach bitvectors.
+    u32 count; //!< The lookaround byte numbers for each path.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u8 start_mask[MULTIPATH_MAX_LEN]; /*!< Used to initialize path if left-most
+                                       * data is missed. */
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
 };
 
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 nib_mask[2 * sizeof(m128)]; //!< High and low nibble mask in shufti.
+    u8 bucket_select_mask[sizeof(m128)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[sizeof(m128)]; //!< Shuffle mask for data ordering.
+    u32 hi_bits_mask; //!< High-bits used in multi-path validation.
+    u32 lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u32 neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti.
+    u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering.
+    u32 hi_bits_mask; //!< High-bits used in multi-path validation.
+    u32 lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u32 neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[sizeof(m256)]; //!< High nibble mask in shufti.
+    u8 lo_mask[sizeof(m256)]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask_hi[sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 bucket_select_mask_lo[sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering.
+    u32 hi_bits_mask; //!< High-bits used in multi-path validation.
+    u32 lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u32 neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti.
+    u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[2 * sizeof(m256)]; //!< Mask for bucket assigning.
+    u8 data_select_mask[2 * sizeof(m256)]; //!< Shuffle mask for data ordering.
+    u64a hi_bits_mask; //!< High-bits used in multi-path validation.
+    u64a lo_bits_mask; //!< Low-bits used in multi-path validation.
+    u64a neg_mask; //!< 64 bits negation mask.
+    s32 base_offset; //!< Relative offset of the first byte.
+    s32 last_start; //!< The latest start offset among 8 paths.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
 
 #endif // ROSE_ROSE_PROGRAM_H
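
Aside (illustration only, not part of the patch): the hi_mask/lo_mask pairs in the structures above drive a PSHUFB-based "shufti" character-class test. Each byte's low and high nibbles index a 16-entry table, and the two lookups are ANDed together, so bit b of the result survives only if the byte can belong to bucket b's character class. A scalar model of a single byte (the real code processes 16, 32 or 64 bytes at a time in SIMD):

    /* hi_table/lo_table are precomputed so that bucket bit b is set in
     * both lookups exactly when byte c is in bucket b's class. */
    static unsigned char shufti_buckets(unsigned char c,
                                        const unsigned char hi_table[16],
                                        const unsigned char lo_table[16]) {
        return hi_table[c >> 4] & lo_table[c & 0xf];
    }
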
diff --git a/src/rose/stream.c b/src/rose/stream.c
index 9599612f0..c68cd8ab9 100644
--- a/src/rose/stream.c
+++ b/src/rose/stream.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -412,16 +412,18 @@ void ensureStreamNeatAndTidy(const struct RoseEngine *t, char *state,
 }
 
 static really_inline
-void do_rebuild(const struct RoseEngine *t, const struct HWLM *ftable,
-                struct hs_scratch *scratch) {
+void do_rebuild(const struct RoseEngine *t, struct hs_scratch *scratch) {
+    assert(t->drmatcherOffset);
     assert(!can_stop_matching(scratch));
+
+    const struct HWLM *hwlm = getByOffset(t, t->drmatcherOffset);
     size_t len = MIN(scratch->core_info.hlen, t->delayRebuildLength);
     const u8 *buf = scratch->core_info.hbuf + scratch->core_info.hlen - len;
 
     DEBUG_PRINTF("BEGIN FLOATING REBUILD over %zu bytes\n", len);
 
     scratch->core_info.status &= ~STATUS_DELAY_DIRTY;
 
-    hwlmExec(ftable, buf, len, 0, roseDelayRebuildCallback, scratch,
+    hwlmExec(hwlm, buf, len, 0, roseDelayRebuildCallback, scratch,
              scratch->tctxt.groups);
     assert(!can_stop_matching(scratch));
 }
@@ -512,6 +514,34 @@ void runEagerPrefixesStream(const struct RoseEngine *t,
     }
 }
 
+static really_inline
+int can_never_match(const struct RoseEngine *t, char *state,
+                    struct hs_scratch *scratch, size_t length, u64a offset) {
+    struct RoseContext *tctxt = &scratch->tctxt;
+
+    if (tctxt->groups) {
+        DEBUG_PRINTF("still has active groups\n");
+        return 0;
+    }
+
+    if (offset + length <= t->anchoredDistance) { /* not < as may have eod */
+        DEBUG_PRINTF("still in anchored region\n");
+        return 0;
+    }
+
+    if (t->lastByteHistoryIterOffset) { /* last byte history is hard */
+        DEBUG_PRINTF("last byte history\n");
+        return 0;
+    }
+
+    if (mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) {
+        DEBUG_PRINTF("active leaf\n");
+        return 0;
+    }
+
+    return 1;
+}
+
 void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
     DEBUG_PRINTF("OH HAI [%llu, %llu)\n", scratch->core_info.buf_offset,
                  scratch->core_info.buf_offset + (u64a)scratch->core_info.len);
@@ -607,15 +637,12 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
                  rebuild, scratch->core_info.status,
                  t->maxFloatingDelayedMatch, offset);
 
-    if (!flen) {
-        if (rebuild) { /* rebuild floating delayed match stuff */
-            do_rebuild(t, ftable, scratch);
-        }
-        goto flush_delay_and_exit;
+    if (rebuild) { /* rebuild floating delayed match stuff */
+        do_rebuild(t, scratch);
     }
 
-    if (rebuild) { /* rebuild floating delayed match stuff */
-        do_rebuild(t, ftable, scratch);
+    if (!flen) {
+        goto flush_delay_and_exit;
     }
 
     if (flen + offset <= t->floatingMinDistance) {
@@ -647,6 +674,14 @@ void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) {
     if (!can_stop_matching(scratch)) {
         ensureStreamNeatAndTidy(t, state, scratch, length, offset);
    }
+
+    if (!told_to_stop_matching(scratch)
+        && can_never_match(t, state, scratch, length, offset)) {
+        DEBUG_PRINTF("PATTERN SET IS EXHAUSTED\n");
+        scratch->core_info.status = STATUS_EXHAUSTED;
+        return;
+    }
+
     DEBUG_PRINTF("DONE STREAMING SCAN, status = %u\n",
                  scratch->core_info.status);
     return;
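
Aside (illustration only, not part of the patch): the new can_never_match() path above implements the changelog item about detecting exhaustion in more situations; when it fires, the stream's status is set to STATUS_EXHAUSTED and later writes to the stream can return without scanning. Nothing changes at the API level, as this sketch shows (error handling elided; on_match and scan_chunks are hypothetical names):

    #include <hs/hs.h>

    static int on_match(unsigned id, unsigned long long from,
                        unsigned long long to, unsigned flags, void *ctx) {
        (void)id; (void)from; (void)to; (void)flags; (void)ctx;
        return 0; /* continue matching */
    }

    static void scan_chunks(const hs_database_t *db, hs_scratch_t *scratch,
                            const char **chunks, const unsigned *lens,
                            unsigned n) {
        hs_stream_t *stream = NULL;
        hs_open_stream(db, 0, &stream);
        for (unsigned i = 0; i < n; i++) {
            /* Once the runtime decides this stream can never match again,
             * these calls return quickly and deliver no callbacks. */
            hs_scan_stream(stream, chunks[i], lens[i], 0, scratch, on_match,
                           NULL);
        }
        hs_close_stream(stream, scratch, on_match, NULL);
    }
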
diff --git a/src/rose/stream_long_lit.h b/src/rose/stream_long_lit.h
index d78e28635..0736ec88e 100644
--- a/src/rose/stream_long_lit.h
+++ b/src/rose/stream_long_lit.h
@@ -111,7 +111,7 @@ void loadLongLiteralState(const struct RoseEngine *t, char *state,
     }
 
     // If we don't have any long literals in play, these values must point to
-    // the real history buffer so that CHECK_LITERAL instructions examine the
+    // the real history buffer so that CHECK_LONG_LIT instructions examine the
     // history buffer.
     scratch->tctxt.ll_buf = scratch->core_info.hbuf;
     scratch->tctxt.ll_len = scratch->core_info.hlen;
diff --git a/src/rose/validate_shufti.h b/src/rose/validate_shufti.h
index 49d2c2fe6..1dc855d99 100644
--- a/src/rose/validate_shufti.h
+++ b/src/rose/validate_shufti.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -46,10 +46,11 @@ void dumpMask(const void *mask, int len) {
 static really_inline
 int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
                             const m256 lo_mask, const m256 and_mask,
-                            const u32 neg_mask, const u16 valid_data_mask) {
+                            const u32 neg_mask, const u32 valid_data_mask) {
     m256 low4bits = set32x8(0xf);
-    m256 c_lo = vpshufb(lo_mask, and256(data, low4bits));
-    m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4));
+    m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
+    m256 c_hi = pshufb_m256(hi_mask,
+                            rshift64_m256(andnot256(low4bits, data), 4));
     m256 t = and256(c_lo, c_hi);
     u32 nresult = movemask256(eq256(and256(t, and_mask), zeroes256()));
 #ifdef DEBUG
@@ -75,10 +76,10 @@ int validateShuftiMask16x16(const m256 data, const m256 hi_mask,
 static really_inline
 int validateShuftiMask16x8(const m128 data, const m256 nib_mask,
                            const m128 and_mask, const u32 neg_mask,
-                           const u16 valid_data_mask) {
+                           const u32 valid_data_mask) {
     m256 data_m256 = combine2x128(rshift64_m128(data, 4), data);
     m256 low4bits = set32x8(0xf);
-    m256 c_nib = vpshufb(nib_mask, and256(data_m256, low4bits));
+    m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits));
     m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
     m128 nresult = eq128(and128(t, and_mask), zeroes128());
 #ifdef DEBUG
@@ -101,8 +102,9 @@ int validateShuftiMask32x8(const m256 data, const m256 hi_mask,
                            const m256 lo_mask, const m256 and_mask,
                            const u32 neg_mask, const u32 valid_data_mask) {
     m256 low4bits = set32x8(0xf);
-    m256 c_lo = vpshufb(lo_mask, and256(data, low4bits));
-    m256 c_hi = vpshufb(hi_mask, rshift64_m256(andnot256(low4bits, data), 4));
+    m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits));
+    m256 c_hi = pshufb_m256(hi_mask,
+                            rshift64_m256(andnot256(low4bits, data), 4));
     m256 t = and256(c_lo, c_hi);
     m256 nresult = eq256(and256(t, and_mask), zeroes256());
 #ifdef DEBUG
@@ -134,10 +136,10 @@ int validateShuftiMask32x16(const m256 data,
     m256 low4bits = set32x8(0xf);
     m256 data_lo = and256(data, low4bits);
     m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
-    m256 c_lo_1 = vpshufb(lo_mask_1, data_lo);
-    m256 c_lo_2 = vpshufb(lo_mask_2, data_lo);
-    m256 c_hi_1 = vpshufb(hi_mask_1, data_hi);
-    m256 c_hi_2 = vpshufb(hi_mask_2, data_hi);
+    m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
+    m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo);
+    m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi);
+    m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi);
     m256 t1 = and256(c_lo_1, c_hi_1);
     m256 t2 = and256(c_lo_2, c_hi_2);
     m256 result = or256(and256(t1, bucket_mask_lo), and256(t2, bucket_mask_hi));
@@ -172,4 +174,121 @@ int validateShuftiMask32x16(const m256 data,
     u32 cmp_result = (nresult ^ neg_mask) & valid_data_mask;
     return !cmp_result;
 }
+
+static really_inline
+int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) {
+    u32 t = ~(data | hi_bits);
+    t += lo_bits;
+    t &= (~data) & hi_bits;
+    DEBUG_PRINTF("t %x\n", t);
+    return !!t;
+}
+
+static really_inline
+int checkMultipath64(u64a data, u64a hi_bits, u64a lo_bits) {
+    u64a t = ~(data | hi_bits);
+    t += lo_bits;
+    t &= (~data) & hi_bits;
+    DEBUG_PRINTF("t %llx\n", t);
+    return !!t;
+}
+
+static really_inline
+int validateMultipathShuftiMask16x8(const m128 data,
+                                    const m256 nib_mask,
+                                    const m128 bucket_select_mask,
+                                    const u32 hi_bits, const u32 lo_bits,
+                                    const u32 neg_mask,
+                                    const u32 valid_path_mask) {
+    m256 data_256 = combine2x128(rshift64_m128(data, 4), data);
+    m256 low4bits = set32x8(0xf);
+    m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits));
+    m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib));
+    m128 result = and128(t, bucket_select_mask);
+    u32 nresult = movemask128(eq128(result, zeroes128()));
+    u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %x\n", cmp_result);
+
+    return checkMultipath32(cmp_result, hi_bits, lo_bits);
+}
+
+static really_inline
+int validateMultipathShuftiMask32x8(const m256 data,
+                                    const m256 hi_mask, const m256 lo_mask,
+                                    const m256 bucket_select_mask,
+                                    const u32 hi_bits, const u32 lo_bits,
+                                    const u32 neg_mask,
+                                    const u32 valid_path_mask) {
+    m256 low4bits = set32x8(0xf);
+    m256 data_lo = and256(data, low4bits);
+    m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
+    m256 c_lo = pshufb_m256(lo_mask, data_lo);
+    m256 c_hi = pshufb_m256(hi_mask, data_hi);
+    m256 c = and256(c_lo, c_hi);
+    m256 result = and256(c, bucket_select_mask);
+    u32 nresult = movemask256(eq256(result, zeroes256()));
+    u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %x\n", cmp_result);
+
+    return checkMultipath32(cmp_result, hi_bits, lo_bits);
+}
+
+static really_inline
+int validateMultipathShuftiMask32x16(const m256 data,
+                                     const m256 hi_mask_1, const m256 hi_mask_2,
+                                     const m256 lo_mask_1, const m256 lo_mask_2,
+                                     const m256 bucket_select_mask_hi,
+                                     const m256 bucket_select_mask_lo,
+                                     const u32 hi_bits, const u32 lo_bits,
+                                     const u32 neg_mask,
+                                     const u32 valid_path_mask) {
+    m256 low4bits = set32x8(0xf);
+    m256 data_lo = and256(data, low4bits);
+    m256 data_hi = and256(rshift64_m256(data, 4), low4bits);
+    m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo);
+    m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo);
+    m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi);
+    m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi);
+    m256 t1 = and256(c_lo_1, c_hi_1);
+    m256 t2 = and256(c_lo_2, c_hi_2);
+    m256 result = or256(and256(t1, bucket_select_mask_lo),
+                        and256(t2, bucket_select_mask_hi));
+    u32 nresult = movemask256(eq256(result, zeroes256()));
+    u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %x\n", cmp_result);
+
+    return checkMultipath32(cmp_result, hi_bits, lo_bits);
+}
+
+static really_inline
+int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2,
+                                  const m256 hi_mask, const m256 lo_mask,
+                                  const m256 bucket_select_mask_1,
+                                  const m256 bucket_select_mask_2,
+                                  const u64a hi_bits, const u64a lo_bits,
+                                  const u64a neg_mask,
+                                  const u64a valid_path_mask) {
+    m256 low4bits = set32x8(0xf);
+    m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits));
+    m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits));
+    m256 c_hi_1 = pshufb_m256(hi_mask,
+                              rshift64_m256(andnot256(low4bits, data_1), 4));
+    m256 c_hi_2 = pshufb_m256(hi_mask,
+                              rshift64_m256(andnot256(low4bits, data_2), 4));
+    m256 t1 = and256(c_lo_1, c_hi_1);
+    m256 t2 = and256(c_lo_2, c_hi_2);
+    m256 nresult_1 = eq256(and256(t1, bucket_select_mask_1), zeroes256());
+    m256 nresult_2 = eq256(and256(t2, bucket_select_mask_2), zeroes256());
+    u64a nresult = (u64a)movemask256(nresult_1) |
+                   (u64a)movemask256(nresult_2) << 32;
+    u64a cmp_result = (nresult ^ neg_mask) | valid_path_mask;
+
+    DEBUG_PRINTF("cmp_result %llx\n", cmp_result);
+
+    return checkMultipath64(cmp_result, hi_bits, lo_bits);
+}
+
 #endif
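
Aside (illustration only, not part of the patch): checkMultipath32() and checkMultipath64() above use a carry-propagation trick. lo_bits marks the least significant bit of each path's field within data, hi_bits the most significant; a carry injected at a field's low end can only reach its high bit if every data bit in between is zero, i.e. if every byte of that path matched. A standalone worked example with two 4-bit fields:

    #include <assert.h>
    #include <stdint.h>

    /* Same arithmetic as checkMultipath32: nonzero iff at least one
     * lo..hi field in data is entirely zero. */
    static int any_field_all_zero(uint32_t data, uint32_t hi, uint32_t lo) {
        uint32_t t = ~(data | hi);
        t += lo;
        return !!(t & ~data & hi);
    }

    int main(void) {
        uint32_t lo = 0x11, hi = 0x88; /* fields: bits 0..3 and bits 4..7 */
        assert(any_field_all_zero(0x00, hi, lo));  /* both paths match */
        assert(any_field_all_zero(0xf0, hi, lo));  /* low field all zero */
        assert(!any_field_all_zero(0x42, hi, lo)); /* a 1 in each field */
        return 0;
    }
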
diff --git a/src/runtime.c b/src/runtime.c
index a2ed10260..5725cf93a 100644
--- a/src/runtime.c
+++ b/src/runtime.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -214,7 +214,7 @@ void pureLiteralBlockExec(const struct RoseEngine *rose,
     scratch->tctxt.groups = rose->initialGroups;
 
     hwlmExec(ftable, buffer, length, 0, roseCallback, scratch,
-             rose->initialGroups);
+             rose->initialGroups & rose->floating_group_mask);
 }
 
 static really_inline
@@ -311,9 +311,10 @@ void runSmallWriteEngine(const struct SmallWriteEngine *smwr,
 }
 
 HS_PUBLIC_API
-hs_error_t hs_scan(const hs_database_t *db, const char *data, unsigned length,
-                   unsigned flags, hs_scratch_t *scratch,
-                   match_event_handler onEvent, void *userCtx) {
+hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data,
+                            unsigned length, unsigned flags,
+                            hs_scratch_t *scratch, match_event_handler onEvent,
+                            void *userCtx) {
     if (unlikely(!scratch || !data)) {
         return HS_INVALID;
     }
@@ -503,8 +504,9 @@ void init_stream(struct hs_stream *s, const struct RoseEngine *rose,
 }
 
 HS_PUBLIC_API
-hs_error_t hs_open_stream(const hs_database_t *db, UNUSED unsigned flags,
-                          hs_stream_t **stream) {
+hs_error_t HS_CDECL hs_open_stream(const hs_database_t *db,
+                                   UNUSED unsigned flags,
+                                   hs_stream_t **stream) {
     if (unlikely(!stream)) {
         return HS_INVALID;
     }
@@ -656,7 +658,8 @@ void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch,
 }
 
 HS_PUBLIC_API
-hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id) {
+hs_error_t HS_CDECL hs_copy_stream(hs_stream_t **to_id,
+                                   const hs_stream_t *from_id) {
     if (!to_id) {
         return HS_INVALID;
     }
@@ -683,11 +686,11 @@ hs_error_t hs_copy_stream(hs_stream_t **to_id, const hs_stream_t *from_id) {
 }
 
 HS_PUBLIC_API
-hs_error_t hs_reset_and_copy_stream(hs_stream_t *to_id,
-                                    const hs_stream_t *from_id,
-                                    hs_scratch_t *scratch,
-                                    match_event_handler onEvent,
-                                    void *context) {
+hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id,
+                                             const hs_stream_t *from_id,
+                                             hs_scratch_t *scratch,
+                                             match_event_handler onEvent,
+                                             void *context) {
     if (!from_id || !from_id->rose) {
         return HS_INVALID;
     }
@@ -762,7 +765,7 @@ void pureLiteralStreamExec(struct hs_stream *stream_state,
     const size_t start = 0;
 
     hwlmExecStreaming(ftable, scratch, len2, start, roseCallback, scratch,
-                      rose->initialGroups);
+                      rose->initialGroups & rose->floating_group_mask);
 
     if (!told_to_stop_matching(scratch) &&
         isAllExhausted(rose, scratch->core_info.exhaustionVector)) {
@@ -906,9 +909,10 @@ hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data,
 }
 
 HS_PUBLIC_API
-hs_error_t hs_scan_stream(hs_stream_t *id, const char *data, unsigned length,
-                          unsigned flags, hs_scratch_t *scratch,
-                          match_event_handler onEvent, void *context) {
+hs_error_t HS_CDECL hs_scan_stream(hs_stream_t *id, const char *data,
+                                   unsigned length, unsigned flags,
+                                   hs_scratch_t *scratch,
+                                   match_event_handler onEvent, void *context) {
     if (unlikely(!id || !scratch || !data ||
                  !validScratch(id->rose, scratch))) {
         return HS_INVALID;
@@ -924,8 +928,9 @@ hs_error_t hs_scan_stream(hs_stream_t *id, const char *data, unsigned length,
 }
 
 HS_PUBLIC_API
-hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
-                           match_event_handler onEvent, void *context) {
+hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
+                                    match_event_handler onEvent,
+                                    void *context) {
     if (!id) {
         return HS_INVALID;
     }
@@ -947,9 +952,10 @@ hs_error_t hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch,
 }
 
 HS_PUBLIC_API
-hs_error_t hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags,
-                           hs_scratch_t *scratch, match_event_handler onEvent,
-                           void *context) {
+hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags,
+                                    hs_scratch_t *scratch,
+                                    match_event_handler onEvent,
+                                    void *context) {
     if (!id) {
         return HS_INVALID;
     }
@@ -972,7 +978,8 @@ hs_error_t hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags,
 }
 
 HS_PUBLIC_API
-hs_error_t hs_stream_size(const hs_database_t *db, size_t *stream_size) {
+hs_error_t HS_CDECL hs_stream_size(const hs_database_t *db,
+                                   size_t *stream_size) {
     if (!stream_size) {
         return HS_INVALID;
     }
@@ -1019,10 +1026,13 @@ void dumpData(const char *data, size_t len) {
 #endif
 
 HS_PUBLIC_API
-hs_error_t hs_scan_vector(const hs_database_t *db, const char * const * data,
-                          const unsigned int *length, unsigned int count,
-                          UNUSED unsigned int flags, hs_scratch_t *scratch,
-                          match_event_handler onEvent, void *context) {
+hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db,
+                                   const char * const * data,
+                                   const unsigned int *length,
+                                   unsigned int count,
+                                   UNUSED unsigned int flags,
+                                   hs_scratch_t *scratch,
+                                   match_event_handler onEvent, void *context) {
     if (unlikely(!scratch || !data || !length)) {
         return HS_INVALID;
     }
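
Aside (illustration only, not part of the patch): every public entry point in runtime.c now carries an HS_CDECL annotation. The macro comes from the public headers and pins the C calling convention for Windows builds while expanding to nothing elsewhere; presumably something along these lines (a sketch of the idea, not the verbatim header):

    #if defined(_WIN32)
    #define HS_CDECL __cdecl
    #else
    #define HS_CDECL
    #endif
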
hs_scratch_size(const hs_scratch_t *scratch, size_t *size) { +hs_error_t HS_CDECL hs_scratch_size(const hs_scratch_t *scratch, size_t *size) { if (!size || !scratch || !ISALIGNED_CL(scratch) || scratch->magic != SCRATCH_MAGIC) { return HS_INVALID; diff --git a/src/scratch.h b/src/scratch.h index b59dc8d4b..47f8afa87 100644 --- a/src/scratch.h +++ b/src/scratch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -73,8 +73,11 @@ struct catchup_pq { /** \brief Status flag: user requested termination. */ #define STATUS_TERMINATED (1U << 0) -/** \brief Status flag: all possible matches on this stream have - * been raised (i.e. all its exhaustion keys are on.) */ +/** \brief Status flag: it has been determined that it is not possible for this + * stream to raise any more matches. + * + * This may be because all its exhaustion keys are on or for other reasons + * (anchored sections not matching). */ #define STATUS_EXHAUSTED (1U << 1) /** \brief Status flag: Rose requires rebuild as delay literal matched in diff --git a/src/smallwrite/smallwrite_build.cpp b/src/smallwrite/smallwrite_build.cpp index 108bca8aa..bb933cbe3 100644 --- a/src/smallwrite/smallwrite_build.cpp +++ b/src/smallwrite/smallwrite_build.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,10 +26,16 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/** + * \file + * \brief Small-write engine build code. + */ + #include "smallwrite/smallwrite_build.h" #include "grey.h" #include "ue2common.h" +#include "compiler/compiler.h" #include "nfa/dfa_min.h" #include "nfa/mcclellancompile.h" #include "nfa/mcclellancompile_util.h" @@ -40,14 +46,18 @@ #include "nfagraph/ng_depth.h" #include "nfagraph/ng_holder.h" #include "nfagraph/ng_mcclellan.h" +#include "nfagraph/ng_reports.h" #include "nfagraph/ng_prune.h" #include "nfagraph/ng_util.h" #include "smallwrite/smallwrite_internal.h" #include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/charreach.h" +#include "util/compare.h" #include "util/compile_context.h" #include "util/container.h" #include "util/make_unique.h" +#include "util/ue2_graph.h" #include "util/ue2string.h" #include "util/verify_types.h" @@ -56,12 +66,55 @@ #include #include +#include + using namespace std; namespace ue2 { -#define LITERAL_MERGE_CHUNK_SIZE 25 #define DFA_MERGE_MAX_STATES 8000 +#define MAX_TRIE_VERTICES 8000 + +struct LitTrieVertexProps { + LitTrieVertexProps() = default; + explicit LitTrieVertexProps(u8 c_in) : c(c_in) {} + size_t index; // managed by ue2_graph + u8 c = 0; //!< character reached on this vertex + flat_set reports; //!< managed reports fired on this vertex +}; + +struct LitTrieEdgeProps { + size_t index; // managed by ue2_graph +}; + +/** + * \brief BGL graph used to store a trie of literals (for later AC construction + * into a DFA). + */ +struct LitTrie + : public ue2_graph { + + LitTrie() : root(add_vertex(*this)) {} + + const vertex_descriptor root; //!< Root vertex for the trie. 
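    // (Each non-root vertex represents a single character; a path from
    // the root spells out a prefix of one or more added literals.)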
+}; + +static +bool is_empty(const LitTrie &trie) { + return num_vertices(trie) <= 1; +} + +static +std::set all_reports(const LitTrie &trie) { + std::set reports; + for (auto v : vertices_range(trie)) { + insert(&reports, trie[v].reports); + } + return reports; +} + +using LitTrieVertex = LitTrie::vertex_descriptor; +using LitTrieEdge = LitTrie::edge_descriptor; namespace { // unnamed @@ -72,9 +125,9 @@ class SmallWriteBuildImpl : public SmallWriteBuild { const CompileContext &cc); // Construct a runtime implementation. - aligned_unique_ptr build(u32 roseQuality) override; + bytecode_ptr build(u32 roseQuality) override; - void add(const NGWrapper &w) override; + void add(const NGHolder &g, const ExpressionInfo &expr) override; void add(const ue2_literal &literal, ReportID r) override; set all_reports() const override; @@ -85,13 +138,15 @@ class SmallWriteBuildImpl : public SmallWriteBuild { const CompileContext &cc; unique_ptr rdfa; - vector > cand_literals; + LitTrie lit_trie; + LitTrie lit_trie_nocase; + size_t num_literals = 0; bool poisoned; }; } // namespace -SmallWriteBuild::~SmallWriteBuild() { } +SmallWriteBuild::~SmallWriteBuild() = default; SmallWriteBuildImpl::SmallWriteBuildImpl(size_t num_patterns, const ReportManager &rm_in, @@ -143,16 +198,16 @@ static bool pruneOverlong(NGHolder &g, const depth &max_depth, const ReportManager &rm) { bool modified = false; - std::vector depths; - calcDepths(g, depths); + auto depths = calcBidiDepths(g); for (auto v : vertices_range(g)) { if (is_special(v, g)) { continue; } const auto &d = depths.at(g[v].index); - depth min_depth = min(d.fromStart.min, d.fromStartDotStar.min); - if (min_depth > max_depth) { + depth min_match_offset = min(d.fromStart.min, d.fromStartDotStar.min) + + min(d.toAccept.min, d.toAcceptEod.min); + if (min_match_offset > max_depth) { clear_vertex(v, g); modified = true; continue; @@ -171,26 +226,41 @@ bool pruneOverlong(NGHolder &g, const depth &max_depth, return modified; } -void SmallWriteBuildImpl::add(const NGWrapper &w) { +void SmallWriteBuildImpl::add(const NGHolder &g, const ExpressionInfo &expr) { // If the graph is poisoned (i.e. we can't build a SmallWrite version), // we don't even try. 
if (poisoned) { return; } - if (w.som || w.min_length || isVacuous(w)) { /* cannot support in smwr */ + if (expr.som) { + DEBUG_PRINTF("no SOM support in small-write engine\n"); poisoned = true; return; } - DEBUG_PRINTF("w=%p\n", &w); + if (isVacuous(g)) { + DEBUG_PRINTF("no vacuous graph support in small-write engine\n"); + poisoned = true; + return; + } + + if (any_of_in(::ue2::all_reports(g), [&](ReportID id) { + return rm.getReport(id).minLength > 0; + })) { + DEBUG_PRINTF("no min_length extparam support in small-write engine\n"); + poisoned = true; + return; + } + + DEBUG_PRINTF("g=%p\n", &g); // make a copy of the graph so that we can modify it for our purposes - unique_ptr h = cloneHolder(w); + unique_ptr h = cloneHolder(g); pruneOverlong(*h, depth(cc.grey.smallWriteLargestBuffer), rm); - reduceGraph(*h, SOM_NONE, w.utf8, cc); + reduceGraph(*h, SOM_NONE, expr.utf8, cc); if (can_never_match(*h)) { DEBUG_PRINTF("graph can never match in small block\n"); @@ -209,7 +279,7 @@ void SmallWriteBuildImpl::add(const NGWrapper &w) { return; } - if (prune_overlong(*r, cc.grey.smallWriteLargestBuffer)) { + if (clear_deeper_reports(*r, cc.grey.smallWriteLargestBuffer)) { minimize_hopcroft(*r, cc.grey); } @@ -229,101 +299,457 @@ void SmallWriteBuildImpl::add(const NGWrapper &w) { } } +static +bool add_to_trie(const ue2_literal &literal, ReportID report, LitTrie &trie) { + auto u = trie.root; + for (const auto &c : literal) { + auto next = LitTrie::null_vertex(); + for (auto v : adjacent_vertices_range(u, trie)) { + if (trie[v].c == (u8)c.c) { + next = v; + break; + } + } + if (!next) { + next = add_vertex(LitTrieVertexProps((u8)c.c), trie); + add_edge(u, next, trie); + } + u = next; + } + + trie[u].reports.insert(report); + + DEBUG_PRINTF("added '%s' (report %u) to trie, now %zu vertices\n", + escapeString(literal).c_str(), report, num_vertices(trie)); + return num_vertices(trie) <= MAX_TRIE_VERTICES; +} + void SmallWriteBuildImpl::add(const ue2_literal &literal, ReportID r) { // If the graph is poisoned (i.e. we can't build a SmallWrite version), // we don't even try. if (poisoned) { + DEBUG_PRINTF("poisoned\n"); return; } if (literal.length() > cc.grey.smallWriteLargestBuffer) { + DEBUG_PRINTF("exceeded length limit\n"); return; /* too long */ } - cand_literals.push_back(make_pair(literal, r)); + if (++num_literals > cc.grey.smallWriteMaxLiterals) { + DEBUG_PRINTF("exceeded literal limit\n"); + poisoned = true; + return; + } - if (cand_literals.size() > cc.grey.smallWriteMaxLiterals) { + auto &trie = literal.any_nocase() ? lit_trie_nocase : lit_trie; + if (!add_to_trie(literal, r, trie)) { + DEBUG_PRINTF("trie add failed\n"); poisoned = true; } } +namespace { + +/** + * \brief BFS visitor for Aho-Corasick automaton construction. + * + * This is doing two things: + * + * - Computing the failure edges (also called fall or supply edges) for each + * vertex, giving the longest suffix of the path to that point that is also + * a prefix in the trie reached on the same character. The BFS traversal + * makes it possible to build these from earlier failure paths. + * + * - Computing the output function for each vertex, which is done by + * propagating the reports from failure paths as well. This ensures that + * substrings of the current path also report correctly. 
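 *
 * For example, with the literal set {"he", "she", "hers"}, the failure
 * edge of the vertex for "she" points at the vertex for "he" (its suffix
 * "he" is also a prefix in the trie), so the report attached to "he" is
 * propagated to "she": an input matching "she" also reports the substring
 * match of "he".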
+ */ +struct ACVisitor : public boost::default_bfs_visitor { + ACVisitor(LitTrie &trie_in, + map &failure_map_in, + vector &ordering_in) + : mutable_trie(trie_in), failure_map(failure_map_in), + ordering(ordering_in) {} + + LitTrieVertex find_failure_target(LitTrieVertex u, LitTrieVertex v, + const LitTrie &trie) { + assert(u == trie.root || contains(failure_map, u)); + assert(!contains(failure_map, v)); + + const auto &c = trie[v].c; + + while (u != trie.root) { + auto f = failure_map.at(u); + for (auto w : adjacent_vertices_range(f, trie)) { + if (trie[w].c == c) { + return w; + } + } + u = f; + } + + DEBUG_PRINTF("no failure edge\n"); + return LitTrie::null_vertex(); + } + + void tree_edge(LitTrieEdge e, const LitTrie &trie) { + auto u = source(e, trie); + auto v = target(e, trie); + DEBUG_PRINTF("bfs (%zu, %zu) on '%c'\n", trie[u].index, trie[v].index, + trie[v].c); + ordering.push_back(v); + + auto f = find_failure_target(u, v, trie); + + if (f) { + DEBUG_PRINTF("final failure vertex %zu\n", trie[f].index); + failure_map.emplace(v, f); + + // Propagate reports from failure path to ensure we correctly + // report substrings. + insert(&mutable_trie[v].reports, mutable_trie[f].reports); + } else { + DEBUG_PRINTF("final failure vertex root\n"); + failure_map.emplace(v, trie.root); + } + } + +private: + LitTrie &mutable_trie; //!< For setting reports property. + map &failure_map; + vector &ordering; //!< BFS ordering for vertices. +}; +} + +static UNUSED +bool isSaneTrie(const LitTrie &trie) { + CharReach seen; + for (auto u : vertices_range(trie)) { + seen.clear(); + for (auto v : adjacent_vertices_range(u, trie)) { + if (seen.test(trie[v].c)) { + return false; + } + seen.set(trie[v].c); + } + } + return true; +} + +/** + * \brief Turn the given literal trie into an AC automaton by adding additional + * edges and reports. + */ static -void lit_to_graph(NGHolder *h, const ue2_literal &literal, ReportID r) { - NFAVertex u = h->startDs; - for (const auto &c : literal) { - NFAVertex v = add_vertex(*h); - add_edge(u, v, *h); - (*h)[v].char_reach = c; - u = v; +void buildAutomaton(LitTrie &trie, + map &failure_map, + vector &ordering) { + assert(isSaneTrie(trie)); + + // Find our failure transitions and reports. + ACVisitor ac_vis(trie, failure_map, ordering); + boost::breadth_first_search(trie, trie.root, visitor(ac_vis)); + + // Compute missing edges from failure map. + for (auto v : ordering) { + DEBUG_PRINTF("vertex %zu\n", trie[v].index); + CharReach seen; + for (auto w : adjacent_vertices_range(v, trie)) { + DEBUG_PRINTF("edge to %zu with reach 0x%02x\n", trie[w].index, + trie[w].c); + assert(!seen.test(trie[w].c)); + seen.set(trie[w].c); + } + auto parent = failure_map.at(v); + for (auto w : adjacent_vertices_range(parent, trie)) { + if (!seen.test(trie[w].c)) { + add_edge(v, w, trie); + } + } } - (*h)[u].reports.insert(r); - add_edge(u, h->accept, *h); } -bool SmallWriteBuildImpl::determiniseLiterals() { - DEBUG_PRINTF("handling literals\n"); - assert(!poisoned); - assert(cand_literals.size() <= cc.grey.smallWriteMaxLiterals); +static +vector findDistFromRoot(const LitTrie &trie) { + vector dist(num_vertices(trie), UINT32_MAX); + dist[trie[trie.root].index] = 0; + + // BFS to find dist from root. 
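    // (record_distances() with on_tree_edge() assigns dist[target] =
    // dist[source] + 1 for every BFS tree edge, addressing each vertex's
    // slot through its index property -- a hand-rolled BFS from the root
    // would compute exactly the same vector.)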
+ breadth_first_search( + trie, trie.root, + visitor(make_bfs_visitor(record_distances( + make_iterator_property_map(dist.begin(), + get(&LitTrieVertexProps::index, trie)), + boost::on_tree_edge())))); + + return dist; +} - if (cand_literals.empty()) { - return true; /* nothing to do */ +static +vector findDistToAccept(const LitTrie &trie) { + vector dist(num_vertices(trie), UINT32_MAX); + + // Start with all reporting vertices. + deque q; + for (auto v : vertices_range(trie)) { + if (!trie[v].reports.empty()) { + q.push_back(v); + dist[trie[v].index] = 0; + } } - vector > temp_dfas; + // Custom BFS, since we have a pile of sources. + while (!q.empty()) { + auto v = q.front(); + q.pop_front(); + u32 d = dist[trie[v].index]; - for (const auto &cand : cand_literals) { - NGHolder h; - DEBUG_PRINTF("determinising %s\n", dumpString(cand.first).c_str()); - lit_to_graph(&h, cand.first, cand.second); - temp_dfas.push_back(buildMcClellan(h, &rm, cc.grey)); + for (auto u : inv_adjacent_vertices_range(v, trie)) { + auto &u_dist = dist[trie[u].index]; + if (u_dist == UINT32_MAX) { + q.push_back(u); + u_dist = d + 1; + } + } + } - // If we couldn't build a McClellan DFA for this portion, then we - // can't SmallWrite optimize the entire graph, so we can't - // optimize any of it - if (!temp_dfas.back()) { - DEBUG_PRINTF("failed to determinise\n"); - poisoned = true; - return false; + return dist; +} + +/** + * \brief Prune all vertices from the trie that do not lie on a path from root + * to accept of length <= max_depth. + */ +static +void pruneTrie(LitTrie &trie, u32 max_depth) { + DEBUG_PRINTF("pruning trie to %u\n", max_depth); + + auto dist_from_root = findDistFromRoot(trie); + auto dist_to_accept = findDistToAccept(trie); + + vector dead; + for (auto v : vertices_range(trie)) { + if (v == trie.root) { + continue; + } + auto v_index = trie[v].index; + DEBUG_PRINTF("vertex %zu: from_start=%u, to_accept=%u\n", trie[v].index, + dist_from_root[v_index], dist_to_accept[v_index]); + assert(dist_from_root[v_index] != UINT32_MAX); + assert(dist_to_accept[v_index] != UINT32_MAX); + u32 min_path_len = dist_from_root[v_index] + dist_to_accept[v_index]; + if (min_path_len > max_depth) { + DEBUG_PRINTF("pruning vertex %zu (min path len %u)\n", + trie[v].index, min_path_len); + clear_vertex(v, trie); + dead.push_back(v); } } - if (!rdfa && temp_dfas.size() == 1) { - /* no need to merge there is only one dfa */ - rdfa = move(temp_dfas[0]); - return true; + if (dead.empty()) { + return; + } + + for (auto v : dead) { + remove_vertex(v, trie); } - /* do a merge of the new dfas */ + DEBUG_PRINTF("%zu vertices remain\n", num_vertices(trie)); - vector to_merge; + renumber_edges(trie); + renumber_vertices(trie); +} - if (rdfa) {/* also include the existing dfa */ - to_merge.push_back(rdfa.get()); +static +vector getAlphabet(const LitTrie &trie, bool nocase) { + vector esets = {CharReach::dot()}; + for (auto v : vertices_range(trie)) { + if (v == trie.root) { + continue; + } + + CharReach cr; + if (nocase) { + cr.set(mytoupper(trie[v].c)); + cr.set(mytolower(trie[v].c)); + } else { + cr.set(trie[v].c); + } + + for (size_t i = 0; i < esets.size(); i++) { + if (esets[i].count() == 1) { + continue; + } + + CharReach t = cr & esets[i]; + if (t.any() && t != esets[i]) { + esets[i] &= ~t; + esets.push_back(t); + } + } } - for (const auto &d : temp_dfas) { - to_merge.push_back(d.get()); + // For deterministic compiles. 
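    // (Sorting fixes the class order, which would otherwise depend on
    // vertex iteration order. For example, the literals "ab" and "ad"
    // refine the initial "all 256 symbols" class down to {a}, {b}, {d}
    // plus the remainder class, so the DFA sees a 4-symbol alphabet.)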
+ sort(esets.begin(), esets.end()); + return esets; +} + +static +u16 buildAlphabet(const LitTrie &trie, bool nocase, + array &alpha, + array &unalpha) { + const auto &esets = getAlphabet(trie, nocase); + + u16 i = 0; + for (const auto &cr : esets) { + u16 leader = cr.find_first(); + for (size_t s = cr.find_first(); s != cr.npos; s = cr.find_next(s)) { + alpha[s] = i; + } + unalpha[i] = leader; + i++; } - assert(to_merge.size() > 1); + for (u16 j = N_CHARS; j < ALPHABET_SIZE; j++, i++) { + alpha[j] = i; + unalpha[i] = j; + } - while (to_merge.size() > LITERAL_MERGE_CHUNK_SIZE) { - vector small_merge; - small_merge.insert(small_merge.end(), to_merge.begin(), - to_merge.begin() + LITERAL_MERGE_CHUNK_SIZE); + DEBUG_PRINTF("alphabet size %u\n", i); + return i; +} - temp_dfas.push_back( - mergeAllDfas(small_merge, DFA_MERGE_MAX_STATES, &rm, cc.grey)); +/** + * \brief Calculate state mapping, from vertex in trie to state index in BFS + * ordering. + */ +static +unordered_map +makeStateMap(const LitTrie &trie, const vector &ordering) { + unordered_map state_ids; + state_ids.reserve(num_vertices(trie)); + u32 idx = DEAD_STATE + 1; + state_ids.emplace(trie.root, idx++); + for (auto v : ordering) { + state_ids.emplace(v, idx++); + } + assert(state_ids.size() == num_vertices(trie)); + return state_ids; +} - if (!temp_dfas.back()) { - DEBUG_PRINTF("merge failed\n"); - poisoned = true; - return false; +/** \brief Construct a raw_dfa from a literal trie. */ +static +unique_ptr buildDfa(LitTrie &trie, bool nocase) { + DEBUG_PRINTF("trie has %zu states\n", num_vertices(trie)); + + vector ordering; + map failure_map; + buildAutomaton(trie, failure_map, ordering); + + // Construct DFA states in BFS order. + const auto state_ids = makeStateMap(trie, ordering); + + auto rdfa = make_unique(NFA_OUTFIX); + + // Calculate alphabet. + array unalpha; + auto &alpha = rdfa->alpha_remap; + rdfa->alpha_size = buildAlphabet(trie, nocase, alpha, unalpha); + + // Construct states and transitions. + const u16 root_state = state_ids.at(trie.root); + assert(root_state == DEAD_STATE + 1); + rdfa->start_anchored = root_state; + rdfa->start_floating = root_state; + rdfa->states.resize(num_vertices(trie) + 1, dstate(rdfa->alpha_size)); + + // Dead state. + fill(rdfa->states[DEAD_STATE].next.begin(), + rdfa->states[DEAD_STATE].next.end(), DEAD_STATE); + + for (auto u : vertices_range(trie)) { + auto u_state = state_ids.at(u); + DEBUG_PRINTF("state %u\n", u_state); + assert(u_state < rdfa->states.size()); + auto &ds = rdfa->states[u_state]; + ds.reports = trie[u].reports; + if (!ds.reports.empty()) { + DEBUG_PRINTF("reports: %s\n", as_string_list(ds.reports).c_str()); } - to_merge.erase(to_merge.begin(), - to_merge.begin() + LITERAL_MERGE_CHUNK_SIZE); - to_merge.push_back(temp_dfas.back().get()); + // Set daddy state from failure map. + if (u == trie.root) { + ds.daddy = DEAD_STATE; + } else { + assert(contains(failure_map, u)); + ds.daddy = state_ids.at(failure_map.at(u)); + } + + // By default, transition back to the root. + fill(ds.next.begin(), ds.next.end(), root_state); + // TOP should be a self-loop. + ds.next[alpha[TOP]] = u_state; + + // Add in the real transitions. 
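        // (Worked example: for literals "ab" and "ac" the trie is
        // root -> a -> {b, c}, numbered root=1, a=2, b=3, c=4 in BFS
        // order. State 2's row maps sym(b)->3, sym(c)->4 and sym(a)->2,
        // an edge back to itself added via the failure map by
        // buildAutomaton(), with all other symbols going back to the
        // root state.)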
+ for (auto v : adjacent_vertices_range(u, trie)) { + if (v == trie.root) { + continue; + } + auto v_state = state_ids.at(v); + u16 sym = alpha[trie[v].c]; + DEBUG_PRINTF("edge to %u on 0x%02x (sym %u)\n", v_state, + trie[v].c, sym); + assert(sym < ds.next.size()); + assert(ds.next[sym] == root_state); + ds.next[sym] = v_state; + } + } + + return rdfa; +} + +bool SmallWriteBuildImpl::determiniseLiterals() { + DEBUG_PRINTF("handling literals\n"); + assert(!poisoned); + assert(num_literals <= cc.grey.smallWriteMaxLiterals); + + if (is_empty(lit_trie) && is_empty(lit_trie_nocase)) { + DEBUG_PRINTF("no literals\n"); + return true; /* nothing to do */ + } + + vector> dfas; + + if (!is_empty(lit_trie)) { + dfas.push_back(buildDfa(lit_trie, false)); + DEBUG_PRINTF("caseful literal dfa with %zu states\n", + dfas.back()->states.size()); + } + if (!is_empty(lit_trie_nocase)) { + dfas.push_back(buildDfa(lit_trie_nocase, true)); + DEBUG_PRINTF("nocase literal dfa with %zu states\n", + dfas.back()->states.size()); + } + + if (rdfa) { + dfas.push_back(move(rdfa)); + DEBUG_PRINTF("general dfa with %zu states\n", + dfas.back()->states.size()); + } + + // If we only have one DFA, no merging is necessary. + if (dfas.size() == 1) { + DEBUG_PRINTF("only one dfa\n"); + rdfa = move(dfas.front()); + return true; + } + + // Merge all DFAs. + vector to_merge; + for (const auto &d : dfas) { + to_merge.push_back(d.get()); } auto merged = mergeAllDfas(to_merge, DFA_MERGE_MAX_STATES, &rm, cc.grey); @@ -334,11 +760,11 @@ bool SmallWriteBuildImpl::determiniseLiterals() { return false; } - DEBUG_PRINTF("merge succeeded, built %p\n", merged.get()); + DEBUG_PRINTF("merge succeeded, built dfa with %zu states\n", + merged->states.size()); - // Replace our only DFA with the merged one + // Replace our only DFA with the merged one. rdfa = move(merged); - return true; } @@ -385,30 +811,36 @@ bool is_slow(const raw_dfa &rdfa, const set &accel, } static -aligned_unique_ptr getDfa(raw_dfa &rdfa, const CompileContext &cc, - const ReportManager &rm, - set &accel_states) { - aligned_unique_ptr dfa = nullptr; +bytecode_ptr getDfa(raw_dfa &rdfa, const CompileContext &cc, + const ReportManager &rm, bool has_non_literals, + set &accel_states) { + // If we determinised only literals, then we only need to consider the init + // states for acceleration. + bool only_accel_init = !has_non_literals; + bool trust_daddy_states = !has_non_literals; + + bytecode_ptr dfa = nullptr; if (cc.grey.allowSmallWriteSheng) { - dfa = shengCompile(rdfa, cc, rm, &accel_states); + dfa = shengCompile(rdfa, cc, rm, only_accel_init, &accel_states); } if (!dfa) { - dfa = mcclellanCompile(rdfa, cc, rm, &accel_states); + dfa = mcclellanCompile(rdfa, cc, rm, only_accel_init, + trust_daddy_states, &accel_states); } return dfa; } static -aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, - const CompileContext &cc, - const ReportManager &rm, u32 *start_offset, - u32 *small_region) { +bytecode_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, + const CompileContext &cc, const ReportManager &rm, + bool has_non_literals, u32 *start_offset, + u32 *small_region) { *start_offset = remove_leading_dots(rdfa); // Unleash the McClellan! 
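    /* (i.e. compile the merged raw_dfa into an executable engine; getDfa()
     * above prefers a Sheng engine when allowed and falls back to
     * McClellan.) */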
set accel_states; - auto nfa = getDfa(rdfa, cc, rm, accel_states); + auto nfa = getDfa(rdfa, cc, rm, has_non_literals, accel_states); if (!nfa) { DEBUG_PRINTF("DFA compile failed for smallwrite NFA\n"); return nullptr; @@ -420,14 +852,14 @@ aligned_unique_ptr prepEngine(raw_dfa &rdfa, u32 roseQuality, if (*small_region <= *start_offset) { return nullptr; } - if (prune_overlong(rdfa, *small_region - *start_offset)) { + if (clear_deeper_reports(rdfa, *small_region - *start_offset)) { minimize_hopcroft(rdfa, cc.grey); if (rdfa.start_anchored == DEAD_STATE) { DEBUG_PRINTF("all patterns pruned out\n"); return nullptr; } - nfa = getDfa(rdfa, cc, rm, accel_states); + nfa = getDfa(rdfa, cc, rm, has_non_literals, accel_states); if (!nfa) { DEBUG_PRINTF("DFA compile failed for smallwrite NFA\n"); assert(0); /* able to build orig dfa but not the trimmed? */ @@ -456,9 +888,10 @@ unique_ptr makeSmallWriteBuilder(size_t num_patterns, return ue2::make_unique(num_patterns, rm, cc); } -aligned_unique_ptr -SmallWriteBuildImpl::build(u32 roseQuality) { - if (!rdfa && cand_literals.empty()) { +bytecode_ptr SmallWriteBuildImpl::build(u32 roseQuality) { + const bool has_literals = !is_empty(lit_trie) || !is_empty(lit_trie_nocase); + const bool has_non_literals = rdfa != nullptr; + if (!rdfa && !has_literals) { DEBUG_PRINTF("no smallwrite engine\n"); poisoned = true; return nullptr; @@ -469,17 +902,34 @@ SmallWriteBuildImpl::build(u32 roseQuality) { return nullptr; } + // We happen to know that if the rose is high quality, we're going to limit + // depth further. + if (roseQuality) { + u32 max_depth = cc.grey.smallWriteLargestBufferBad; + if (!is_empty(lit_trie)) { + pruneTrie(lit_trie, max_depth); + } + if (!is_empty(lit_trie_nocase)) { + pruneTrie(lit_trie_nocase, max_depth); + } + } + if (!determiniseLiterals()) { DEBUG_PRINTF("some literal could not be made into a smallwrite dfa\n"); return nullptr; } + if (!rdfa) { + DEBUG_PRINTF("no dfa, pruned everything away\n"); + return nullptr; + } + DEBUG_PRINTF("building rdfa %p\n", rdfa.get()); u32 start_offset; u32 small_region; - auto nfa = - prepEngine(*rdfa, roseQuality, cc, rm, &start_offset, &small_region); + auto nfa = prepEngine(*rdfa, roseQuality, cc, rm, has_non_literals, + &start_offset, &small_region); if (!nfa) { DEBUG_PRINTF("some smallwrite outfix could not be prepped\n"); /* just skip the smallwrite optimization */ @@ -488,7 +938,7 @@ SmallWriteBuildImpl::build(u32 roseQuality) { } u32 size = sizeof(SmallWriteEngine) + nfa->length; - auto smwr = aligned_zmalloc_unique(size); + auto smwr = make_zeroed_bytecode_ptr(size); smwr->size = size; smwr->start_offset = start_offset; @@ -510,15 +960,11 @@ set SmallWriteBuildImpl::all_reports() const { if (rdfa) { insert(&reports, ::ue2::all_reports(*rdfa)); } - for (const auto &cand : cand_literals) { - reports.insert(cand.second); - } - return reports; -} -size_t smwrSize(const SmallWriteEngine *smwr) { - assert(smwr); - return smwr->size; + insert(&reports, ::ue2::all_reports(lit_trie)); + insert(&reports, ::ue2::all_reports(lit_trie_nocase)); + + return reports; } } // namespace ue2 diff --git a/src/smallwrite/smallwrite_build.h b/src/smallwrite/smallwrite_build.h index 84c6df3a2..648b13db7 100644 --- a/src/smallwrite/smallwrite_build.h +++ b/src/smallwrite/smallwrite_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the 
following conditions are met: @@ -30,49 +30,50 @@ #define SMWR_BUILD_H /** - * SmallWrite Build interface. Everything you ever needed to feed literals in - * and get a SmallWriteEngine out. This header should be everything needed by - * the rest of UE2. + * \file + * \brief Small-write engine build interface. + * + * Everything you ever needed to feed literals in and get a SmallWriteEngine + * out. This header should be everything needed by the rest of UE2. */ #include "ue2common.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" +#include "util/noncopyable.h" +#include #include -#include - struct SmallWriteEngine; namespace ue2 { struct CompileContext; struct ue2_literal; -class NGWrapper; -class ReportManager; +class ExpressionInfo; +class NGHolder; +class ReportManager; -// Abstract interface intended for callers from elsewhere in the tree, real -// underlying implementation is SmallWriteBuildImpl in smwr_build_impl.h. -class SmallWriteBuild : boost::noncopyable { +/** + * Abstract interface intended for callers from elsewhere in the tree, real + * underlying implementation is SmallWriteBuildImpl in smwr_build_impl.h. + */ +class SmallWriteBuild : noncopyable { public: - // Destructor virtual ~SmallWriteBuild(); - // Construct a runtime implementation. - virtual ue2::aligned_unique_ptr build(u32 roseQuality) = 0; + virtual bytecode_ptr build(u32 roseQuality) = 0; - virtual void add(const NGWrapper &w) = 0; + virtual void add(const NGHolder &g, const ExpressionInfo &expr) = 0; virtual void add(const ue2_literal &literal, ReportID r) = 0; virtual std::set all_reports() const = 0; }; -// Construct a usable SmallWrite builder. -std::unique_ptr makeSmallWriteBuilder(size_t num_patterns, - const ReportManager &rm, - const CompileContext &cc); - -size_t smwrSize(const SmallWriteEngine *t); +/** \brief Construct a usable SmallWrite builder. */ +std::unique_ptr +makeSmallWriteBuilder(size_t num_patterns, const ReportManager &rm, + const CompileContext &cc); } // namespace ue2 diff --git a/src/som/slot_manager.cpp b/src/som/slot_manager.cpp index b1aa6bf78..3dc74d3da 100644 --- a/src/som/slot_manager.cpp +++ b/src/som/slot_manager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,9 +26,11 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief SOM Slot Manager. */ + #include "slot_manager.h" #include "slot_manager_internal.h" @@ -245,7 +247,7 @@ u32 SomSlotManager::numSomSlots() const { return nextSomSlot; } -u32 SomSlotManager::addRevNfa(aligned_unique_ptr nfa, u32 maxWidth) { +u32 SomSlotManager::addRevNfa(bytecode_ptr nfa, u32 maxWidth) { u32 rv = verify_u32(rev_nfas.size()); rev_nfas.push_back(move(nfa)); diff --git a/src/som/slot_manager.h b/src/som/slot_manager.h index 971ea3623..ddb105f53 100644 --- a/src/som/slot_manager.h +++ b/src/som/slot_manager.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief SOM Slot Manager. 
*/ @@ -35,12 +36,12 @@ #include "ue2common.h" #include "nfagraph/ng_holder.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" +#include "util/noncopyable.h" #include "util/ue2_containers.h" #include #include -#include struct NFA; @@ -54,7 +55,7 @@ struct SlotCache; /** \brief SOM slot manager. Used to hand out SOM slots and track their * relationships during SOM construction. Also stores reverse NFAs used for * SOM. */ -class SomSlotManager : boost::noncopyable { +class SomSlotManager : noncopyable { public: explicit SomSlotManager(u8 precision); ~SomSlotManager(); @@ -78,11 +79,11 @@ class SomSlotManager : boost::noncopyable { u32 numSomSlots() const; - const std::deque> &getRevNfas() const { + const std::deque> &getRevNfas() const { return rev_nfas; } - u32 addRevNfa(aligned_unique_ptr nfa, u32 maxWidth); + u32 addRevNfa(bytecode_ptr nfa, u32 maxWidth); u32 somHistoryRequired() const { return historyRequired; } @@ -97,7 +98,7 @@ class SomSlotManager : boost::noncopyable { std::unique_ptr cache; /** \brief Reverse NFAs used for SOM support. */ - std::deque> rev_nfas; + std::deque> rev_nfas; /** \brief In streaming mode, the amount of history we've committed to * using for SOM rev NFAs. */ diff --git a/src/som/som.h b/src/som/som.h index 4a3809843..e759cf0a2 100644 --- a/src/som/som.h +++ b/src/som/som.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,17 +26,22 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Common SOM definitions. */ #ifndef UE2_SOM_H #define UE2_SOM_H +namespace ue2 { + /** \brief Enumeration specifying a start of match behaviour. */ enum som_type { SOM_NONE, //!< No SOM required SOM_LEFT //!< Exact leftmost SOM }; +} // namespace ue2 + #endif // UE2_SOM_H diff --git a/src/ue2common.h b/src/ue2common.h index e1f03f721..4bec83155 100644 --- a/src/ue2common.h +++ b/src/ue2common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -189,8 +189,8 @@ typedef u32 ReportID; #define unlikely(x) __builtin_expect(!!(x), 0) #endif #else -#define likely(x) (x) -#define unlikely(x) (x) +#define likely(x) (x) +#define unlikely(x) (x) #endif #if !defined(RELEASE_BUILD) || defined(DEBUG) diff --git a/src/util/alloc.h b/src/util/alloc.h index 191bc387e..de20c8d02 100644 --- a/src/util/alloc.h +++ b/src/util/alloc.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,7 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/** \file +/** + * \file * \brief Aligned memory alloc/free. */ @@ -51,25 +52,6 @@ void *aligned_zmalloc(size_t size); /** \brief Free a pointer allocated with \ref aligned_zmalloc. */ void aligned_free(void *ptr); -template struct AlignedDeleter { - void operator()(T *ptr) const { aligned_free(ptr); } -}; -template -using aligned_unique_ptr = std::unique_ptr>; - -/** \brief 64-byte aligned, zeroed malloc that returns an appropriately-typed - * aligned_unique_ptr. 
- * - * If the requested size cannot be allocated, throws std::bad_alloc. - */ -template -inline -aligned_unique_ptr aligned_zmalloc_unique(size_t size) { - T* ptr = static_cast(aligned_zmalloc(size)); - assert(ptr); // Guaranteed by aligned_zmalloc. - return aligned_unique_ptr(ptr); -} - /** \brief Internal use only, used by AlignedAllocator. */ void *aligned_malloc_internal(size_t size, size_t align); diff --git a/src/nfa/multishufti.h b/src/util/arch.h similarity index 54% rename from src/nfa/multishufti.h rename to src/util/arch.h index af5784831..c78ee9ced 100644 --- a/src/nfa/multishufti.h +++ b/src/util/arch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,44 +27,60 @@ */ /** \file - * \brief Multishufti: multibyte version of Shufti - * - * Utilises the SSSE3 pshufb shuffle instruction + * \brief Per-platform architecture definitions */ -#ifndef MULTISHUFTI_H -#define MULTISHUFTI_H +#ifndef UTIL_ARCH_H_ +#define UTIL_ARCH_H_ -#include "ue2common.h" -#include "util/simd_types.h" +#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) +#define HAVE_SSE2 +#endif -#ifdef __cplusplus -extern "C" -{ +#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE41 #endif -const u8 *long_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const u8 run_len); +#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE42 +#endif -const u8 *longgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const u8 run_len); +#if defined(__AVX__) +#define HAVE_AVX +#endif -const u8 *shift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const u8 run_len); +#if defined(__AVX2__) +#define HAVE_AVX2 +#endif -const u8 *shiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const u8 run_len); +#if defined(__AVX512BW__) +#define HAVE_AVX512 +#endif -const u8 *doubleshift_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const u8 run_len, - const u8 run2_len); +/* + * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros + */ +#if defined(__POPCNT__) || \ + (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ + (defined(_WIN32) && defined(__AVX__)) +#define HAVE_POPCOUNT_INSTR +#endif -const u8 *doubleshiftgrab_shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, - const u8 *buf_end, const u8 run_len, - const u8 run2_len); +#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI +#endif -#ifdef __cplusplus -} +#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI2 #endif +/* + * MSVC uses a different form of inline asm + */ +#if defined(_WIN32) && defined(_MSC_VER) +#define NO_ASM #endif + +#endif // UTIL_ARCH_H_ diff --git a/src/util/bitutils.h b/src/util/bitutils.h index d144e8793..c545ee187 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,37 +35,8 @@ #include 
"ue2common.h" #include "popcount.h" - -#ifdef __cplusplus -# if defined(HAVE_CXX_X86INTRIN_H) -# define USE_X86INTRIN_H -# endif -#else // C, baby -# if defined(HAVE_C_X86INTRIN_H) -# define USE_X86INTRIN_H -# endif -#endif - -#ifdef __cplusplus -# if defined(HAVE_CXX_INTRIN_H) -# define USE_INTRIN_H -# endif -#else // C, baby -# if defined(HAVE_C_INTRIN_H) -# define USE_INTRIN_H -# endif -#endif - -#if defined(USE_X86INTRIN_H) -#include -#elif defined(USE_INTRIN_H) -#include -#endif - -// MSVC has a different form of inline asm -#ifdef _WIN32 -#define NO_ASM -#endif +#include "util/arch.h" +#include "util/intrinsics.h" #define CASE_BIT 0x20 #define CASE_CLEAR 0xdf @@ -269,7 +240,7 @@ u32 findAndClearMSB_64(u64a *v) { static really_inline u32 compress32(u32 x, u32 m) { -#if defined(__BMI2__) +#if defined(HAVE_BMI2) // BMI2 has a single instruction for this operation. return _pext_u32(x, m); #else @@ -304,7 +275,7 @@ u32 compress32(u32 x, u32 m) { static really_inline u64a compress64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(__BMI2__) +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) // BMI2 has a single instruction for this operation. return _pext_u64(x, m); #else @@ -340,7 +311,7 @@ u64a compress64(u64a x, u64a m) { static really_inline u32 expand32(u32 x, u32 m) { -#if defined(__BMI2__) +#if defined(HAVE_BMI2) // BMI2 has a single instruction for this operation. return _pdep_u32(x, m); #else @@ -380,7 +351,7 @@ u32 expand32(u32 x, u32 m) { static really_inline u64a expand64(u64a x, u64a m) { -#if defined(ARCH_X86_64) && defined(__BMI2__) +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) // BMI2 has a single instruction for this operation. return _pdep_u64(x, m); #else @@ -471,13 +442,9 @@ u32 rank_in_mask64(u64a mask, u32 bit) { return popcount64(mask); } -#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) -#define HAVE_PEXT -#endif - static really_inline u32 pext32(u32 x, u32 mask) { -#if defined(HAVE_PEXT) +#if defined(HAVE_BMI2) // Intel BMI2 can do this operation in one instruction. return _pext_u32(x, mask); #else @@ -497,7 +464,7 @@ u32 pext32(u32 x, u32 mask) { static really_inline u64a pext64(u64a x, u64a mask) { -#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) // Intel BMI2 can do this operation in one instruction. 
return _pext_u64(x, mask); #else @@ -515,7 +482,7 @@ u64a pext64(u64a x, u64a mask) { #endif } -#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) static really_inline u64a pdep64(u64a x, u64a mask) { return _pdep_u64(x, mask); diff --git a/src/util/boundary_reports.h b/src/util/boundary_reports.h index 7ad93ba1e..b2bb1c9b0 100644 --- a/src/util/boundary_reports.h +++ b/src/util/boundary_reports.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,13 +30,13 @@ #define BOUNDARY_REPORTS_H #include "ue2common.h" +#include "util/noncopyable.h" #include -#include namespace ue2 { -struct BoundaryReports : boost::noncopyable { +struct BoundaryReports : noncopyable { std::set report_at_0; /* set of internal reports to fire * unconditionally at offset 0 */ std::set report_at_0_eod; /* set of internal reports to fire diff --git a/src/util/bytecode_ptr.h b/src/util/bytecode_ptr.h new file mode 100644 index 000000000..f1f2e5ef8 --- /dev/null +++ b/src/util/bytecode_ptr.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief bytecode_ptr: Smart pointer with unique ownership that knows its + * length and alignment. + */ + +#ifndef UTIL_BYTECODE_PTR_H +#define UTIL_BYTECODE_PTR_H + +#include "util/alloc.h" +#include "util/operators.h" + +#include // std::max +#include +#include +#include // std::logic_error + +namespace ue2 { + +/** + * \brief Smart pointer that knows its length and alignment and behaves like a + * std::unique_ptr -- i.e. it retains unique ownership of the memory region. + * + * This is intended to be used for flat aligned memory regions that will + * eventually end up copied into the Hyperscan bytecode. 
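 *
 * A typical pattern in the build code is to allocate a region with
 * make_zeroed_bytecode_ptr<T>(), fill it in place, then shrink() it to the
 * bytes actually used; size() and align() travel with the pointer, so no
 * side-band bookkeeping is needed when it is finally copied out.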
+ */ +template +class bytecode_ptr : totally_ordered> { +public: + bytecode_ptr() = default; + explicit bytecode_ptr(size_t bytes_in, size_t alignment_in = alignof(T)) + : bytes(bytes_in), alignment(alignment_in) { + // posix_memalign doesn't like us asking for smaller alignment. + size_t mem_align = std::max(alignment, sizeof(void *)); + ptr.reset(static_cast(aligned_malloc_internal(bytes, mem_align))); + if (!ptr) { + throw std::bad_alloc(); + } + } + + bytecode_ptr(std::nullptr_t) {} + + T *get() const { return ptr.get(); } + + T &operator*() { return *ptr; } + const T &operator*() const { return *ptr; } + + T *operator->() { return ptr.get(); } + const T *operator->() const { return ptr.get(); } + + explicit operator bool() const { return ptr != nullptr; } + + /** \brief Move converter for shared_ptr. */ + template ::value>::type> + operator std::shared_ptr() && { + auto d = ptr.get_deleter(); + return std::shared_ptr(ptr.release(), d); + } + + void reset(T *p = nullptr) { ptr.reset(p); } + + T *release() { + auto *p = ptr.release(); + bytes = 0; + alignment = 0; + return p; + } + + void swap(bytecode_ptr &other) { + using std::swap; + swap(ptr, other.ptr); + swap(bytes, other.bytes); + swap(alignment, other.alignment); + } + + /** + * \brief Reduces the apparent size of the memory region. Note that this + * does not reallocate and copy, it just changes the value returned by + * size(). + */ + void shrink(size_t new_size) { + if (new_size > bytes) { + assert(0); + throw std::logic_error("Must shrink to a smaller value"); + } + bytes = new_size; + } + + /** \brief Returns size of the memory region in bytes. */ + size_t size() const { return bytes; } + + /** \brief Returns alignment of the memory region in bytes. */ + size_t align() const { return alignment; } + + bool operator==(const bytecode_ptr &a) const { return ptr == a.ptr; } + bool operator<(const bytecode_ptr &a) const { return ptr < a.ptr; } + +private: + /** \brief Deleter function for std::unique_ptr. */ + template struct deleter { + void operator()(DT *p) const { aligned_free_internal(p); } + }; + + std::unique_ptr> ptr; //!< Underlying pointer. + size_t bytes = 0; //!< Size of memory region in bytes. + size_t alignment = 0; //!< Alignment of memory region in bytes. +}; + +/** + * \brief Constructs a bytecode_ptr with the given size and alignment. + */ +template +inline bytecode_ptr make_bytecode_ptr(size_t size, + size_t align = alignof(T)) { + return bytecode_ptr(size, align); +} + +/** + * \brief Constructs a bytecode_ptr with the given size and alignment and + * fills the memory region with zeroes. + */ +template +inline bytecode_ptr make_zeroed_bytecode_ptr(size_t size, + size_t align = alignof(T)) { + auto ptr = make_bytecode_ptr(size, align); + std::memset(ptr.get(), 0, size); + return ptr; +} + +} // namespace ue2 + +#endif // UTIL_BYTECODE_PTR_H diff --git a/src/util/container.h b/src/util/container.h index e2cfb485e..68f60e99e 100644 --- a/src/util/container.h +++ b/src/util/container.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -89,6 +89,14 @@ auto make_vector_from(const std::pair &range) return std::vector(range.first, range.second); } +/** \brief Sort a sequence container and remove duplicates. 
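 *
 * For example, given std::vector<int> v{3, 1, 3, 2}, sort_and_unique(v)
 * leaves v == {1, 2, 3}.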
*/ +template > +void sort_and_unique(C &container, Compare comp = Compare()) { + std::sort(std::begin(container), std::end(container), comp); + container.erase(std::unique(std::begin(container), std::end(container)), + std::end(container)); +} + /** \brief Returns a set containing the keys in the given associative * container. */ template @@ -194,6 +202,17 @@ void erase_all(C *container, const D &donor) { } } + +template +bool any_of_in(const C &c, Pred p) { + return std::any_of(c.begin(), c.end(), std::move(p)); +} + +template +bool all_of_in(const C &c, Pred p) { + return std::all_of(c.begin(), c.end(), std::move(p)); +} + } // namespace ue2 #ifdef DUMP_SUPPORT diff --git a/src/util/cpuid_flags.c b/src/util/cpuid_flags.c index dba147ee1..c0ab09afb 100644 --- a/src/util/cpuid_flags.c +++ b/src/util/cpuid_flags.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ #include "ue2common.h" #include "hs_compile.h" // for HS_MODE_ flags #include "hs_internal.h" +#include "util/arch.h" #ifndef _WIN32 #include @@ -55,9 +56,18 @@ #define AVX2 (1 << 5) #define BMI2 (1 << 8) +// Structured Extended Feature Flags Enumeration Leaf EBX values +#define AVX512F (1 << 16) +#define AVX512BW (1 << 30) + // Extended Control Register 0 (XCR0) values #define XCR0_SSE (1 << 1) #define XCR0_AVX (1 << 2) +#define XCR0_OPMASK (1 << 5) // k-regs +#define XCR0_ZMM_Hi256 (1 << 6) // upper 256 bits of ZMM0-ZMM15 +#define XCR0_Hi16_ZMM (1 << 7) // ZMM16-ZMM31 + +#define XCR0_AVX512 (XCR0_OPMASK | XCR0_ZMM_Hi256 | XCR0_Hi16_ZMM) static __inline void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, @@ -123,6 +133,48 @@ int check_avx2(void) { #endif } +int check_avx512(void) { + /* + * For our purposes, having avx512 really means "can we use AVX512BW?" 
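     *
     * That means the OS must have enabled the AVX-512 register state
     * (opmask regs, upper halves of ZMM0-15, ZMM16-31) via XCR0, and the
     * CPU must advertise both AVX512F and AVX512BW in CPUID leaf 7; the
     * non-ICC path below checks exactly that.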
+ */ +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX512BW | _FEATURE_AVX512VL); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check XSAVE is enabled by OS */ + if (!(ecx & XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that AVX 512 registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & XCR0_AVX512) != XCR0_AVX512) { + DEBUG_PRINTF("AVX512 registers not enabled\n"); + return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (!(ebx & AVX512F)) { + DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n"); + return 0; + } + + if (ebx & AVX512BW) { + DEBUG_PRINTF("AVX512BW instructions enabled\n"); + return 1; + } + + return 0; +#endif +} + u64a cpuid_flags(void) { u64a cap = 0; @@ -131,10 +183,19 @@ u64a cpuid_flags(void) { cap |= HS_CPU_FEATURES_AVX2; } -#if !defined(__AVX2__) + if (check_avx512()) { + DEBUG_PRINTF("AVX512 enabled\n"); + cap |= HS_CPU_FEATURES_AVX512; + } + +#if !defined(FAT_RUNTIME) && !defined(HAVE_AVX2) cap &= ~HS_CPU_FEATURES_AVX2; #endif +#if !defined(FAT_RUNTIME) && !defined(HAVE_AVX512) + cap &= ~HS_CPU_FEATURES_AVX512; +#endif + return cap; } @@ -167,33 +228,37 @@ struct family_id { * Family Numbers" */ static const struct family_id known_microarch[] = { { 0x6, 0x37, HS_TUNE_FAMILY_SLM }, /* baytrail */ + { 0x6, 0x4A, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x4C, HS_TUNE_FAMILY_SLM }, /* silvermont */ { 0x6, 0x4D, HS_TUNE_FAMILY_SLM }, /* avoton, rangley */ + { 0x6, 0x5A, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x5D, HS_TUNE_FAMILY_SLM }, /* silvermont */ + + { 0x6, 0x5C, HS_TUNE_FAMILY_GLM }, /* goldmont */ + { 0x6, 0x5F, HS_TUNE_FAMILY_GLM }, /* denverton */ { 0x6, 0x3C, HS_TUNE_FAMILY_HSW }, /* haswell */ { 0x6, 0x45, HS_TUNE_FAMILY_HSW }, /* haswell */ { 0x6, 0x46, HS_TUNE_FAMILY_HSW }, /* haswell */ - { 0x6, 0x3F, HS_TUNE_FAMILY_HSW }, /* haswell */ + { 0x6, 0x3F, HS_TUNE_FAMILY_HSW }, /* haswell Xeon */ - { 0x6, 0x3E, HS_TUNE_FAMILY_IVB }, /* ivybridge */ + { 0x6, 0x3E, HS_TUNE_FAMILY_IVB }, /* ivybridge Xeon */ { 0x6, 0x3A, HS_TUNE_FAMILY_IVB }, /* ivybridge */ { 0x6, 0x2A, HS_TUNE_FAMILY_SNB }, /* sandybridge */ - { 0x6, 0x2D, HS_TUNE_FAMILY_SNB }, /* sandybridge */ + { 0x6, 0x2D, HS_TUNE_FAMILY_SNB }, /* sandybridge Xeon */ { 0x6, 0x3D, HS_TUNE_FAMILY_BDW }, /* broadwell Core-M */ + { 0x6, 0x47, HS_TUNE_FAMILY_BDW }, /* broadwell */ { 0x6, 0x4F, HS_TUNE_FAMILY_BDW }, /* broadwell xeon */ { 0x6, 0x56, HS_TUNE_FAMILY_BDW }, /* broadwell xeon-d */ -// { 0x6, 0x25, HS_TUNE_FAMILY_GENERIC }, /* westmere */ -// { 0x6, 0x2C, HS_TUNE_FAMILY_GENERIC }, /* westmere */ -// { 0x6, 0x2F, HS_TUNE_FAMILY_GENERIC }, /* westmere */ - -// { 0x6, 0x1E, HS_TUNE_FAMILY_GENERIC }, /* nehalem */ -// { 0x6, 0x1A, HS_TUNE_FAMILY_GENERIC }, /* nehalem */ -// { 0x6, 0x2E, HS_TUNE_FAMILY_GENERIC }, /* nehalem */ + { 0x6, 0x4E, HS_TUNE_FAMILY_SKL }, /* Skylake Mobile */ + { 0x6, 0x5E, HS_TUNE_FAMILY_SKL }, /* Skylake Core/E3 Xeon */ + { 0x6, 0x55, HS_TUNE_FAMILY_SKX }, /* Skylake Xeon */ -// { 0x6, 0x17, HS_TUNE_FAMILY_GENERIC }, /* penryn */ -// { 0x6, 0x1D, HS_TUNE_FAMILY_GENERIC }, /* penryn */ + { 0x6, 0x8E, HS_TUNE_FAMILY_SKL }, /* Kabylake Mobile */ + { 0x6, 0x9E, HS_TUNE_FAMILY_SKL }, /* Kabylake desktop */ }; @@ -203,10 +268,13 @@ const char *dumpTune(u32 tune) { #define T_CASE(x) case x: return #x; switch (tune) { T_CASE(HS_TUNE_FAMILY_SLM); + 
T_CASE(HS_TUNE_FAMILY_GLM); T_CASE(HS_TUNE_FAMILY_HSW); T_CASE(HS_TUNE_FAMILY_SNB); T_CASE(HS_TUNE_FAMILY_IVB); T_CASE(HS_TUNE_FAMILY_BDW); + T_CASE(HS_TUNE_FAMILY_SKL); + T_CASE(HS_TUNE_FAMILY_SKX); } #undef T_CASE return "unknown"; diff --git a/src/util/cpuid_flags.h b/src/util/cpuid_flags.h index 8b23d4958..d79c3832f 100644 --- a/src/util/cpuid_flags.h +++ b/src/util/cpuid_flags.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,8 +26,8 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef CPUID_H_53FFCB14B257C2 -#define CPUID_H_53FFCB14B257C2 +#ifndef UTIL_CPUID_H_ +#define UTIL_CPUID_H_ #include "ue2common.h" @@ -41,6 +41,7 @@ u64a cpuid_flags(void); u32 cpuid_tune(void); +int check_avx512(void); int check_avx2(void); int check_ssse3(void); int check_sse42(void); @@ -50,5 +51,5 @@ int check_popcnt(void); } /* extern "C" */ #endif -#endif /* CPUID_H_53FFCB14B257C2 */ +#endif /* UTIL_CPUID_H_ */ diff --git a/src/util/depth.h b/src/util/depth.h index 977fd0c30..9af1ded88 100644 --- a/src/util/depth.h +++ b/src/util/depth.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,13 +34,13 @@ #define DEPTH_H #include "ue2common.h" +#include "util/hash.h" +#include "util/operators.h" #ifdef DUMP_SUPPORT #include #endif -#include - namespace ue2 { /** @@ -52,11 +52,12 @@ struct DepthOverflowError {}; * \brief Type used to represent depth information; value is either a count, * or the special values "infinity" and "unreachable". */ -class depth { +class depth : totally_ordered { public: - depth() : val(val_unreachable) {} + /** \brief The default depth is special value "unreachable". */ + depth() = default; - depth(u32 v) : val(v) { + explicit depth(u32 v) : val(v) { if (v > max_value()) { DEBUG_PRINTF("depth %u too large to represent!\n", v); throw DepthOverflowError(); @@ -92,11 +93,7 @@ class depth { } bool operator<(const depth &d) const { return val < d.val; } - bool operator>(const depth &d) const { return val > d.val; } - bool operator<=(const depth &d) const { return val <= d.val; } - bool operator>=(const depth &d) const { return val >= d.val; } bool operator==(const depth &d) const { return val == d.val; } - bool operator!=(const depth &d) const { return val != d.val; } // The following comparison operators exist for use against integer types // that are bigger than what we can safely convert to depth (such as those @@ -196,6 +193,29 @@ class depth { return *this; } + depth operator-(s32 d) const { + if (is_unreachable()) { + return unreachable(); + } + if (is_infinite()) { + return infinity(); + } + + s64a rv = val - d; + if (rv < 0 || (u64a)rv >= val_infinity) { + DEBUG_PRINTF("depth %lld too large to represent!\n", rv); + throw DepthOverflowError(); + } + + return depth((u32)rv); + } + + depth operator-=(s32 d) { + depth rv = *this - d; + *this = rv; + return *this; + } + #ifdef DUMP_SUPPORT /** \brief Render as a string, useful for debugging. 
*/ std::string str() const; @@ -209,17 +229,17 @@ class depth { static constexpr u32 val_infinity = (1u << 31) - 1; static constexpr u32 val_unreachable = 1u << 31; - u32 val; + u32 val = val_unreachable; }; /** * \brief Encapsulates a min/max pair. */ -struct DepthMinMax { - depth min; - depth max; +struct DepthMinMax : totally_ordered { + depth min{depth::infinity()}; + depth max{0}; - DepthMinMax() : min(depth::infinity()), max(depth(0)) {} + DepthMinMax() = default; DepthMinMax(const depth &mn, const depth &mx) : min(mn), max(mx) {} bool operator<(const DepthMinMax &b) const { @@ -233,21 +253,15 @@ struct DepthMinMax { return min == b.min && max == b.max; } - bool operator!=(const DepthMinMax &b) const { - return !(*this == b); - } - #ifdef DUMP_SUPPORT /** \brief Render as a string, useful for debugging. */ std::string str() const; #endif + }; inline size_t hash_value(const DepthMinMax &d) { - size_t val = 0; - boost::hash_combine(val, d.min); - boost::hash_combine(val, d.max); - return val; + return hash_all(d.min, d.max); } /** diff --git a/src/util/dump_charclass.cpp b/src/util/dump_charclass.cpp index 4c159ec24..4535777d1 100644 --- a/src/util/dump_charclass.cpp +++ b/src/util/dump_charclass.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -249,15 +249,6 @@ string describeClass(const CharReach &cr, size_t maxLength, return oss.str(); } -string describeClasses(const std::vector &v, size_t maxClassLength, - enum cc_output_t out_type) { - std::ostringstream oss; - for (const auto &cr : v) { - describeClass(oss, cr, maxClassLength, out_type); - } - return oss.str(); -} - // C stdio wrapper void describeClass(FILE *f, const CharReach &cr, size_t maxLength, enum cc_output_t out_type) { diff --git a/src/util/dump_charclass.h b/src/util/dump_charclass.h index 45b707f1e..999641340 100644 --- a/src/util/dump_charclass.h +++ b/src/util/dump_charclass.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,7 @@ #include #include +#include #include #include @@ -55,9 +56,16 @@ void describeClass(std::ostream &os, const CharReach &cr, size_t maxLength = 16, std::string describeClass(const CharReach &cr, size_t maxLength = 16, enum cc_output_t out_type = CC_OUT_TEXT); -std::string describeClasses(const std::vector &v, +template +std::string describeClasses(const Container &container, size_t maxClassLength = 16, - enum cc_output_t out_type = CC_OUT_TEXT); + enum cc_output_t out_type = CC_OUT_TEXT) { + std::ostringstream oss; + for (const CharReach &cr : container) { + describeClass(oss, cr, maxClassLength, out_type); + } + return oss.str(); +} void describeClass(FILE *f, const CharReach &cr, size_t maxLength, enum cc_output_t out_type); diff --git a/src/util/dump_util.cpp b/src/util/dump_util.cpp index 5b961367c..782cba7a3 100644 --- a/src/util/dump_util.cpp +++ b/src/util/dump_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,8 @@ using 
namespace std; +namespace ue2 { + FILE *fopen_or_throw(const char *path, const char *mode) { FILE *f = fopen(path, mode); if (!f) { @@ -40,3 +42,5 @@ FILE *fopen_or_throw(const char *path, const char *mode) { } return f; } + +} // namespace ue2 diff --git a/src/util/dump_util.h b/src/util/dump_util.h index 487d2e7c3..f5ebe94a5 100644 --- a/src/util/dump_util.h +++ b/src/util/dump_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,9 +31,13 @@ #include +namespace ue2 { + /** * Same as fopen(), but on error throws an exception rather than returning NULL. */ FILE *fopen_or_throw(const char *path, const char *mode); +} // namespace ue2 + #endif diff --git a/src/util/hash.h b/src/util/hash.h index 0b5717729..6f76e43de 100644 --- a/src/util/hash.h +++ b/src/util/hash.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ #ifndef UTIL_HASH_H #define UTIL_HASH_H +#include #include namespace ue2 { @@ -69,6 +70,15 @@ size_t hash_all(Args&&... args) { return v; } +/** + * \brief Compute the hash of all the elements of any range on which we can + * call std::begin() and std::end(). + */ +template +size_t hash_range(const Range &r) { + return boost::hash_range(std::begin(r), std::end(r)); +} + } // namespace ue2 #endif // UTIL_HASH_H diff --git a/src/util/hash_dynamic_bitset.h b/src/util/hash_dynamic_bitset.h new file mode 100644 index 000000000..315aed34f --- /dev/null +++ b/src/util/hash_dynamic_bitset.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Hashing utility functions. 
+ */ + +#ifndef UTIL_HASH_DYNAMIC_BITSET_H +#define UTIL_HASH_DYNAMIC_BITSET_H + +#include +#include + +#include + +namespace ue2 { + +/** + * \brief An output iterator which calculates the combined hash of all elements + * written to it. + * + * The location to output the hash is provided to the constructor and should + * already be zero initialised. + */ +struct hash_output_it { + using value_type = void; + using difference_type = ptrdiff_t; + using pointer = void *; + using reference = void; + using iterator_category = std::output_iterator_tag; + + hash_output_it(size_t *hash_out = nullptr) : out(hash_out) {} + hash_output_it &operator++() { + return *this; + } + hash_output_it &operator++(int) { + return *this; + } + + struct deref_proxy { + deref_proxy(size_t *hash_out) : out(hash_out) {} + + template + void operator=(const T &val) const { + boost::hash_combine(*out, val); + } + + private: + size_t *out; /* output location of the owning iterator */ + }; + + deref_proxy operator*() { return {out}; } + +private: + size_t *out; /* location to output the hashes to */ +}; + +/* Function object for hashing a dynamic bitset */ +struct hash_dynamic_bitset { + size_t operator()(const boost::dynamic_bitset<> &bs) const { + size_t rv = 0; + to_block_range(bs, hash_output_it(&rv)); + return rv; + } +}; + +} // namespace ue2 + +#endif diff --git a/unit/internal/multiaccel_shift.cpp b/src/util/intrinsics.h similarity index 59% rename from unit/internal/multiaccel_shift.cpp rename to src/util/intrinsics.h index d6019870d..edc4f6efb 100644 --- a/unit/internal/multiaccel_shift.cpp +++ b/src/util/intrinsics.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,56 +26,41 @@ * POSSIBILITY OF SUCH DAMAGE. */ +/** \file + * \brief Wrapper around the compiler supplied intrinsic header + */ + +#ifndef INTRINSICS_H +#define INTRINSICS_H + #include "config.h" -#include "src/ue2common.h" -#include "gtest/gtest.h" -#include "nfa/multiaccel_common.h" +#ifdef __cplusplus +# if defined(HAVE_CXX_X86INTRIN_H) +# define USE_X86INTRIN_H +# endif +#else // C +# if defined(HAVE_C_X86INTRIN_H) +# define USE_X86INTRIN_H +# endif +#endif -/* - * Unit tests for the shifters. - * - * This is a bit messy, as shifters are macros, so we're using macros to test - * other macros. 
- */ +#ifdef __cplusplus +# if defined(HAVE_CXX_INTRIN_H) +# define USE_INTRIN_H +# endif +#else // C +# if defined(HAVE_C_INTRIN_H) +# define USE_INTRIN_H +# endif +#endif -#define TEST_SHIFT(n) \ - do { \ - u64a val = ((u64a) 1 << n) - 1; \ - JOIN(SHIFT, n)(val); \ - ASSERT_EQ(val, 1); \ - } while (0) +#if defined(USE_X86INTRIN_H) +#include +#elif defined(USE_INTRIN_H) +#include +#else +#error no intrinsics file +#endif -TEST(MultiaccelShift, StaticShift) { - TEST_SHIFT(1); - TEST_SHIFT(2); - TEST_SHIFT(3); - TEST_SHIFT(4); - TEST_SHIFT(5); - TEST_SHIFT(6); - TEST_SHIFT(7); - TEST_SHIFT(8); - TEST_SHIFT(10); - TEST_SHIFT(11); - TEST_SHIFT(12); - TEST_SHIFT(13); - TEST_SHIFT(14); - TEST_SHIFT(15); - TEST_SHIFT(16); - TEST_SHIFT(17); - TEST_SHIFT(18); - TEST_SHIFT(19); - TEST_SHIFT(20); - TEST_SHIFT(21); - TEST_SHIFT(22); - TEST_SHIFT(23); - TEST_SHIFT(24); - TEST_SHIFT(25); - TEST_SHIFT(26); - TEST_SHIFT(27); - TEST_SHIFT(28); - TEST_SHIFT(29); - TEST_SHIFT(30); - TEST_SHIFT(31); - TEST_SHIFT(32); -} +#endif // INTRINSICS_H diff --git a/src/util/make_unique.h b/src/util/make_unique.h index 12148af1b..651e8c5cf 100644 --- a/src/util/make_unique.h +++ b/src/util/make_unique.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,9 +39,9 @@ namespace ue2 { #if defined(USE_STD) - using std::make_unique; +using std::make_unique; #else - using boost::make_unique; +using boost::make_unique; #endif } diff --git a/src/util/masked_move.c b/src/util/masked_move.c index ec788db7c..001cd49f2 100644 --- a/src/util/masked_move.c +++ b/src/util/masked_move.c @@ -29,8 +29,9 @@ #include "ue2common.h" #include "masked_move.h" +#include "util/arch.h" -#if defined(__AVX2__) +#if defined(HAVE_AVX2) /* masks for masked moves */ /* magic mask for maskload (vmmaskmovq) - described in UE-2424 */ diff --git a/src/util/masked_move.h b/src/util/masked_move.h index 09276e802..4c877ca9e 100644 --- a/src/util/masked_move.h +++ b/src/util/masked_move.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +29,9 @@ #ifndef MASKED_MOVE_H #define MASKED_MOVE_H -#if defined(__AVX2__) +#include "arch.h" + +#if defined(HAVE_AVX2) #include "unaligned.h" #include "simd_utils.h" @@ -68,7 +70,8 @@ masked_move256_len(const u8 *buf, const u32 len) { u32 end = unaligned_load_u32(buf + len - 4); m256 preshufend = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(end)); m256 v = _mm256_maskload_epi32((const int *)buf, lmask); - m256 shufend = vpshufb(preshufend, loadu256(&mm_shuffle_end[len - 4])); + m256 shufend = pshufb_m256(preshufend, + loadu256(&mm_shuffle_end[len - 4])); m256 target = or256(v, shufend); return target; diff --git a/src/util/math.h b/src/util/math.h new file mode 100644 index 000000000..e18c50277 --- /dev/null +++ b/src/util/math.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_MATH_H_ +#define UTIL_MATH_H_ + +#include "arch.h" +#include "intrinsics.h" + +#include + +static really_inline +double our_pow(double x, double y) { +#if defined(HAVE_AVX) + /* + * Clear the upper half of AVX registers before calling into the math lib. + * On some versions of glibc this can save thousands of AVX-to-SSE + * transitions. + */ + _mm256_zeroupper(); +#endif + return pow(x, y); +} + +#endif // UTIL_MATH_H_ diff --git a/src/util/multibit_build.cpp b/src/util/multibit_build.cpp index 5fe2d6172..ad6a0d6a6 100644 --- a/src/util/multibit_build.cpp +++ b/src/util/multibit_build.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -155,9 +155,9 @@ void bfs(vector &out, const TreeNode &tree) { /** \brief Construct a sparse iterator over the values in \a bits for a * multibit of size \a total_bits. */ -void mmbBuildSparseIterator(vector &out, - const vector &bits, u32 total_bits) { - assert(out.empty()); +vector mmbBuildSparseIterator(const vector &bits, + u32 total_bits) { + vector out; assert(!bits.empty()); assert(total_bits > 0); assert(total_bits <= MMB_MAX_BITS); @@ -186,6 +186,7 @@ void mmbBuildSparseIterator(vector &out, #endif DEBUG_PRINTF("iter has %zu records\n", out.size()); + return out; } template @@ -272,7 +273,7 @@ void mmbBuildInitRangePlan(u32 total_bits, u32 begin, u32 end, } // Partial block to deal with beginning. 
- block_offset += k1 / MMB_KEY_BITS; + block_offset += (k1 / MMB_KEY_BITS) * sizeof(MMB_TYPE); if (k1 % MMB_KEY_BITS) { u32 idx = k1 / MMB_KEY_BITS; u32 block_end = (idx + 1) * MMB_KEY_BITS; diff --git a/src/util/multibit_build.h b/src/util/multibit_build.h index 951f1fb46..2d7b5fc26 100644 --- a/src/util/multibit_build.h +++ b/src/util/multibit_build.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -61,8 +61,8 @@ u32 mmbit_size(u32 total_bits); /** \brief Construct a sparse iterator over the values in \a bits for a * multibit of size \a total_bits. */ -void mmbBuildSparseIterator(std::vector &out, - const std::vector &bits, u32 total_bits); +std::vector +mmbBuildSparseIterator(const std::vector &bits, u32 total_bits); struct scatter_plan_raw; diff --git a/src/util/noncopyable.h b/src/util/noncopyable.h new file mode 100644 index 000000000..cd4f2e026 --- /dev/null +++ b/src/util/noncopyable.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Class that makes derived classes non-copyable. + */ + +#ifndef UTIL_NONCOPYABLE_H +#define UTIL_NONCOPYABLE_H + +namespace ue2 { + +/** \brief Class that makes derived classes non-copyable. 
*/ +struct noncopyable { + noncopyable() = default; + noncopyable(const noncopyable &) = delete; + noncopyable(noncopyable &&) = default; + noncopyable &operator=(const noncopyable &) = delete; + noncopyable &operator=(noncopyable &&) = default; +}; + +} // namespace ue2 + +#endif // UTIL_NONCOPYABLE_H diff --git a/src/util/operators.h b/src/util/operators.h new file mode 100644 index 000000000..b0a1c1cca --- /dev/null +++ b/src/util/operators.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \brief Ordered operators: provides all the other compare operators for types + * that provide equal and less-than. + * + * This is similar to Boost's totally_ordered class, but much simpler and + * without injecting the boost namespace into ADL lookup. + */ + +#ifndef UTIL_OPERATORS_H +#define UTIL_OPERATORS_H + +namespace ue2 { + +/** + * \brief Ordered operators: provides all the other compare operators for types + * that provide equal and less-than. + * + * Simply inherit from this class with your class name as its template + * parameter. 
+ */ +template +class totally_ordered { +public: + friend bool operator!=(const T &a, const T &b) { return !(a == b); } + friend bool operator<=(const T &a, const T &b) { return !(b < a); } + friend bool operator>(const T &a, const T &b) { return b < a; } + friend bool operator>=(const T &a, const T &b) { return !(a < b); } +}; + +} // namespace + +#endif // UTIL_OPERATORS_H diff --git a/src/util/partitioned_set.h b/src/util/partitioned_set.h index 8f92a8b75..a9e4644d1 100644 --- a/src/util/partitioned_set.h +++ b/src/util/partitioned_set.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,13 +30,13 @@ #define PARTITIONED_SET_H #include "container.h" +#include "noncopyable.h" #include "ue2_containers.h" #include "ue2common.h" #include #include -#include #include namespace ue2 { @@ -53,7 +53,7 @@ static constexpr size_t INVALID_SUBSET = ~(size_t)0; */ template -class partitioned_set : boost::noncopyable { +class partitioned_set : noncopyable { public: class subset { public: diff --git a/src/util/popcount.h b/src/util/popcount.h index d882a6720..eb08f6b1b 100644 --- a/src/util/popcount.h +++ b/src/util/popcount.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,17 +30,11 @@ * \brief Platform specific popcount functions */ -#ifndef POPCOUNT_H_075D843B4545B6 -#define POPCOUNT_H_075D843B4545B6 +#ifndef UTIL_POPCOUNT_H_ +#define UTIL_POPCOUNT_H_ #include "ue2common.h" - -// We have a native popcount where the compiler has defined __POPCNT__. 
-#if defined(__POPCNT__) -#define HAVE_POPCOUNT_INSTR -#elif defined(_WIN32) && defined(__AVX__) // TODO: fix win preproc -#define HAVE_POPCOUNT_INSTR -#endif +#include "util/arch.h" static really_inline u32 popcount32(u32 x) { @@ -76,5 +70,5 @@ u32 popcount64(u64a x) { #endif } -#endif /* POPCOUNT_H_075D843B4545B6 */ +#endif /* UTIL_POPCOUNT_H_ */ diff --git a/src/util/queue_index_factory.h b/src/util/queue_index_factory.h index 1360beef5..e8f7028ec 100644 --- a/src/util/queue_index_factory.h +++ b/src/util/queue_index_factory.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,12 +33,11 @@ #define UTIL_QUEUE_INDEX_FACTORY_H #include "ue2common.h" - -#include +#include "util/noncopyable.h" namespace ue2 { -class QueueIndexFactory : boost::noncopyable { +class QueueIndexFactory : noncopyable { public: QueueIndexFactory() : val(0) {} u32 get_queue() { return val++; } diff --git a/src/util/report.h b/src/util/report.h index 24ecca9d4..a8e233ffd 100644 --- a/src/util/report.h +++ b/src/util/report.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,9 +34,10 @@ #ifndef UTIL_REPORT_H #define UTIL_REPORT_H -#include "util/exhaust.h" // for INVALID_EKEY -#include "order_check.h" #include "ue2common.h" +#include "util/exhaust.h" // for INVALID_EKEY +#include "util/hash.h" +#include "util/order_check.h" #include @@ -195,6 +196,23 @@ bool operator<(const Report &a, const Report &b) { return false; } +inline +bool operator==(const Report &a, const Report &b) { + return a.type == b.type && a.quashSom == b.quashSom && + a.minOffset == b.minOffset && a.maxOffset == b.maxOffset && + a.minLength == b.minLength && a.ekey == b.ekey && + a.offsetAdjust == b.offsetAdjust && a.onmatch == b.onmatch && + a.revNfaIndex == b.revNfaIndex && a.somDistance == b.somDistance && + a.topSquashDistance == b.topSquashDistance; +} + +inline +size_t hash_value(const Report &r) { + return hash_all(r.type, r.quashSom, r.minOffset, r.maxOffset, r.minLength, + r.ekey, r.offsetAdjust, r.onmatch, r.revNfaIndex, + r.somDistance, r.topSquashDistance); +} + static inline Report makeECallback(u32 report, s32 offsetAdjust, u32 ekey) { Report ir(EXTERNAL_CALLBACK, report); diff --git a/src/util/report_manager.cpp b/src/util/report_manager.cpp index 8377ea036..a846eb25e 100644 --- a/src/util/report_manager.cpp +++ b/src/util/report_manager.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,9 +29,12 @@ /** \file * \brief ReportManager: tracks Report structures, exhaustion and dedupe keys. 
*/ -#include "grey.h" + #include "report_manager.h" + +#include "grey.h" #include "ue2common.h" +#include "compiler/compiler.h" #include "nfagraph/ng.h" #include "rose/rose_build.h" #include "util/compile_error.h" @@ -64,7 +67,7 @@ u32 ReportManager::getInternalId(const Report &ir) { u32 size = reportIds.size(); reportIds.push_back(ir); - reportIdToInternalMap[ir] = size; + reportIdToInternalMap.emplace(ir, size); DEBUG_PRINTF("new report %u\n", size); return size; } @@ -171,8 +174,9 @@ u32 ReportManager::getDkey(const Report &r) const { void ReportManager::registerExtReport(ReportID id, const external_report_info &ext) { - if (contains(externalIdMap, id)) { - const external_report_info &eri = externalIdMap.at(id); + auto it = externalIdMap.find(id); + if (it != externalIdMap.end()) { + const external_report_info &eri = it->second; if (eri.highlander != ext.highlander) { /* we have a problem */ ostringstream out; @@ -201,20 +205,21 @@ void ReportManager::registerExtReport(ReportID id, } } -Report ReportManager::getBasicInternalReport(const NGWrapper &g, s32 adj) { +Report ReportManager::getBasicInternalReport(const ExpressionInfo &expr, + s32 adj) { /* validate that we are not violating highlander constraints, this will * throw a CompileError if so. */ - registerExtReport(g.reportId, - external_report_info(g.highlander, g.expressionIndex)); + registerExtReport(expr.report, + external_report_info(expr.highlander, expr.index)); /* create the internal report */ u32 ekey = INVALID_EKEY; - if (g.highlander) { + if (expr.highlander) { /* all patterns with the same report id share an ekey */ - ekey = getExhaustibleKey(g.reportId); + ekey = getExhaustibleKey(expr.report); } - return makeECallback(g.reportId, adj, ekey); + return makeECallback(expr.report, adj, ekey); } void ReportManager::setProgramOffset(ReportID id, u32 programOffset) { diff --git a/src/util/report_manager.h b/src/util/report_manager.h index 0eed2711b..95e14a2c3 100644 --- a/src/util/report_manager.h +++ b/src/util/report_manager.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -36,18 +36,19 @@ #include "ue2common.h" #include "util/compile_error.h" +#include "util/noncopyable.h" #include "util/report.h" +#include "util/ue2_containers.h" #include #include #include -#include namespace ue2 { struct Grey; class RoseBuild; -class NGWrapper; +class ExpressionInfo; struct external_report_info { external_report_info(bool h, u32 fpi) @@ -57,7 +58,7 @@ struct external_report_info { }; /** \brief Tracks Report structures, exhaustion and dedupe keys. */ -class ReportManager : boost::noncopyable { +class ReportManager : noncopyable { public: explicit ReportManager(const Grey &g); @@ -92,13 +93,13 @@ class ReportManager : boost::noncopyable { const std::vector &reports() const { return reportIds; } /** - * Get a simple internal report corresponding to the wrapper. An ekey will - * be setup as required. + * Get a simple internal report corresponding to the expression. An ekey + * will be setup if required. * * Note: this function may throw a CompileError if constraints on external * match id are violated (mixed highlander status for example). 
*/ - Report getBasicInternalReport(const NGWrapper &g, s32 adj = 0); + Report getBasicInternalReport(const ExpressionInfo &expr, s32 adj = 0); /** \brief Register an external report and validate that we are not * violating highlander constraints (which will cause an exception to be @@ -129,18 +130,18 @@ class ReportManager : boost::noncopyable { std::vector reportIds; /** \brief Mapping from Report to ID (inverse of \ref reportIds - * vector). */ - std::map reportIdToInternalMap; + * vector). */ + unordered_map reportIdToInternalMap; /** \brief Mapping from ReportID to dedupe key. */ - std::map reportIdToDedupeKey; + unordered_map reportIdToDedupeKey; /** \brief Mapping from ReportID to Rose program offset in bytecode. */ - std::map reportIdToProgramOffset; + unordered_map reportIdToProgramOffset; /** \brief Mapping from external match ids to information about that * id. */ - std::map externalIdMap; + unordered_map externalIdMap; /** \brief Mapping from expression index to exhaustion key. */ std::map toExhaustibleKeyMap; diff --git a/src/util/simd_types.h b/src/util/simd_types.h index d6e5d6a3e..962cad6c9 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,52 +30,28 @@ #define SIMD_TYPES_H #include "config.h" +#include "util/arch.h" +#include "util/intrinsics.h" #include "ue2common.h" -// more recent headers are bestest, but only if we can use them -#ifdef __cplusplus -# if defined(HAVE_CXX_X86INTRIN_H) -# define USE_X86INTRIN_H -# endif -#else // C -# if defined(HAVE_C_X86INTRIN_H) -# define USE_X86INTRIN_H -# endif -#endif - -#ifdef __cplusplus -# if defined(HAVE_CXX_INTRIN_H) -# define USE_INTRIN_H -# endif -#else // C -# if defined(HAVE_C_INTRIN_H) -# define USE_INTRIN_H -# endif -#endif - -#if defined(USE_X86INTRIN_H) -#include -#elif defined(USE_INTRIN_H) -#include -#else -#error no intrinsics! -#endif - -#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) +#if defined(HAVE_SSE2) typedef __m128i m128; #else typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; #endif -#if defined(__AVX2__) +#if defined(HAVE_AVX2) typedef __m256i m256; #else -typedef ALIGN_AVX_DIRECTIVE struct {m128 lo; m128 hi;} m256; +typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; #endif -// these should align to 16 and 32 respectively typedef struct {m128 lo; m128 mid; m128 hi;} m384; -typedef struct {m256 lo; m256 hi;} m512; +#if defined(HAVE_AVX512) +typedef __m512i m512; +#else +typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; +#endif #endif /* SIMD_TYPES_H */ diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c index 54b5b4baa..25a81412e 100644 --- a/src/util/simd_utils.c +++ b/src/util/simd_utils.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -49,6 +49,7 @@ ALIGN_CL_DIRECTIVE const char vbs_mask_data[] = { /** \brief LUT for the mask1bit functions. 
*/ ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, ZEROES_31, 0x01, ZEROES_32, ZEROES_31, 0x02, ZEROES_32, ZEROES_31, 0x04, ZEROES_32, @@ -57,4 +58,5 @@ ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = { ZEROES_31, 0x20, ZEROES_32, ZEROES_31, 0x40, ZEROES_32, ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, }; diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index e8676249a..047cdbab1 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,40 +38,13 @@ #endif #include "config.h" -#include // for memcpy - -// more recent headers are bestest, but only if we can use them -#ifdef __cplusplus -# if defined(HAVE_CXX_X86INTRIN_H) -# define USE_X86INTRIN_H -# endif -#else // C -# if defined(HAVE_C_X86INTRIN_H) -# define USE_X86INTRIN_H -# endif -#endif - -#ifdef __cplusplus -# if defined(HAVE_CXX_INTRIN_H) -# define USE_INTRIN_H -# endif -#else // C -# if defined(HAVE_C_INTRIN_H) -# define USE_INTRIN_H -# endif -#endif - -#if defined(USE_X86INTRIN_H) -#include -#elif defined(USE_INTRIN_H) -#include -#else -#error no intrins! -#endif - #include "ue2common.h" #include "simd_types.h" #include "unaligned.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include // for memcpy // Define a common assume_aligned using an appropriate compiler built-in, if // it's available. Note that we need to handle C or C++ compilation. @@ -141,7 +114,7 @@ static really_inline u32 diffrich128(m128 a, m128 b) { * returns a 4-bit mask indicating which 64-bit words contain differences. 
*/ static really_inline u32 diffrich64_128(m128 a, m128 b) { -#if defined(__SSE_41__) +#if defined(HAVE_SSE41) a = _mm_cmpeq_epi64(a, b); return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; #else @@ -150,7 +123,17 @@ static really_inline u32 diffrich64_128(m128 a, m128 b) { #endif } -#define lshift64_m128(a, b) _mm_slli_epi64((a), (b)) +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + #define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) @@ -180,25 +163,17 @@ static really_inline u64a movq(const m128 in) { /* another form of movq */ static really_inline m128 load_m128_from_u64a(const u64a *p) { -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) - /* unfortunately _mm_loadl_epi64() is best avoided as it seems to cause - * trouble on some older compilers, possibly because it is misdefined to - * take an m128 as its parameter */ - return _mm_set_epi64((__m64)0ULL, (__m64)*p); -#else - /* ICC doesn't like casting to __m64 */ - return _mm_loadl_epi64((const m128 *)p); -#endif + return _mm_set_epi64x(0LL, *p); } #define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) #define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) // TODO: this entire file needs restructuring - this carveout is awful #define extractlow64from256(a) movq(a.lo) #define extractlow32from256(a) movd(a.lo) -#if defined(__SSE4_1__) +#if defined(HAVE_SSE41) #define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) #define extract64from256(a, imm) _mm_extract_epi64((imm >> 2) ? 
a.hi : a.lo, imm % 2) #else @@ -275,7 +250,7 @@ extern const u8 simd_onebit_masks[]; static really_inline m128 mask1bit128(unsigned int n) { assert(n < sizeof(m128) * 8); - u32 mask_idx = ((n % 8) * 64) + 31; + u32 mask_idx = ((n % 8) * 64) + 95; mask_idx -= n / 8; return loadu128(&simd_onebit_masks[mask_idx]); } @@ -296,7 +271,7 @@ void clearbit128(m128 *ptr, unsigned int n) { static really_inline char testbit128(m128 val, unsigned int n) { const m128 mask = mask1bit128(n); -#if defined(__SSE4_1__) +#if defined(HAVE_SSE41) return !_mm_testz_si128(mask, val); #else return isnonzero128(and128(mask, val)); @@ -307,29 +282,41 @@ char testbit128(m128 val, unsigned int n) { #define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) static really_inline -m128 pshufb(m128 a, m128 b) { +m128 pshufb_m128(m128 a, m128 b) { m128 result; result = _mm_shuffle_epi8(a, b); return result; } static really_inline -m256 vpshufb(m256 a, m256 b) { -#if defined(__AVX2__) +m256 pshufb_m256(m256 a, m256 b) { +#if defined(HAVE_AVX2) return _mm256_shuffle_epi8(a, b); #else m256 rv; - rv.lo = pshufb(a.lo, b.lo); - rv.hi = pshufb(a.hi, b.hi); + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); return rv; #endif } +#if defined(HAVE_AVX512) +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} +#endif + static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); - return pshufb(in, shift_mask); + return pshufb_m128(in, shift_mask); } static really_inline @@ -352,12 +339,28 @@ m128 sub_u8_m128(m128 a, m128 b) { return _mm_sub_epi8(a, b); } +static really_inline +m128 set64x2(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + /**** **** 256-bit Primitives ****/ -#if defined(__AVX2__) -#define lshift64_m256(a, b) _mm256_slli_epi64((a), (b)) +#if defined(HAVE_AVX2) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + #define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) static really_inline @@ -375,7 +378,7 @@ m256 set2x128(m128 a) { #else -static really_inline +static really_really_inline m256 lshift64_m256(m256 a, int b) { m256 rv = a; rv.lo = lshift64_m128(rv.lo, b); @@ -421,7 +424,7 @@ m256 set2x128(m128 a) { #endif static really_inline m256 zeroes256(void) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) return _mm256_setzero_si256(); #else m256 rv = {zeroes128(), zeroes128()}; @@ -430,7 +433,7 @@ static really_inline m256 zeroes256(void) { } static really_inline m256 ones256(void) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) m256 rv = _mm256_set1_epi8(0xFF); #else m256 rv = {ones128(), ones128()}; @@ -438,7 +441,7 @@ static really_inline m256 ones256(void) { return rv; } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) static really_inline m256 and256(m256 a, m256 b) { return _mm256_and_si256(a, b); } @@ -451,7 +454,7 @@ static really_inline m256 and256(m256 a, m256 b) { } #endif -#if defined(__AVX2__) +#if defined(HAVE_AVX2) static really_inline m256 or256(m256 a, m256 b) { return _mm256_or_si256(a, b); } @@ -464,7 +467,7 @@ static really_inline m256 or256(m256 a, m256 b) { } #endif -#if defined(__AVX2__) 
+#if defined(HAVE_AVX2) static really_inline m256 xor256(m256 a, m256 b) { return _mm256_xor_si256(a, b); } @@ -477,7 +480,7 @@ static really_inline m256 xor256(m256 a, m256 b) { } #endif -#if defined(__AVX2__) +#if defined(HAVE_AVX2) static really_inline m256 not256(m256 a) { return _mm256_xor_si256(a, ones256()); } @@ -490,7 +493,7 @@ static really_inline m256 not256(m256 a) { } #endif -#if defined(__AVX2__) +#if defined(HAVE_AVX2) static really_inline m256 andnot256(m256 a, m256 b) { return _mm256_andnot_si256(a, b); } @@ -504,7 +507,7 @@ static really_inline m256 andnot256(m256 a, m256 b) { #endif static really_inline int diff256(m256 a, m256 b) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); #else return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); @@ -512,7 +515,7 @@ static really_inline int diff256(m256 a, m256 b) { } static really_inline int isnonzero256(m256 a) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) return !!diff256(a, zeroes256()); #else return isnonzero128(or128(a.lo, a.hi)); @@ -524,7 +527,7 @@ static really_inline int isnonzero256(m256 a) { * mask indicating which 32-bit words contain differences. */ static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) a = _mm256_cmpeq_epi32(a, b); return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; #else @@ -548,7 +551,7 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) { // aligned load static really_inline m256 load256(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(__AVX2__) +#if defined(HAVE_AVX2) return _mm256_load_si256((const m256 *)ptr); #else m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; @@ -558,7 +561,7 @@ static really_inline m256 load256(const void *ptr) { // aligned load of 128-bit value to low and high part of 256-bit value static really_inline m256 load2x128(const void *ptr) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) return set2x128(load128(ptr)); #else assert(ISALIGNED_N(ptr, alignof(m128))); @@ -575,7 +578,7 @@ static really_inline m256 loadu2x128(const void *ptr) { // aligned store static really_inline void store256(void *ptr, m256 a) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(__AVX2__) +#if defined(HAVE_AVX2) _mm256_store_si256((m256 *)ptr, a); #else ptr = assume_aligned(ptr, 16); @@ -585,7 +588,7 @@ static really_inline void store256(void *ptr, m256 a) { // unaligned load static really_inline m256 loadu256(const void *ptr) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) return _mm256_loadu_si256((const m256 *)ptr); #else m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; @@ -595,7 +598,7 @@ static really_inline m256 loadu256(const void *ptr) { // unaligned store static really_inline void storeu256(void *ptr, m256 a) { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) _mm256_storeu_si256((m256 *)ptr, a); #else storeu128(ptr, a.lo); @@ -622,12 +625,24 @@ m256 loadbytes256(const void *ptr, unsigned int n) { static really_inline m256 mask1bit256(unsigned int n) { assert(n < sizeof(m256) * 8); - u32 mask_idx = ((n % 8) * 64) + 31; + u32 mask_idx = ((n % 8) * 64) + 95; mask_idx -= n / 8; return loadu256(&simd_onebit_masks[mask_idx]); } -#if !defined(__AVX2__) +static really_inline +m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +#else + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +#endif +} + 
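The change of the `mask1bit` base index from 31 to 95 above is tied to the `simd_onebit_masks` edit in `simd_utils.c`: the table gains two `ZEROES_32` rows (64 zero bytes) at each end so that the 64-byte unaligned load used by the AVX-512 `mask1bit512()` later in this file can start anywhere in the old table without reading out of bounds, and the base offset therefore shifts by the 64 bytes of new leading padding (31 + 64 = 95). A standalone sketch of the indexing, using local stand-in names rather than the library's:

```c
/* Sketch only: models the padded one-bit mask table and checks that the
 * post-patch index arithmetic always yields an in-bounds 64-byte window
 * holding exactly bit n. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

/* 8 rows of 64 bytes (one per bit-in-byte position), padded with 64
 * zero bytes at each end, mirroring the added ZEROES_32 rows. */
static uint8_t masks[64 + 8 * 64 + 64];

int main(void) {
    memset(masks, 0, sizeof(masks));
    for (int k = 0; k < 8; k++) {
        /* each row is ZEROES_31, the byte (1 << k), ZEROES_32 */
        masks[64 + k * 64 + 31] = (uint8_t)(1 << k);
    }
    for (unsigned n = 0; n < 512; n++) {
        /* same arithmetic as mask1bit128/256/512 after this patch */
        uint32_t mask_idx = ((n % 8) * 64) + 95 - (n / 8);
        /* a 64-byte loadu512 from mask_idx stays inside the table... */
        assert(mask_idx + 64 <= sizeof(masks));
        /* ...and the window contains exactly the byte for bit n */
        for (unsigned byte = 0; byte < 64; byte++) {
            uint8_t want = (byte == n / 8) ? (uint8_t)(1 << (n % 8)) : 0;
            assert(masks[mask_idx + byte] == want);
        }
    }
    return 0;
}
```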
+#if !defined(HAVE_AVX2) // switches on bit N in the given vector. static really_inline void setbit256(m256 *ptr, unsigned int n) { @@ -782,7 +797,6 @@ static really_inline m384 andnot384(m384 a, m384 b) { return rv; } -// The shift amount is an immediate static really_really_inline m384 lshift64_m384(m384 a, unsigned b) { m384 rv; @@ -920,42 +934,119 @@ char testbit384(m384 val, unsigned int n) { **** 512-bit Primitives ****/ -static really_inline m512 and512(m512 a, m512 b) { +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { +#if defined(HAVE_AVX512) + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +#else + m512 rv = {ones256(), ones256()}; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 set64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} +#endif + +static really_inline +m512 and512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_and_si512(a, b); +#else m512 rv; rv.lo = and256(a.lo, b.lo); rv.hi = and256(a.hi, b.hi); return rv; +#endif } -static really_inline m512 or512(m512 a, m512 b) { +static really_inline +m512 or512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_or_si512(a, b); +#else m512 rv; rv.lo = or256(a.lo, b.lo); rv.hi = or256(a.hi, b.hi); return rv; +#endif } -static really_inline m512 xor512(m512 a, m512 b) { +static really_inline +m512 xor512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, b); +#else m512 rv; rv.lo = xor256(a.lo, b.lo); rv.hi = xor256(a.hi, b.hi); return rv; +#endif } -static really_inline m512 not512(m512 a) { +static really_inline +m512 not512(m512 a) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, ones512()); +#else m512 rv; rv.lo = not256(a.lo); rv.hi = not256(a.hi); return rv; +#endif } -static really_inline m512 andnot512(m512 a, m512 b) { +static really_inline +m512 andnot512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_andnot_si512(a, b); +#else m512 rv; rv.lo = andnot256(a.lo, b.lo); rv.hi = andnot256(a.hi, b.hi); return rv; +#endif } -// The shift amount is an immediate +#if defined(HAVE_AVX512) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} +#else static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { m512 rv; @@ -963,29 +1054,37 @@ m512 lshift64_m512(m512 a, unsigned b) { rv.hi = lshift64_m256(a.hi, b); return rv; } +#endif -static really_inline m512 zeroes512(void) { - m512 rv = {zeroes256(), zeroes256()}; - return rv; -} +#if defined(HAVE_AVX512) +#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#endif -static really_inline m512 ones512(void) { - m512 rv = {ones256(), ones256()}; - return rv; -} +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif -static really_inline int diff512(m512 a, m512 b) 
{ +static really_inline +int diff512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +#else return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +#endif } -static really_inline int isnonzero512(m512 a) { -#if !defined(__AVX2__) +static really_inline +int isnonzero512(m512 a) { +#if defined(HAVE_AVX512) + return diff512(a, zeroes512()); +#elif defined(HAVE_AVX2) + m256 x = or256(a.lo, a.hi); + return !!diff256(x, zeroes256()); +#else m128 x = or128(a.lo.lo, a.lo.hi); m128 y = or128(a.hi.lo, a.hi.hi); return isnonzero128(or128(x, y)); -#else - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); #endif } @@ -993,8 +1092,11 @@ static really_inline int isnonzero512(m512 a) { * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit * mask indicating which 32-bit words contain differences. */ -static really_inline u32 diffrich512(m512 a, m512 b) { -#if defined(__AVX2__) +static really_inline +u32 diffrich512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +#elif defined(HAVE_AVX2) return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); #else a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); @@ -1011,22 +1113,32 @@ static really_inline u32 diffrich512(m512 a, m512 b) { * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and * returns a 16-bit mask indicating which 64-bit words contain differences. */ -static really_inline u32 diffrich64_512(m512 a, m512 b) { +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? u32 d = diffrich512(a, b); return (d | (d >> 1)) & 0x55555555; } // aligned load -static really_inline m512 load512(const void *ptr) { +static really_inline +m512 load512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_load_si512(ptr); +#else assert(ISALIGNED_N(ptr, alignof(m256))); m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; return rv; +#endif } // aligned store -static really_inline void store512(void *ptr, m512 a) { - assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(__AVX2__) +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); +#if defined(HAVE_AVX512) + return _mm512_store_si512(ptr, a); +#elif defined(HAVE_AVX2) m512 *x = (m512 *)ptr; store256(&x->lo, a.lo); store256(&x->hi, a.hi); @@ -1037,11 +1149,28 @@ static really_inline void store512(void *ptr, m512 a) { } // unaligned load -static really_inline m512 loadu512(const void *ptr) { +static really_inline +m512 loadu512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_loadu_si512(ptr); +#else m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); } +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} +#endif + // packed unaligned store of first N bytes static really_inline void storebytes512(void *ptr, m512 a, unsigned int n) { @@ -1058,11 +1187,19 @@ m512 loadbytes512(const void *ptr, unsigned int n) { return a; } +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + // switches on bit N in the given vector. 
static really_inline void setbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) m128 *sub; if (n < 128) { sub = &ptr->lo.lo; @@ -1074,6 +1211,8 @@ void setbit512(m512 *ptr, unsigned int n) { sub = &ptr->hi.hi; } setbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = or512(mask1bit512(n), *ptr); #else m256 *sub; if (n < 256) { @@ -1090,7 +1229,7 @@ void setbit512(m512 *ptr, unsigned int n) { static really_inline void clearbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) m128 *sub; if (n < 128) { sub = &ptr->lo.lo; @@ -1102,6 +1241,8 @@ void clearbit512(m512 *ptr, unsigned int n) { sub = &ptr->hi.hi; } clearbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = andnot512(mask1bit512(n), *ptr); #else m256 *sub; if (n < 256) { @@ -1118,7 +1259,7 @@ void clearbit512(m512 *ptr, unsigned int n) { static really_inline char testbit512(m512 val, unsigned int n) { assert(n < sizeof(val) * 8); -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) m128 sub; if (n < 128) { sub = val.lo.lo; @@ -1130,6 +1271,9 @@ char testbit512(m512 val, unsigned int n) { sub = val.hi.hi; } return testbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); #else m256 sub; if (n < 256) { diff --git a/src/nfa/multivermicelli.h b/src/util/small_vector.h similarity index 57% rename from src/nfa/multivermicelli.h rename to src/util/small_vector.h index 55f9b1f28..0b60d8c0f 100644 --- a/src/nfa/multivermicelli.h +++ b/src/util/small_vector.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,37 +26,37 @@ * POSSIBILITY OF SUCH DAMAGE. */ -#ifndef MULTIVERMICELLI_H_ -#define MULTIVERMICELLI_H_ +#ifndef UTIL_SMALL_VECTOR_H +#define UTIL_SMALL_VECTOR_H -#ifdef __cplusplus -extern "C" -{ +#include + +#include + +#if BOOST_VERSION >= 105800 +# define HAVE_BOOST_CONTAINER_SMALL_VECTOR #endif -const u8 *long_vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end, const u8 run_len); +#if defined(HAVE_BOOST_CONTAINER_SMALL_VECTOR) +# include +#endif -const u8 *longgrab_vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end, const u8 run_len); +namespace ue2 { -const u8 *shift_vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end, const u8 run_len); +#if defined(HAVE_BOOST_CONTAINER_SMALL_VECTOR) -const u8 *shiftgrab_vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end, const u8 run_len); +template > +using small_vector = boost::container::small_vector; -const u8 *doubleshift_vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end, const u8 run_len, - const u8 run2_len); +#else -const u8 *doubleshiftgrab_vermicelliExec(char c, char nocase, const u8 *buf, - const u8 *buf_end, const u8 run_len, - const u8 run2_len); +// Boost version isn't new enough, fall back to just using std::vector. 
+template > +using small_vector = std::vector; -#ifdef __cplusplus -} -#endif +#endif // HAVE_BOOST_CONTAINER_SMALL_VECTOR +} // namespace ue2 -#endif /* MULTIVERMICELLI_H_ */ +#endif // UTIL_SMALL_VECTOR_H diff --git a/src/util/state_compress.c b/src/util/state_compress.c index 2a821dad6..7238849e7 100644 --- a/src/util/state_compress.c +++ b/src/util/state_compress.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,6 +31,7 @@ */ #include "config.h" #include "ue2common.h" +#include "arch.h" #include "bitutils.h" #include "unaligned.h" #include "pack_bits.h" @@ -262,7 +263,7 @@ m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { expand32(v[4], m[4]), expand32(v[5], m[5]), expand32(v[6], m[6]), expand32(v[7], m[7]) }; -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; #else @@ -289,7 +290,7 @@ m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]), expand64(v[2], m[2]), expand64(v[3], m[3]) }; -#if !defined(__AVX2__) +#if !defined(HAVE_AVX2) m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), .hi = _mm_set_epi64x(x[3], x[2]) }; #else @@ -546,16 +547,21 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { expand32(v[14], m[14]), expand32(v[15], m[15]) }; m512 xvec; -#if !defined(__AVX2__) - xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); - xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); - xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); - xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); -#else +#if defined(HAVE_AVX512) + xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8], + x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#elif defined(HAVE_AVX2) xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4], x[3], x[2], x[1], x[0]); xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], x[11], x[10], x[9], x[8]); +#else + xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); + xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); + xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); + xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); #endif return xvec; } @@ -581,14 +587,17 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { expand64(v[4], m[4]), expand64(v[5], m[5]), expand64(v[6], m[6]), expand64(v[7], m[7]) }; -#if !defined(__AVX2__) +#if defined(HAVE_AVX512) + m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#elif defined(HAVE_AVX2) + m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), + .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; +#else m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), _mm_set_epi64x(x[3], x[2]) }, .hi = { _mm_set_epi64x(x[5], x[4]), _mm_set_epi64x(x[7], x[6]) } }; -#else - m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), - .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; #endif return xvec; } diff --git a/src/util/target_info.cpp b/src/util/target_info.cpp index 4eadec2d2..3a41e0207 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that 
the following conditions are met: @@ -46,6 +46,10 @@ bool target_t::can_run_on_code_built_for(const target_t &code_target) const { return false; } + if (!has_avx512() && code_target.has_avx512()) { + return false; + } + return true; } @@ -53,11 +57,15 @@ target_t::target_t(const hs_platform_info &p) : tune(p.tune), cpu_features(p.cpu_features) {} bool target_t::has_avx2(void) const { - return (cpu_features & HS_CPU_FEATURES_AVX2); + return cpu_features & HS_CPU_FEATURES_AVX2; +} + +bool target_t::has_avx512(void) const { + return cpu_features & HS_CPU_FEATURES_AVX512; } bool target_t::is_atom_class(void) const { - return tune == HS_TUNE_FAMILY_SLM; + return tune == HS_TUNE_FAMILY_SLM || tune == HS_TUNE_FAMILY_GLM; } } // namespace ue2 diff --git a/src/util/target_info.h b/src/util/target_info.h index 67b5b7d9e..794b29855 100644 --- a/src/util/target_info.h +++ b/src/util/target_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2016, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,6 +40,8 @@ struct target_t { bool has_avx2(void) const; + bool has_avx512(void) const; + bool is_atom_class(void) const; // This asks: can this target (the object) run on code that was built for diff --git a/src/util/ue2_containers.h b/src/util/ue2_containers.h index 5bbf4cfe9..29919c7e1 100644 --- a/src/util/ue2_containers.h +++ b/src/util/ue2_containers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,12 +30,15 @@ #define UTIL_UE2_CONTAINERS_H_ #include "ue2common.h" +#include "util/operators.h" +#include "util/small_vector.h" #include #include #include #include +#include #include #include #include @@ -57,8 +60,8 @@ class iter_wrapper : public boost::iterator_facade, Value, boost::random_access_traversal_tag> { public: - iter_wrapper() {} - explicit iter_wrapper(const WrappedIter &it_in) : it(it_in) {} + iter_wrapper() = default; + explicit iter_wrapper(WrappedIter it_in) : it(std::move(it_in)) {} // Templated copy-constructor to allow for interoperable iterator and // const_iterator. @@ -67,10 +70,10 @@ class iter_wrapper public: template - iter_wrapper(const iter_wrapper &other, + iter_wrapper(iter_wrapper other, typename std::enable_if::value>::type * = nullptr) - : it(other.it) {} + : it(std::move(other.it)) {} WrappedIter get() const { return it; } @@ -90,6 +93,58 @@ class iter_wrapper Value &dereference() const { return *it; } }; +template +class flat_base { +protected: + // Underlying storage is a small vector with local space for one element. + using storage_type = small_vector; + using storage_alloc_type = typename storage_type::allocator_type; + + // Putting our storage and comparator in a tuple allows us to make use of + // the empty base class optimization (if this STL implements it for + // std::tuple). 
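The tuple trick in the comment above is worth spelling out: because `std::less<T>` is an empty class, holding it as a named member costs at least one byte plus padding, whereas packing it into a `std::tuple` alongside the vector lets a standard library that applies the empty base optimization to tuple elements store it for free. A minimal illustration (the type names are local to the example, and the saving depends on the standard library):

```cpp
// Illustrative only: compares the footprint of a comparator held as a
// plain member against one packed into a std::tuple. Whether the tuple
// is smaller depends on the implementation's use of the empty base
// optimization for tuple elements.
#include <functional>
#include <iostream>
#include <tuple>
#include <vector>

struct PlainMember {
    std::vector<int> data;
    std::less<int> comp; // empty class, but still occupies >= 1 byte
};

struct TuplePacked {
    std::tuple<std::vector<int>, std::less<int>> storage;
};

int main() {
    // With EBO, the empty std::less<int> adds nothing to the tuple,
    // so TuplePacked is typically the size of the vector alone.
    std::cout << "vector alone: " << sizeof(std::vector<int>) << '\n'
              << "plain member: " << sizeof(PlainMember) << '\n'
              << "tuple packed: " << sizeof(TuplePacked) << '\n';
    return 0;
}
```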
+    std::tuple<storage_type, Compare> storage;
+
+    flat_base(const Compare &compare, const Allocator &alloc)
+        : storage(storage_type(storage_alloc_type(alloc)), compare) {}
+
+    storage_type &data() { return std::get<0>(this->storage); }
+    const storage_type &data() const { return std::get<0>(this->storage); }
+
+    Compare &comp() { return std::get<1>(this->storage); }
+    const Compare &comp() const { return std::get<1>(this->storage); }
+
+public:
+    // Common member types.
+    using key_compare = Compare;
+
+    Allocator get_allocator() const {
+        return data().get_allocator();
+    }
+
+    key_compare key_comp() const {
+        return comp();
+    }
+
+    // Capacity.
+
+    bool empty() const { return data().empty(); }
+    size_t size() const { return data().size(); }
+    size_t max_size() const { return data().max_size(); }
+
+    // Modifiers.
+
+    void clear() {
+        data().clear();
+    }
+
+    void swap(flat_base &a) {
+        using std::swap;
+        swap(comp(), a.comp());
+        swap(data(), a.data());
+    }
+};
+
 } // namespace flat_detail

 /**
@@ -102,33 +157,35 @@
 */
 template <class T, class Compare = std::less<T>,
          class Allocator = std::allocator<T>>
-class flat_set {
-    // Underlying storage is a sorted std::vector.
-    using StorageT = std::vector<T, Allocator>;
-
-    Compare comp;
-    StorageT data;
+class flat_set
+    : public flat_detail::flat_base<T, Compare, Allocator>,
+      public totally_ordered<flat_set<T, Compare, Allocator>> {
+    using base_type = flat_detail::flat_base<T, Compare, Allocator>;
+    using storage_type = typename base_type::storage_type;
+    using base_type::data;
+    using base_type::comp;

 public:
     // Member types.
     using key_type = T;
     using value_type = T;
-    using size_type = typename StorageT::size_type;
-    using difference_type = typename StorageT::difference_type;
-    using key_compare = Compare;
+    using size_type = typename storage_type::size_type;
+    using difference_type = typename storage_type::difference_type;
+    using key_compare = typename base_type::key_compare;
     using value_compare = Compare;
     using allocator_type = Allocator;
     using reference = value_type &;
     using const_reference = const value_type &;
-    using pointer = typename std::allocator_traits<Allocator>::pointer;
-    using const_pointer = typename std::allocator_traits<Allocator>::const_pointer;
+    using allocator_traits_type = typename std::allocator_traits<Allocator>;
+    using pointer = typename allocator_traits_type::pointer;
+    using const_pointer = typename allocator_traits_type::const_pointer;

     // Iterator types.

-    using iterator = flat_detail::iter_wrapper<typename StorageT::iterator,
+    using iterator = flat_detail::iter_wrapper<typename storage_type::iterator,
                                                value_type>;
     using const_iterator =
-        flat_detail::iter_wrapper<typename StorageT::const_iterator,
+        flat_detail::iter_wrapper<typename storage_type::const_iterator,
                                   const value_type>;

     using reverse_iterator = std::reverse_iterator<iterator>;
@@ -138,19 +195,19 @@ class flat_set {
     flat_set(const Compare &compare = Compare(),
              const Allocator &alloc = Allocator())
-        : comp(compare), data(alloc) {}
+        : base_type(compare, alloc) {}

     template <class InputIt>
     flat_set(InputIt first, InputIt last, const Compare &compare = Compare(),
              const Allocator &alloc = Allocator())
-        : comp(compare), data(alloc) {
+        : flat_set(compare, alloc) {
         insert(first, last);
     }

     flat_set(std::initializer_list<value_type> init,
              const Compare &compare = Compare(),
              const Allocator &alloc = Allocator())
-        : comp(compare), data(alloc) {
+        : flat_set(compare, alloc) {
         insert(init.begin(), init.end());
     }
@@ -159,20 +216,14 @@ class flat_set {
     flat_set &operator=(const flat_set &) = default;
     flat_set &operator=(flat_set &&) = default;

-    // Other members.
-
-    allocator_type get_allocator() const {
-        return data.get_allocator();
-    }
-
     // Iterators.
-    iterator begin() { return iterator(data.begin()); }
-    const_iterator cbegin() const { return const_iterator(data.cbegin()); }
+    iterator begin() { return iterator(data().begin()); }
+    const_iterator cbegin() const { return const_iterator(data().cbegin()); }
     const_iterator begin() const { return cbegin(); }

-    iterator end() { return iterator(data.end()); }
-    const_iterator cend() const { return const_iterator(data.cend()); }
+    iterator end() { return iterator(data().end()); }
+    const_iterator cend() const { return const_iterator(data().cend()); }
     const_iterator end() const { return cend(); }

     reverse_iterator rbegin() { return reverse_iterator(end()); }
@@ -187,22 +238,12 @@ class flat_set {
     }
     const_reverse_iterator rend() const { return crend(); }

-    // Capacity.
-
-    bool empty() const { return data.empty(); }
-    size_t size() const { return data.size(); }
-    size_t max_size() const { return data.max_size(); }
-
     // Modifiers.

-    void clear() {
-        data.clear();
-    }
-
     std::pair<iterator, bool> insert(const value_type &value) {
-        auto it = std::lower_bound(data.begin(), data.end(), value, comp);
-        if (it == data.end() || comp(value, *it)) {
-            return std::make_pair(iterator(data.insert(it, value)), true);
+        auto it = std::lower_bound(data().begin(), data().end(), value, comp());
+        if (it == data().end() || comp()(value, *it)) {
+            return std::make_pair(iterator(data().insert(it, value)), true);
         }
         return std::make_pair(iterator(it), false);
     }
@@ -212,9 +253,9 @@ class flat_set {
     }

     std::pair<iterator, bool> insert(value_type &&value) {
-        auto it = std::lower_bound(data.begin(), data.end(), value, comp);
-        if (it == data.end() || comp(value, *it)) {
-            return std::make_pair(iterator(data.insert(it, std::move(value))),
+        auto it = std::lower_bound(data().begin(), data().end(), value, comp());
+        if (it == data().end() || comp()(value, *it)) {
+            return std::make_pair(iterator(data().insert(it, std::move(value))),
                                   true);
         }
         return std::make_pair(iterator(it), false);
@@ -240,12 +281,12 @@ class flat_set {
         return insert(value_type(std::forward<Args>(args)...));
     }

-    void erase(iterator pos) {
-        data.erase(pos.get());
+    void erase(const_iterator pos) {
+        data().erase(pos.get());
     }

-    void erase(iterator first, iterator last) {
-        data.erase(first.get(), last.get());
+    void erase(const_iterator first, const_iterator last) {
+        data().erase(first.get(), last.get());
     }

     void erase(const key_type &key) {
@@ -255,12 +296,6 @@ class flat_set {
         }
     }

-    void swap(flat_set &a) {
-        using std::swap;
-        swap(comp, a.comp);
-        swap(data, a.data);
-    }
-
     // Lookup.

     size_type count(const value_type &value) const {
@@ -268,61 +303,50 @@ class flat_set {
     }

     iterator find(const value_type &value) {
-        auto it = std::lower_bound(data.begin(), data.end(), value, comp);
-        if (it != data.end() && comp(value, *it)) {
-            it = data.end();
+        auto it = std::lower_bound(data().begin(), data().end(), value, comp());
+        if (it != data().end() && comp()(value, *it)) {
+            it = data().end();
         }
         return iterator(it);
     }

     const_iterator find(const value_type &value) const {
-        auto it = std::lower_bound(data.begin(), data.end(), value, comp);
-        if (it != data.end() && comp(value, *it)) {
-            it = data.end();
+        auto it = std::lower_bound(data().begin(), data().end(), value, comp());
+        if (it != data().end() && comp()(value, *it)) {
+            it = data().end();
         }
         return const_iterator(it);
     }

     // Observers.

-    key_compare key_comp() const {
-        return comp;
-    }
-
     value_compare value_comp() const {
-        return comp;
+        return comp();
     }

-    // Operators.
+    // Operators. All others provided by ue2::totally_ordered.
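The ue2::totally_ordered base referenced here comes from the new util/operators.h header, which is not itself shown in this diff. Under the assumption that it follows the classic CRTP pattern (as boost::totally_ordered, which it replaces elsewhere in this patch, does), a sketch of the idiom that lets a class define only operator== and operator<:

    // Sketch of the totally_ordered idiom: the base synthesises the other
    // four comparisons from the two the derived class provides.
    template <typename T>
    struct totally_ordered {
        friend bool operator!=(const T &a, const T &b) { return !(a == b); }
        friend bool operator>(const T &a, const T &b) { return b < a; }
        friend bool operator<=(const T &a, const T &b) { return !(b < a); }
        friend bool operator>=(const T &a, const T &b) { return !(a < b); }
    };

    struct version : totally_ordered<version> {
        int major = 0, minor = 0;
        friend bool operator==(const version &a, const version &b) {
            return a.major == b.major && a.minor == b.minor;
        }
        friend bool operator<(const version &a, const version &b) {
            return a.major != b.major ? a.major < b.major : a.minor < b.minor;
        }
    };

The friend operators are found by argument-dependent lookup because the base is a base class of the argument type. This is why the hunks below can delete the hand-written !=, <=, > and >= operators from the flat containers.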
     bool operator==(const flat_set &a) const {
-        return data == a.data;
-    }
-    bool operator!=(const flat_set &a) const {
-        return data != a.data;
+        return data() == a.data();
     }
     bool operator<(const flat_set &a) const {
-        return data < a.data;
-    }
-    bool operator<=(const flat_set &a) const {
-        return data <= a.data;
-    }
-    bool operator>(const flat_set &a) const {
-        return data > a.data;
-    }
-    bool operator>=(const flat_set &a) const {
-        return data >= a.data;
+        return data() < a.data();
     }

     // Free swap function for ADL.
     friend void swap(flat_set &a, flat_set &b) {
         a.swap(b);
     }
+
+    // Free hash function.
+    friend size_t hash_value(const flat_set &a) {
+        return boost::hash_range(a.begin(), a.end());
+    }
 };

 /**
  * \brief Map container implemented internally as a sorted vector. Use this
- * rather than std::map for small sets as it's faster, uses less memory and
+ * rather than std::map for small maps as it's faster, uses less memory and
  * incurs less malloc time.
 *
 * Note: we used to use boost::flat_map, but have run into problems with all
@@ -336,7 +360,9 @@ class flat_set {
 */
 template <class Key, class T, class Compare = std::less<Key>,
          class Allocator = std::allocator<std::pair<Key, T>>>
-class flat_map {
+class flat_map
+    : public flat_detail::flat_base<std::pair<Key, T>, Compare, Allocator>,
+      public totally_ordered<flat_map<Key, T, Compare, Allocator>> {
 public:
     // Member types.
     using key_type = Key;
@@ -344,28 +370,29 @@ class flat_map {
     using value_type = std::pair<Key, T>;

 private:
-    // Underlying storage is a sorted std::vector.
-    using storage_type = std::pair<Key, T>;
-    using StorageT = std::vector<storage_type, Allocator>;
-
-    Compare comp;
-    StorageT data;
+    using base_type =
+        flat_detail::flat_base<std::pair<Key, T>, Compare, Allocator>;
+    using keyval_storage_type = std::pair<key_type, mapped_type>;
+    using storage_type = typename base_type::storage_type;
+    using base_type::data;
+    using base_type::comp;

 public:
     // More Member types.
-    using size_type = typename StorageT::size_type;
-    using difference_type = typename StorageT::difference_type;
-    using key_compare = Compare;
+    using size_type = typename storage_type::size_type;
+    using difference_type = typename storage_type::difference_type;
+    using key_compare = typename base_type::key_compare;
     using allocator_type = Allocator;
     using reference = value_type &;
     using const_reference = const value_type &;
-    using pointer = typename std::allocator_traits<Allocator>::pointer;
-    using const_pointer = typename std::allocator_traits<Allocator>::const_pointer;
+    using allocator_traits_type = typename std::allocator_traits<Allocator>;
+    using pointer = typename allocator_traits_type::pointer;
+    using const_pointer = typename allocator_traits_type::const_pointer;

 public:
     using const_iterator =
-        flat_detail::iter_wrapper<typename StorageT::const_iterator,
-                                  const value_type>;
+        flat_detail::iter_wrapper<typename storage_type::const_iterator,
+                                  const keyval_storage_type>;

     using const_reverse_iterator = std::reverse_iterator<const_iterator>;
@@ -377,19 +404,19 @@ class flat_map {
     flat_map(const Compare &compare = Compare(),
              const Allocator &alloc = Allocator())
-        : comp(compare), data(alloc) {}
+        : base_type(compare, alloc) {}

     template <class InputIt>
     flat_map(InputIt first, InputIt last, const Compare &compare = Compare(),
              const Allocator &alloc = Allocator())
-        : comp(compare), data(alloc) {
+        : flat_map(compare, alloc) {
         insert(first, last);
     }

     flat_map(std::initializer_list<value_type> init,
              const Compare &compare = Compare(),
              const Allocator &alloc = Allocator())
-        : comp(compare), data(alloc) {
+        : flat_map(compare, alloc) {
         insert(init.begin(), init.end());
     }
@@ -398,18 +425,12 @@ class flat_map {
     flat_map &operator=(const flat_map &) = default;
     flat_map &operator=(flat_map &&) = default;

-    // Other members.
-
-    allocator_type get_allocator() const {
-        return data.get_allocator();
-    }
-
     // Iterators.
-    const_iterator cbegin() const { return const_iterator(data.cbegin()); }
+    const_iterator cbegin() const { return const_iterator(data().cbegin()); }
     const_iterator begin() const { return cbegin(); }

-    const_iterator cend() const { return const_iterator(data.cend()); }
+    const_iterator cend() const { return const_iterator(data().cend()); }
     const_iterator end() const { return cend(); }

     const_reverse_iterator crbegin() const {
@@ -422,61 +443,55 @@ class flat_map {
     }
     const_reverse_iterator rend() const { return crend(); }

-    // Capacity.
-
-    bool empty() const { return data.empty(); }
-    size_t size() const { return data.size(); }
-    size_t max_size() const { return data.max_size(); }
-
 private:
-    using storage_iterator = typename StorageT::iterator;
-    using storage_const_iterator = typename StorageT::const_iterator;
+    using storage_iterator = typename storage_type::iterator;
+    using storage_const_iterator = typename storage_type::const_iterator;

     storage_iterator data_lower_bound(const key_type &key) {
         return std::lower_bound(
-            data.begin(), data.end(), key,
-            [&](const storage_type &elem, const key_type &k) {
-                return comp(elem.first, k);
+            data().begin(), data().end(), key,
+            [&](const keyval_storage_type &elem, const key_type &k) {
+                return comp()(elem.first, k);
             });
     }

     storage_const_iterator data_lower_bound(const key_type &key) const {
         return std::lower_bound(
-            data.begin(), data.end(), key,
-            [&](const storage_type &elem, const key_type &k) {
-                return comp(elem.first, k);
+            data().begin(), data().end(), key,
+            [&](const keyval_storage_type &elem, const key_type &k) {
+                return comp()(elem.first, k);
             });
     }

     std::pair<storage_iterator, bool> data_insert(const value_type &value) {
         auto it = data_lower_bound(value.first);
-        if (it == data.end() || comp(value.first, it->first)) {
-            return std::make_pair(data.insert(it, value), true);
+        if (it == data().end() || comp()(value.first, it->first)) {
+            return std::make_pair(data().insert(it, value), true);
         }
         return std::make_pair(it, false);
     }

     std::pair<storage_iterator, bool> data_insert(value_type &&value) {
         auto it = data_lower_bound(value.first);
-        if (it == data.end() || comp(value.first, it->first)) {
-            return std::make_pair(data.insert(it, std::move(value)), true);
+        if (it == data().end() || comp()(value.first, it->first)) {
+            return std::make_pair(data().insert(it, std::move(value)), true);
         }
         return std::make_pair(it, false);
     }

     storage_iterator data_find(const key_type &key) {
         auto it = data_lower_bound(key);
-        if (it != data.end() && comp(key, it->first)) {
-            it = data.end();
+        if (it != data().end() && comp()(key, it->first)) {
+            it = data().end();
         }
         return it;
     }

     storage_const_iterator data_find(const key_type &key) const {
         auto it = data_lower_bound(key);
-        if (it != data.end() && comp(key, it->first)) {
-            it = data.end();
+        if (it != data().end() && comp()(key, it->first)) {
+            it = data().end();
         }
         return it;
     }
@@ -484,10 +499,6 @@ class flat_map {
 public:
     // Modifiers.

-    void clear() {
-        data.clear();
-    }
-
     std::pair<iterator, bool> insert(const value_type &value) {
         auto rv = data_insert(value);
         return std::make_pair(iterator(rv.first), rv.second);
     }
@@ -514,17 +525,12 @@ class flat_map {
         return insert(value_type(std::forward<Args>(args)...));
     }

-    void erase(iterator pos) {
-        // Convert to a non-const storage iterator via pointer arithmetic.
-        storage_iterator it = data.begin() + distance(begin(), pos);
-        data.erase(it);
+    void erase(const_iterator pos) {
+        data().erase(pos.get());
     }

-    void erase(iterator first, iterator last) {
-        // Convert to a non-const storage iterator via pointer arithmetic.
-        storage_iterator data_first = data.begin() + distance(begin(), first);
-        storage_iterator data_last = data.begin() + distance(begin(), last);
-        data.erase(data_first, data_last);
+    void erase(const_iterator first, const_iterator last) {
+        data().erase(first.get(), last.get());
     }

     void erase(const key_type &key) {
@@ -534,12 +540,6 @@ class flat_map {
         }
     }

-    void swap(flat_map &a) {
-        using std::swap;
-        swap(comp, a.comp);
-        swap(data, a.data);
-    }
-
     // Lookup.

     size_type count(const key_type &key) const {
@@ -554,7 +554,7 @@ class flat_map {

     mapped_type &at(const key_type &key) {
         auto it = data_find(key);
-        if (it == data.end()) {
+        if (it == data().end()) {
             throw std::out_of_range("element not found");
         }
         return it->second;
@@ -562,7 +562,7 @@ class flat_map {

     const mapped_type &at(const key_type &key) const {
         auto it = data_find(key);
-        if (it == data.end()) {
+        if (it == data().end()) {
             throw std::out_of_range("element not found");
         }
         return it->second;
@@ -575,35 +575,39 @@ class flat_map {

     // Observers.

-    key_compare key_comp() const {
-        return comp;
+    class value_compare {
+        friend class flat_map;
+    protected:
+        Compare c;
+        value_compare(Compare c_in) : c(c_in) {}
+    public:
+        bool operator()(const value_type &lhs, const value_type &rhs) {
+            return c(lhs.first, rhs.first);
+        }
+    };
+
+    value_compare value_comp() const {
+        return value_compare(comp());
     }

-    // Operators.
+    // Operators. All others provided by ue2::totally_ordered.

     bool operator==(const flat_map &a) const {
-        return data == a.data;
-    }
-    bool operator!=(const flat_map &a) const {
-        return data != a.data;
+        return data() == a.data();
     }
     bool operator<(const flat_map &a) const {
-        return data < a.data;
-    }
-    bool operator<=(const flat_map &a) const {
-        return data <= a.data;
-    }
-    bool operator>(const flat_map &a) const {
-        return data > a.data;
-    }
-    bool operator>=(const flat_map &a) const {
-        return data >= a.data;
+        return data() < a.data();
     }

     // Free swap function for ADL.
     friend void swap(flat_map &a, flat_map &b) {
         a.swap(b);
     }
+
+    // Free hash function.
+    friend size_t hash_value(const flat_map &a) {
+        return boost::hash_range(a.begin(), a.end());
+    }
 };

 } // namespace
diff --git a/src/util/ue2_graph.h b/src/util/ue2_graph.h
index 9634b0322..138d7467d 100644
--- a/src/util/ue2_graph.h
+++ b/src/util/ue2_graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -31,8 +31,9 @@
 #include "ue2common.h"
 #include "util/graph_range.h"
+#include "util/noncopyable.h"
+#include "util/operators.h"

-#include <boost/core/noncopyable.hpp>
 #include <boost/functional/hash.hpp>
 #include <boost/graph/properties.hpp> /* vertex_index_t, ... */
 #include <boost/pending/property.hpp> /* no_property */
@@ -156,7 +157,7 @@ namespace ue2 {

 namespace graph_detail {

-class graph_base : boost::noncopyable {
+class graph_base : noncopyable {
 };

 struct default_edge_property {
@@ -292,7 +293,7 @@ class ue2_graph : graph_detail::graph_base {
     using vertex_bundled = VertexPropertyType;
     using edge_bundled = EdgePropertyType;

-    class vertex_descriptor : boost::totally_ordered<vertex_descriptor> {
+    class vertex_descriptor : totally_ordered<vertex_descriptor> {
     public:
         vertex_descriptor() : p(nullptr), serial(0) { }
         explicit vertex_descriptor(vertex_node *pp)
@@ -324,7 +325,7 @@ class ue2_graph : graph_detail::graph_base {
         friend ue2_graph;
     };

-    class edge_descriptor : boost::totally_ordered<edge_descriptor> {
+    class edge_descriptor : totally_ordered<edge_descriptor> {
     public:
         edge_descriptor() : p(nullptr), serial(0) { }
         explicit edge_descriptor(edge_node *pp) : p(pp), serial(pp->serial) { }
diff --git a/src/util/ue2string.h b/src/util/ue2string.h
index 08b6a5442..a90d47a35 100644
--- a/src/util/ue2string.h
+++ b/src/util/ue2string.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -35,6 +35,7 @@
 #include "ue2common.h"
 #include "util/charreach.h"
+#include "util/hash.h"

 #include <string>
 #include <vector>
@@ -206,6 +207,13 @@ struct ue2_literal {
     std::vector<bool> nocase; /* for trolling value */
 };

+inline
+size_t hash_value(const ue2_literal::elem &elem) {
+    return hash_all(elem.c, elem.nocase);
+}
+
+inline
+size_t hash_value(const ue2_literal &lit) { return hash_range(lit); }

 /// Return a reversed copy of this literal.
 ue2_literal reverse_literal(const ue2_literal &in);
diff --git a/src/util/verify_types.h b/src/util/verify_types.h
index 98c24c997..5833d5ec6 100644
--- a/src/util/verify_types.h
+++ b/src/util/verify_types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -30,45 +30,59 @@
 #define UTIL_VERIFY_TYPES

 #include "ue2common.h"
+#include "util/compile_error.h"

 #include <cassert>
+#include <type_traits>

 namespace ue2 {

-template <typename Int_T>
-static UNUSED u8 verify_u8(Int_T val) {
-    assert(val == (Int_T)((u8)val)); // there and back again
-    return (u8)(val);
+template <typename To_T, typename From_T>
+To_T verify_cast(From_T val) {
+    static_assert(std::is_integral<To_T>::value,
+                  "Output type must be integral.");
+    static_assert(std::is_integral<From_T>::value ||
+                  std::is_enum<From_T>::value ||
+                  std::is_convertible<From_T, To_T>::value,
+                  "Must be integral or enum type, or convertible to output.");
+
+    To_T conv_val = static_cast<To_T>(val);
+    if (static_cast<From_T>(conv_val) != val) {
+        assert(0);
+        throw ResourceLimitError();
+    }
+
+    return conv_val;
+}
+
+template <typename T>
+s8 verify_s8(T val) {
+    return verify_cast<s8>(val);
 }

-template <typename Int_T>
-static UNUSED s8 verify_s8(Int_T val) {
-    assert(val == (Int_T)((s8)val)); // there and back again
-    return (s8)(val);
+template <typename T>
+u8 verify_u8(T val) {
+    return verify_cast<u8>(val);
 }

-template <typename Int_T>
-static UNUSED s16 verify_s16(Int_T val) {
-    assert(val == (Int_T)((s16)val)); // there and back again
-    return (s16)(val);
+template <typename T>
+s16 verify_s16(T val) {
+    return verify_cast<s16>(val);
 }

-template <typename Int_T>
-static UNUSED u16 verify_u16(Int_T val) {
-    assert(val == (Int_T)((u16)val)); // there and back again
-    return (u16)(val);
+template <typename T>
+u16 verify_u16(T val) {
+    return verify_cast<u16>(val);
 }

-template <typename Int_T>
-static UNUSED s32 verify_s32(Int_T val) {
-    assert(val == (Int_T)((s32)val)); // there and back again
-    return (s32)(val);
+template <typename T>
+s32 verify_s32(T val) {
+    return verify_cast<s32>(val);
 }

-template <typename Int_T>
-static UNUSED u32 verify_u32(Int_T val) {
-    assert(val == (Int_T)((u32)val)); // there and back again
-    return (u32)(val);
+template <typename T>
+u32 verify_u32(T val) {
+    return verify_cast<u32>(val);
 }

 } // namespace ue2
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 049fd3681..61bb00f20 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,3 +1,6 @@
+if (WIN32)
+    return()
+endif()
 find_package(Threads)

 # remove some warnings
diff --git a/tools/hsbench/CMakeLists.txt b/tools/hsbench/CMakeLists.txt
index 25a833d08..9b2cde4db 100644
--- a/tools/hsbench/CMakeLists.txt
+++ b/tools/hsbench/CMakeLists.txt
@@ -1,4 +1,8 @@
 include (${CMAKE_MODULE_PATH}/sqlite3.cmake)
+if (NOT SQLITE3_FOUND)
+    message(STATUS "sqlite3 not found, not building hsbench")
+    return()
+endif()

 if (NOT XCODE)
     include_directories(SYSTEM ${SQLITE3_INCLUDE_DIRS})
@@ -7,6 +11,18 @@ else()
     set(EXTRA_CXX_FLAGS "${EXTRA_CXX_FLAGS} -isystem ${SQLITE3_INCLUDE_DIRS}")
 endif()

+# BSD has the _np funcs in a _np header
+CHECK_INCLUDE_FILE_CXX(pthread_np.h HAVE_PTHREAD_NP_H)
+if (HAVE_PTHREAD_NP_H)
+    set (PTHREAD_NP_INC pthread_np.h)
+else ()
+    set (PTHREAD_NP_INC pthread.h)
+endif ()
+
+set (CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -D_GNU_SOURCE")
+set (CMAKE_REQUIRED_LIBRARIES pthread)
+CHECK_CXX_SYMBOL_EXISTS(pthread_setaffinity_np ${PTHREAD_NP_INC} HAVE_DECL_PTHREAD_SETAFFINITY_NP)
+
 CHECK_FUNCTION_EXISTS(malloc_info HAVE_MALLOC_INFO)
 CHECK_FUNCTION_EXISTS(shmget HAVE_SHMGET)
 set(HAVE_SHMGET ${HAVE_SHMGET} CACHE BOOL "shmget()")
diff --git a/tools/hsbench/common.h b/tools/hsbench/common.h
index a4d60021a..efff3f99d 100644
--- a/tools/hsbench/common.h
+++ b/tools/hsbench/common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -38,5 +38,7 @@ extern bool saveDatabases;
 extern bool loadDatabases;
 extern std::string serializePath;
 extern unsigned int somPrecisionMode;
+extern bool forceEditDistance;
+extern unsigned editDistance;

 #endif // COMMON_H
diff --git a/tools/hsbench/data_corpus.cpp b/tools/hsbench/data_corpus.cpp
index 55bfe93af..8e761ec34 100644
--- a/tools/hsbench/data_corpus.cpp
+++ b/tools/hsbench/data_corpus.cpp
@@ -110,7 +110,7 @@ vector readCorpus(const string &filename) {
     if (status != SQLITE_DONE) {
         ostringstream oss;
         oss << "Error retrieving blocks from corpus: "
-            << sqlite3_errstr(status);
+            << sqlite3_errmsg(db);
         status = sqlite3_finalize(statement);
         assert(status == SQLITE_OK);
diff --git a/tools/hsbench/engine_hyperscan.cpp b/tools/hsbench/engine_hyperscan.cpp
index f5abb9faf..9674e5c84 100644
--- a/tools/hsbench/engine_hyperscan.cpp
+++ b/tools/hsbench/engine_hyperscan.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -36,7 +36,6 @@
 #include "huge.h"
 #include "timer.h"

-#include "crc32.h"
 #include "database.h"
 #include "hs_compile.h"
 #include "hs_internal.h"
@@ -46,12 +45,15 @@
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include <boost/crc.hpp>
+
 using namespace std;

EngineContext::EngineContext(const
hs_database_t *db) { @@ -230,11 +232,13 @@ string dbSettingsHash(const string &filename, u32 mode) { string info = info_oss.str(); - u32 crc = Crc32c_ComputeBuf(0, info.data(), info.size()); + boost::crc_32_type crc; + + crc.process_bytes(info.data(), info.length()); // return STL string with printable version of digest ostringstream oss; - oss << hex << setw(8) << setfill('0') << crc << dec; + oss << hex << setw(8) << setfill('0') << crc.checksum() << dec; return oss.str(); } @@ -289,6 +293,10 @@ buildEngineHyperscan(const ExpressionMap &expressions, ScanMode scan_mode, m.first); return nullptr; } + if (forceEditDistance) { + extparam.flags |= HS_EXT_FLAG_EDIT_DISTANCE; + extparam.edit_distance = editDistance; + } exprs.push_back(expr); ids.push_back(m.first); diff --git a/tools/hsbench/main.cpp b/tools/hsbench/main.cpp index 4298963b9..3153737ee 100644 --- a/tools/hsbench/main.cpp +++ b/tools/hsbench/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -56,6 +56,9 @@ #include #ifndef _WIN32 #include +#if defined(HAVE_PTHREAD_NP_H) +#include +#endif #include #endif @@ -72,6 +75,8 @@ bool saveDatabases = false; bool loadDatabases = false; string serializePath(""); unsigned int somPrecisionMode = HS_MODE_SOM_HORIZON_LARGE; +bool forceEditDistance = false; +unsigned editDistance = 0; namespace /* anonymous */ { @@ -120,7 +125,11 @@ class ThreadContext : boost::noncopyable { // Apply processor affinity (if available) to this thread. bool affine(UNUSED int cpu) { #ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP +#if defined(__linux__) cpu_set_t cpuset; +#else // BSD + cpuset_t cpuset; +#endif CPU_ZERO(&cpuset); assert(cpu >= 0 && cpu < CPU_SETSIZE); @@ -164,11 +173,15 @@ void usage(const char *error) { " (default: streaming).\n"); printf(" -V Benchmark in vectored mode" " (default: streaming).\n"); +#ifdef HAVE_DECL_PTHREAD_SETAFFINITY_NP printf(" -T CPU,CPU,... Benchmark with threads on these CPUs.\n"); +#endif printf(" -i DIR Don't compile, load from files in DIR" " instead.\n"); printf(" -w DIR After compiling, save to files in DIR.\n"); printf(" -d NUMBER Set SOM precision mode (default: 8 (large)).\n"); + printf(" -E DISTANCE Match all patterns within edit distance" + " DISTANCE.\n"); printf("\n"); printf(" --per-scan Display per-scan Mbit/sec results.\n"); printf(" --echo-matches Display all matches that occur during scan.\n"); @@ -190,8 +203,12 @@ struct BenchmarkSigs { /** Process command-line arguments. Prints usage and exits on error. 
*/
static
void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
-                UNUSED Grey &grey) {
-    const char options[] = "-b:c:Cd:e:G:hi:n:No:p:sT:Vw:z:";
+                UNUSED unique_ptr<Grey> &grey) {
+    const char options[] = "-b:c:Cd:e:E:G:hi:n:No:p:sVw:z:"
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+        "T:" // add the thread flag
+#endif
+    ;
     int in_sigfile = 0;
     int do_per_scan = 0;
     int do_echo_matches = 0;
@@ -237,9 +254,17 @@ void processArgs(int argc, char *argv[], vector<BenchmarkSigs> &sigSets,
         case 'e':
             exprPath.assign(optarg);
             break;
+        case 'E':
+            if (!fromString(optarg, editDistance)) {
+                usage("Couldn't parse argument to -E flag, should be"
+                      " a non-negative integer.");
+                exit(1);
+            }
+            forceEditDistance = true;
+            break;
 #ifndef RELEASE_BUILD
         case 'G':
-            applyGreyOverrides(&grey, string(optarg));
+            applyGreyOverrides(grey.get(), string(optarg));
             break;
 #endif
         case 'h':
@@ -585,6 +610,17 @@ void displayPerScanResults(const vector<unique_ptr<ThreadContext>> &threads,
     printf("\n");
 }

+static
+double fastestResult(const vector<unique_ptr<ThreadContext>> &threads) {
+    double best = threads[0]->results[0].seconds;
+    for (const auto &t : threads) {
+        for (const auto &r : t->results) {
+            best = min(best, r.seconds);
+        }
+    }
+    return best;
+}
+
 static
 u64a byte_size(const vector<DataBlock> &corpus_blocks) {
     u64a total = 0;
@@ -638,8 +674,12 @@ void displayResults(const vector<unique_ptr<ThreadContext>> &threads,
     double blockRate = (double)totalBlocks / (double)totalSecs;
     printf("Overall block rate: %'0.2f blocks/sec\n", blockRate);
-    printf("Overall throughput: %'0.2Lf Mbit/sec\n",
+    printf("Mean throughput: %'0.2Lf Mbit/sec\n",
           calc_mbps(totalSecs, totalBytes));
+
+    double lowestScanTime = fastestResult(threads);
+    printf("Maximum throughput: %'0.2Lf Mbit/sec\n",
+           calc_mbps(lowestScanTime, bytesPerRun));
     printf("\n");

     if (display_per_scan) {
@@ -723,8 +763,10 @@ void runBenchmark(const EngineHyperscan &db,

 /** Main driver. */
 int main(int argc, char *argv[]) {
-    Grey grey;
-
+    unique_ptr<Grey> grey;
+#if !defined(RELEASE_BUILD)
+    grey = make_unique<Grey>();
+#endif
     setlocale(LC_ALL, ""); // use the user's locale

 #ifndef NDEBUG
@@ -742,6 +784,7 @@ int main(int argc, char *argv[]) {
     // known expressions together.
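The new Mean/Maximum throughput split above is plain arithmetic. A sketch of the megabit computation (this mirrors what hsbench's calc_mbps computes, but the helper below is illustrative only, not the tool's actual code):

    #include <cstdint>

    // Mbit/sec = (bytes * 8 bits) / (seconds * 10^6). "Mean" feeds total
    // bytes over total elapsed seconds; "Maximum" feeds the bytes of one
    // run over the single fastest run's time, as found by fastestResult().
    static double mbit_per_sec(double seconds, std::uint64_t bytes) {
        return (static_cast<double>(bytes) * 8.0) / (seconds * 1000000.0);
    }

    int main() {
        // e.g. one 2 MB corpus run whose fastest scan took 4 ms:
        double best = mbit_per_sec(0.004, 2000000); // = 4000 Mbit/sec
        return best > 0 ? 0 : 1;
    }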
if (sigSets.empty()) { SignatureSet sigs; + sigs.reserve(exprMapTemplate.size()); for (auto i : exprMapTemplate | map_keys) { sigs.push_back(i); } @@ -758,14 +801,12 @@ int main(int argc, char *argv[]) { } for (const auto &s : sigSets) { - ExpressionMap exprMap = exprMapTemplate; // copy - - limitBySignature(exprMap, s.sigs); + auto exprMap = limitToSignatures(exprMapTemplate, s.sigs); if (exprMap.empty()) { continue; } - auto engine = buildEngineHyperscan(exprMap, scan_mode, s.name, grey); + auto engine = buildEngineHyperscan(exprMap, scan_mode, s.name, *grey); if (!engine) { printf("Error: expressions failed to compile.\n"); exit(1); diff --git a/tools/hsbench/scripts/CorpusBuilder.py b/tools/hsbench/scripts/CorpusBuilder.py index 5baed2bd5..da2d593f5 100755 --- a/tools/hsbench/scripts/CorpusBuilder.py +++ b/tools/hsbench/scripts/CorpusBuilder.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python ''' A module to construct corpora databases for the Hyperscan benchmarker diff --git a/tools/hsbench/scripts/gutenbergCorpus.py b/tools/hsbench/scripts/gutenbergCorpus.py index fa1b1570d..62752a4d2 100755 --- a/tools/hsbench/scripts/gutenbergCorpus.py +++ b/tools/hsbench/scripts/gutenbergCorpus.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python ''' This script creates a Hyperscan benchmarking corpus database from a supplied diff --git a/tools/hsbench/scripts/linebasedCorpus.py b/tools/hsbench/scripts/linebasedCorpus.py index bde20e398..b27f8674f 100755 --- a/tools/hsbench/scripts/linebasedCorpus.py +++ b/tools/hsbench/scripts/linebasedCorpus.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python ''' Simple script to take a file full of lines of text and push them into a diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index 8b4944447..a7658b26a 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -30,12 +30,41 @@ if(CMAKE_COMPILER_IS_GNUCC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-array-bounds") endif() -add_library(gtest STATIC ${gtest_SOURCES}) - add_definitions(-DGTEST_HAS_PTHREAD=0 -DSRCDIR=${PROJECT_SOURCE_DIR}) +set(unit_hyperscan_SOURCES + ${gtest_SOURCES} + hyperscan/allocators.cpp + hyperscan/arg_checks.cpp + hyperscan/bad_patterns.cpp + hyperscan/bad_patterns.txt + hyperscan/behaviour.cpp + hyperscan/expr_info.cpp + hyperscan/extparam.cpp + hyperscan/identical.cpp + hyperscan/main.cpp + hyperscan/multi.cpp + hyperscan/order.cpp + hyperscan/scratch_op.cpp + hyperscan/scratch_in_use.cpp + hyperscan/serialize.cpp + hyperscan/single.cpp + hyperscan/som.cpp + hyperscan/stream_op.cpp + hyperscan/test_util.cpp + hyperscan/test_util.h + ) +add_executable(unit-hyperscan ${unit_hyperscan_SOURCES}) +if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) +target_link_libraries(unit-hyperscan hs_shared expressionutil) +else() +target_link_libraries(unit-hyperscan hs expressionutil) +endif() + + if (NOT (RELEASE_BUILD OR FAT_RUNTIME)) set(unit_internal_SOURCES + ${gtest_SOURCES} internal/bitfield.cpp internal/bitutils.cpp internal/charreach.cpp @@ -52,8 +81,6 @@ set(unit_internal_SOURCES internal/limex_nfa.cpp internal/masked_move.cpp internal/multi_bit.cpp - internal/multiaccel_matcher.cpp - internal/multiaccel_shift.cpp internal/nfagraph_common.h internal/nfagraph_comp.cpp internal/nfagraph_equivalence.cpp @@ -85,40 +112,13 @@ set(unit_internal_SOURCES internal/util_string.cpp internal/vermicelli.cpp internal/main.cpp -) + ) add_executable(unit-internal ${unit_internal_SOURCES}) -target_link_libraries(unit-internal hs gtest corpusomatic) 
+set_target_properties(unit-internal PROPERTIES COMPILE_FLAGS "${HS_CXX_FLAGS}") +target_link_libraries(unit-internal hs corpusomatic) endif(NOT (RELEASE_BUILD OR FAT_RUNTIME)) -set(unit_hyperscan_SOURCES - hyperscan/allocators.cpp - hyperscan/arg_checks.cpp - hyperscan/bad_patterns.cpp - hyperscan/bad_patterns.txt - hyperscan/behaviour.cpp - hyperscan/expr_info.cpp - hyperscan/extparam.cpp - hyperscan/identical.cpp - hyperscan/main.cpp - hyperscan/multi.cpp - hyperscan/order.cpp - hyperscan/scratch_op.cpp - hyperscan/scratch_in_use.cpp - hyperscan/serialize.cpp - hyperscan/single.cpp - hyperscan/som.cpp - hyperscan/stream_op.cpp - hyperscan/test_util.cpp - hyperscan/test_util.h - ) -add_executable(unit-hyperscan ${unit_hyperscan_SOURCES}) -if (BUILD_STATIC_AND_SHARED OR BUILD_SHARED_LIBS) -target_link_libraries(unit-hyperscan hs_shared gtest expressionutil) -else() -target_link_libraries(unit-hyperscan hs gtest expressionutil) -endif() - # # build target to run unit tests # diff --git a/unit/hyperscan/bad_patterns.txt b/unit/hyperscan/bad_patterns.txt index 1a33210d1..3d6d9db90 100644 --- a/unit/hyperscan/bad_patterns.txt +++ b/unit/hyperscan/bad_patterns.txt @@ -70,8 +70,8 @@ 70:/foo[^\x00-\xff]/ #Pattern can never match. 71:/foo[^\x00-\xff]$/ #Pattern can never match. 72:/\Bd\B/i{min_length=2,min_offset=4,max_offset=54} #Expression has min_length=2 but can only produce matches of length 1 bytes at most. -73:/(((.|aaa)aaaaaa.aaa){14,19}a((a|a{5,6}|aa){3,11}|aa.|a){2}){4}\Z/sm #Pattern is too large. -74:/(((.|aaa)aaaaaa.aaa){14,19}a((a|a{5,6}|aa){3,11}|aa.|a){2}){4}\Z/smL #Pattern is too large. +73:/(((.|aaa)aaaaaa.aaa){14,19}a((a|a{5,6}|aa){3,11}|aa.|a){2}){40}\Z/sm #Pattern is too large. +74:/(((.|aaa)aaaaaa.aaa){14,19}a((a|a{5,6}|aa){3,11}|aa.|a){2}){40}\Z/smL #Pattern is too large. 75:/\B/s8{min_length=1} #Expression has min_length=1 but can only produce matches of length 0 bytes at most. 76:/(f|d|(\b)|i|a\Z)/mHV8{min_length=2,min_offset=9,max_offset=14} #Expression has min_length=2 but can only produce matches of length 1 bytes at most. 77:/(f|e|d{19,}|h\Z|^j|\Aa)/smi{min_length=7,min_offset=8,max_offset=18} #Extended parameter constraints can not be satisfied for any match from this expression. @@ -90,9 +90,9 @@ 91:/a\owibble/ #Value in \o{...} sequence is non-octal or missing braces at index 1. 92:/a\o{wibble/ #Value in \o{...} sequence is non-octal or missing braces at index 1. 93:/a\o{777}/ #Value in \o{...} sequence is too large at index 1. -94:/(*UTF16)foo/ #(*UTF16) not supported at index 2. -95:/(*BSR_UNICODE)abc/ #Unknown control verb at index 2. -96:/a+(*SKIP)b/ #Unknown control verb at index 4. +94:/(*UTF16)foo/ #Unsupported control verb (*UTF16) at index 0. +95:/(*BSR_UNICODE)abc/ #Unsupported control verb (*BSR_UNICODE) at index 0. +96:/a+(*SKIP)b/ #Unknown control verb (*SKIP) at index 2. 97:/foo(*/ #Invalid repeat at index 4. 98:/[:\]:]/ #POSIX named classes are only supported inside a class at index 0. 99:/[[:[:]/ #Invalid POSIX named class at index 1. @@ -130,3 +130,15 @@ 133:/[a[.\].]]/ #Unsupported POSIX collating element at index 2. 134:/[a[=\]=]]/ #Unsupported POSIX collating element at index 2. 135:/[^\D\d]/8W #Pattern can never match. +136:/(*LIMIT_MATCH=1000)foobar/ #Unsupported control verb (*LIMIT_MATCH=1000) at index 0. +137:/(*UTF32)foobar/ #Unsupported control verb (*UTF32) at index 0. +138:/(*UNKNOWNVERB)foobar/ #Unknown control verb (*UNKNOWNVERB) at index 0. +139:/foo(*UTF8)bar/ #(*UTF8) must be at start of expression, encountered at index 5. 
+140:/(?i)(*UTF8)foobar/ #(*UTF8) must be at start of expression, encountered at index 6. +141:/(*@&/ #Unknown control verb at index 2. +142:/abcd/si{edit_distance=4} #Approximate matching patterns that reduce to vacuous patterns are disallowed. +143:/foobar|hatstand/sL{edit_distance=6} #Approximate matching patterns that reduce to vacuous patterns are disallowed. +144:/abc\b/{edit_distance=1} #Zero-width assertions are disallowed for approximate matching. +145:/abc/8{edit_distance=1} #UTF-8 is disallowed for approximate matching. +146:/(*UTF8)abc/{edit_distance=1} #UTF-8 is disallowed for approximate matching. +147:/\b\BMYBt/s{edit_distance=1} #Pattern can never match. diff --git a/unit/hyperscan/expr_info.cpp b/unit/hyperscan/expr_info.cpp index 984104c55..7cc6abd7f 100644 --- a/unit/hyperscan/expr_info.cpp +++ b/unit/hyperscan/expr_info.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -51,6 +51,53 @@ struct expected_info { char matches_only_at_eod; }; +ostream& operator<<(ostream &os, const hs_expr_ext &ext) { + if (!ext.flags) { + return os; + } + bool first = true; + if (ext.flags & HS_EXT_FLAG_MIN_OFFSET) { + if (!first) { + os << ", "; + } + os << "min_offset=" << ext.min_offset; + first = false; + } + if (ext.flags & HS_EXT_FLAG_MAX_OFFSET) { + if (!first) { + os << ", "; + } + os << "max_offset=" << ext.max_offset; + first = false; + } + if (ext.flags & HS_EXT_FLAG_MIN_LENGTH) { + if (!first) { + os << ", "; + } + os << "min_length=" << ext.min_length; + first = false; + } + if (ext.flags & HS_EXT_FLAG_EDIT_DISTANCE) { + if (!first) { + os << ", "; + } + os << "edit_distance=" << ext.edit_distance; + first = false; + } + return os; +} + +// For Google Test. +void PrintTo(const expected_info &ei, ostream *os) { + *os << "expected_info: " + << "pattern=\"" << ei.pattern << "\"" + << ", ext={" << ei.ext << "}" + << ", min=" << ei.min << ", max=" << ei.max + << ", unordered_matches=" << (ei.unordered_matches ? 1 : 0) + << ", matches_at_eod=" << (ei.matches_at_eod ? 1 : 0) + << ", matches_only_at_eod=" << (ei.matches_only_at_eod ? 1 : 0); +} + class ExprInfop : public TestWithParam { }; @@ -124,7 +171,7 @@ TEST_P(ExprInfop, check_ext_null) { free(info); } -static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0 }; +static const hs_expr_ext NO_EXT_PARAM = { 0, 0, 0, 0, 0 }; static const expected_info ei_test[] = { {"abc", NO_EXT_PARAM, 3, 3, 0, 0, 0}, @@ -167,10 +214,38 @@ static const expected_info ei_test[] = { {"(foo|bar)\\z", NO_EXT_PARAM, 3, 3, 0, 1, 1}, // Some cases with extended parameters. 
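The edit_distance entries below exercise the new approximate matching support and show how it narrows or widens the expression's match-width bounds. For context, this is how a caller requests approximate matching through the extended-parameter API (a sketch, with error handling reduced to the minimum):

    #include <cstring>
    #include <hs.h>

    // Compile "abc.*def" so that matches within edit distance 2 are reported.
    static hs_database_t *build_approx_db() {
        hs_expr_ext ext;
        std::memset(&ext, 0, sizeof(ext));
        ext.flags = HS_EXT_FLAG_EDIT_DISTANCE;
        ext.edit_distance = 2;

        const char *expr = "abc.*def";
        unsigned flags = 0;
        unsigned id = 1;
        const hs_expr_ext *ext_ptr = &ext;

        hs_database_t *db = nullptr;
        hs_compile_error_t *compile_err = nullptr;
        if (hs_compile_ext_multi(&expr, &flags, &id, &ext_ptr, 1,
                                 HS_MODE_BLOCK, nullptr, &db,
                                 &compile_err) != HS_SUCCESS) {
            hs_free_compile_error(compile_err);
            return nullptr;
        }
        return db;
    }

This is the same hs_compile_ext_multi() path the test utilities in this patch use; note that edit_distance is the new fifth field of hs_expr_ext, which is why the aggregate initialisers below gain an extra element.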
- {"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0}, 6, 10, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0}, 6, 10, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100}, 100, UINT_MAX, 0, 0, 0}, - {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5}, 6, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_MAX_OFFSET, 0, 10, 0, 0}, 6, 10, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 100, 0}, 100, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_MIN_LENGTH, 0, 0, 5, 0}, 6, UINT_MAX, 0, 0, 0}, + + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2}, + 10, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, + 4, UINT_MAX, 0, 0, 0}, + {"abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2}, + 4, 6, 0, 0, 0}, + + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 10, 2}, + 10, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, + 4, UINT_MAX, 0, 0, 0}, + {"^abc.*def", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2}, + 4, 6, 0, 0, 0}, + + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 1}, 5, 7, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE, 0, 0, 0, 2}, 4, 8, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_LENGTH, 0, 0, 8, 2}, + 8, 8, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MIN_OFFSET, 6, 0, 0, 2}, + 4, 8, 0, 0, 0}, + {"^abcdef", {HS_EXT_FLAG_EDIT_DISTANCE | HS_EXT_FLAG_MAX_OFFSET, 0, 6, 0, 2}, + 4, 6, 0, 0, 0}, }; INSTANTIATE_TEST_CASE_P(ExprInfo, ExprInfop, ValuesIn(ei_test)); diff --git a/unit/hyperscan/serialize.cpp b/unit/hyperscan/serialize.cpp index 7e0fcb7ce..3b34abacd 100644 --- a/unit/hyperscan/serialize.cpp +++ b/unit/hyperscan/serialize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,55 +31,78 @@ */ #include "config.h" -#include -#include -#include - #include "gtest/gtest.h" #include "hs.h" #include "hs_internal.h" #include "test_util.h" +#include +#include +#include + namespace { using namespace std; using namespace testing; static const unsigned validModes[] = { - HS_MODE_STREAM, - HS_MODE_NOSTREAM + HS_MODE_NOSTREAM, + HS_MODE_STREAM | HS_MODE_SOM_HORIZON_LARGE, + HS_MODE_VECTORED }; -class Serializep : public TestWithParam { +static const pattern testPatterns[] = { + pattern("hatstand.*teakettle.*badgerbrush", HS_FLAG_CASELESS, 1000), + pattern("hatstand.*teakettle.*badgerbrush", HS_FLAG_DOTALL, 1001), + pattern("hatstand|teakettle|badgerbrush", 0, 1002), + pattern("^hatstand|teakettle|badgerbrush$", 0, 1003), + pattern("foobar.{10,1000}xyzzy", HS_FLAG_DOTALL, 1004), + pattern("foobar.{2,501}roobar", 0, 1005), + pattern("abc.*def.*ghi", HS_FLAG_SOM_LEFTMOST, 1006), + pattern("(\\p{L}){4}", HS_FLAG_UTF8|HS_FLAG_UCP, 1007), + 
pattern("\\.(exe|pdf|gif|jpg|png|wav|riff|mp4)\\z", 0, 1008) }; +class SerializeP : public TestWithParam> {}; + +static +const char *getModeString(unsigned mode) { + if (mode & HS_MODE_STREAM) { + return "STREAM"; + } + if (mode & HS_MODE_BLOCK) { + return "BLOCK"; + } + if (mode & HS_MODE_VECTORED) { + return "VECTORED"; + } + return "UNKNOWN"; +} + // Check that we can deserialize from a char array at any alignment and the info // is consistent -TEST_P(Serializep, DeserializeFromAnyAlignment) { - const unsigned mode = GetParam(); +TEST_P(SerializeP, DeserializeFromAnyAlignment) { + const unsigned mode = get<0>(GetParam()); + const pattern &pat = get<1>(GetParam()); SCOPED_TRACE(mode); + SCOPED_TRACE(pat); hs_error_t err; - hs_database_t *db = buildDB("hatstand.*teakettle.*badgerbrush", - HS_FLAG_CASELESS, 1000, mode); + hs_database_t *db = buildDB(pat, mode); ASSERT_TRUE(db != nullptr) << "database build failed."; char *original_info = nullptr; err = hs_database_info(db, &original_info); ASSERT_EQ(HS_SUCCESS, err); - const char *mode_string = nullptr; - switch (mode) { - case HS_MODE_STREAM: - mode_string = "STREAM"; - break; - case HS_MODE_NOSTREAM: - mode_string = "BLOCK"; - } + const char *mode_string = getModeString(mode); - ASSERT_NE(nullptr, original_info) << "hs_serialized_database_info returned null."; + ASSERT_NE(nullptr, original_info) + << "hs_serialized_database_info returned null."; ASSERT_STREQ("Version:", string(original_info).substr(0, 8).c_str()); - ASSERT_TRUE(strstr(original_info, mode_string) != nullptr); + ASSERT_TRUE(strstr(original_info, mode_string) != nullptr) + << "Original info \"" << original_info + << "\" does not contain " << mode_string; char *bytes = nullptr; size_t length = 0; @@ -133,31 +156,28 @@ TEST_P(Serializep, DeserializeFromAnyAlignment) { // Check that we can deserialize_at from a char array at any alignment and the // info is consistent -TEST_P(Serializep, DeserializeAtFromAnyAlignment) { - const unsigned mode = GetParam(); +TEST_P(SerializeP, DeserializeAtFromAnyAlignment) { + const unsigned mode = get<0>(GetParam()); + const pattern &pat = get<1>(GetParam()); SCOPED_TRACE(mode); + SCOPED_TRACE(pat); hs_error_t err; - hs_database_t *db = buildDB("hatstand.*teakettle.*badgerbrush", - HS_FLAG_CASELESS, 1000, mode); + hs_database_t *db = buildDB(pat, mode); ASSERT_TRUE(db != nullptr) << "database build failed."; char *original_info; err = hs_database_info(db, &original_info); ASSERT_EQ(HS_SUCCESS, err); - const char *mode_string = nullptr; - switch (mode) { - case HS_MODE_STREAM: - mode_string = "STREAM"; - break; - case HS_MODE_NOSTREAM: - mode_string = "BLOCK"; - } + const char *mode_string = getModeString(mode); - ASSERT_NE(nullptr, original_info) << "hs_serialized_database_info returned null."; + ASSERT_NE(nullptr, original_info) + << "hs_serialized_database_info returned null."; ASSERT_STREQ("Version:", string(original_info).substr(0, 8).c_str()); - ASSERT_TRUE(strstr(original_info, mode_string) != nullptr); + ASSERT_TRUE(strstr(original_info, mode_string) != nullptr) + << "Original info \"" << original_info + << "\" does not contain " << mode_string; char *bytes = nullptr; size_t length = 0; @@ -217,8 +237,8 @@ TEST_P(Serializep, DeserializeAtFromAnyAlignment) { delete[] mem; } -INSTANTIATE_TEST_CASE_P(Serialize, Serializep, - ValuesIn(validModes)); +INSTANTIATE_TEST_CASE_P(Serialize, SerializeP, + Combine(ValuesIn(validModes), ValuesIn(testPatterns))); // Attempt to reproduce the scenario in UE-1946. 
TEST(Serialize, CrossCompileSom) { @@ -226,11 +246,10 @@ TEST(Serialize, CrossCompileSom) { plat.cpu_features = 0; plat.tune = HS_TUNE_FAMILY_GENERIC; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; const unsigned mode = HS_MODE_STREAM | HS_MODE_SOM_HORIZON_LARGE; - hs_database_t *db = buildDB(pattern, HS_FLAG_SOM_LEFTMOST, 1000, mode, - &plat); + hs_database_t *db = buildDB(pat, HS_FLAG_SOM_LEFTMOST, 1000, mode, &plat); ASSERT_TRUE(db != nullptr) << "database build failed."; size_t db_len; @@ -275,15 +294,16 @@ static void misaligned_free(void *p) { free(c - 1); } -// make sure that serializing/deserializing to null or an unaligned address fails +// make sure that serializing/deserializing to null or an unaligned address +// fails TEST(Serialize, CompileNullMalloc) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; // mallocing null should fail compile hs_set_allocator(null_malloc, nullptr); - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_NE(HS_SUCCESS, err); ASSERT_TRUE(db == nullptr); ASSERT_TRUE(c_err != nullptr); @@ -294,14 +314,14 @@ TEST(Serialize, CompileNullMalloc) { TEST(Serialize, CompileErrorAllocator) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatsta^nd.*(badgerbrush|teakettle)"; + static const char *pat = "hatsta^nd.*(badgerbrush|teakettle)"; // failing to compile should use the misc allocator allocated_count = 0; allocated_count_b = 0; hs_set_allocator(count_malloc_b, count_free_b); hs_set_misc_allocator(count_malloc, count_free); - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_NE(HS_SUCCESS, err); ASSERT_TRUE(db == nullptr); ASSERT_TRUE(c_err != nullptr); @@ -315,13 +335,13 @@ TEST(Serialize, CompileErrorAllocator) { TEST(Serialize, AllocatorsUsed) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; allocated_count = 0; allocated_count_b = 0; hs_set_allocator(count_malloc_b, count_free_b); hs_set_database_allocator(count_malloc, count_free); - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); ASSERT_TRUE(c_err == nullptr); @@ -344,15 +364,14 @@ TEST(Serialize, AllocatorsUsed) { ASSERT_EQ(0, allocated_count_b); } - TEST(Serialize, CompileUnalignedMalloc) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; // unaligned malloc should fail compile hs_set_allocator(misaligned_malloc, misaligned_free); - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_NE(HS_SUCCESS, err); ASSERT_TRUE(db == nullptr); ASSERT_TRUE(c_err != nullptr); @@ -363,8 +382,8 @@ TEST(Serialize, CompileUnalignedMalloc) { TEST(Serialize, SerializeNullMalloc) { hs_database_t *db; hs_compile_error_t *c_err; - static const 
char *pattern = "hatstand.*(badgerbrush|teakettle)"; - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -384,13 +403,14 @@ TEST(Serialize, SerializeNullMalloc) { hs_free_database(db); } -// make sure that serializing/deserializing to null or an unaligned address fails +// make sure that serializing/deserializing to null or an unaligned address +// fails TEST(Serialize, SerializeUnalignedMalloc) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat= "hatstand.*(badgerbrush|teakettle)"; - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -414,9 +434,9 @@ TEST(Serialize, SerializeUnalignedMalloc) { TEST(Serialize, DeserializeNullMalloc) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -447,9 +467,9 @@ TEST(Serialize, DeserializeNullMalloc) { TEST(Serialize, DeserializeUnalignedMalloc) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -486,9 +506,9 @@ TEST(Serialize, DeserializeUnalignedMalloc) { TEST(Serialize, DeserializeGarbage) { hs_database_t *db; hs_compile_error_t *c_err; - static const char *pattern = "hatstand.*(badgerbrush|teakettle)"; + static const char *pat = "hatstand.*(badgerbrush|teakettle)"; - hs_error_t err = hs_compile(pattern, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); + hs_error_t err = hs_compile(pat, 0, HS_MODE_BLOCK, nullptr, &db, &c_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); diff --git a/unit/hyperscan/single.cpp b/unit/hyperscan/single.cpp index 029d223ae..01fbfeab5 100644 --- a/unit/hyperscan/single.cpp +++ b/unit/hyperscan/single.cpp @@ -363,7 +363,8 @@ static const unsigned validModes[] = { // Mode bits for switching off various architecture features static const unsigned long long featureMask[] = { ~0ULL, /* native */ - ~HS_CPU_FEATURES_AVX2, /* no avx2 */ + ~(HS_CPU_FEATURES_AVX2 | HS_CPU_FEATURES_AVX512), /* no avx2 */ + ~HS_CPU_FEATURES_AVX512, /* no avx512 */ }; INSTANTIATE_TEST_CASE_P(Single, diff --git a/unit/hyperscan/test_util.cpp b/unit/hyperscan/test_util.cpp index 345b05d00..f6c20a74e 100644 --- a/unit/hyperscan/test_util.cpp +++ b/unit/hyperscan/test_util.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -26,32 +26,37 @@ * POSSIBILITY OF SUCH DAMAGE. 
*/ -#include -#include -#include -#include - #include "hs.h" #include "test_util.h" #include "gtest/gtest.h" #include "util/expressions.h" #include "util/ExpressionParser.h" +#include +#include +#include +#include + using namespace std; int record_cb(unsigned id, unsigned long long, unsigned long long to, unsigned, void *ctxt) { CallBackContext *c = (CallBackContext *)ctxt; - c->matches.push_back(MatchRecord(to, id)); + c->matches.emplace_back(to, id); return (int)c->halt; } -std::ostream &operator<< (std::ostream &o, const MatchRecord &m) { +std::ostream &operator<<(std::ostream &o, const MatchRecord &m) { return o << "[" << m.to << ", " << m.id << "]"; } +std::ostream &operator<<(std::ostream &o, const pattern &p) { + return o << "[" << "expr=\"" << p.expression << "\", flags=" << p.flags + << ", id=" << p.id << "]"; +} + hs_database_t *buildDB(const vector &patterns, unsigned int mode, hs_platform_info *plat) { vector expressions; @@ -59,20 +64,20 @@ hs_database_t *buildDB(const vector &patterns, unsigned int mode, vector ids; vector ext; - for (vector::const_iterator it = patterns.begin(); - it != patterns.end(); ++it) { - expressions.push_back(it->expression.c_str()); - flags.push_back(it->flags); - ids.push_back(it->id); - ext.push_back(&it->ext); + for (const auto &pat : patterns) { + expressions.push_back(pat.expression.c_str()); + flags.push_back(pat.flags); + ids.push_back(pat.id); + ext.push_back(&pat.ext); } hs_database_t *db = nullptr; hs_compile_error_t *compile_err = nullptr; hs_error_t err; - err = hs_compile_ext_multi(&expressions[0], &flags[0], &ids[0], &ext[0], - patterns.size(), mode, plat, &db, &compile_err); + err = hs_compile_ext_multi(expressions.data(), flags.data(), ids.data(), + ext.data(), patterns.size(), mode, plat, &db, + &compile_err); if (err != HS_SUCCESS) { return nullptr; @@ -82,15 +87,13 @@ hs_database_t *buildDB(const vector &patterns, unsigned int mode, } hs_database_t *buildDB(const pattern &expr, unsigned int mode) { - return buildDB(vector(1, expr), mode); + return buildDB(vector({expr}), mode); } hs_database_t *buildDB(const char *expression, unsigned int flags, unsigned int id, unsigned int mode, hs_platform_info_t *plat) { - vector patterns; - patterns.push_back(pattern(expression, flags, id)); - return buildDB(patterns, mode, plat); + return buildDB({pattern(expression, flags, id)}, mode, plat); } hs_database_t *buildDB(const char *filename, unsigned int mode, @@ -99,16 +102,14 @@ hs_database_t *buildDB(const char *filename, unsigned int mode, ExpressionMap expressions; loadExpressionsFromFile(filename, expressions); - for (ExpressionMap::iterator it = expressions.begin(); - it != expressions.end(); ++it) { + for (const auto &expr : expressions) { unsigned int flags = 0; string regex; hs_expr_ext ext; - if (!readExpression(it->second, regex, &flags, &ext)) { + if (!readExpression(expr.second, regex, &flags, &ext)) { return nullptr; } - patterns.push_back(pattern(regex, flags | extra_flags, it->first, - ext)); + patterns.emplace_back(regex, flags | extra_flags, expr.first, ext); } return buildDB(patterns, mode); } @@ -145,13 +146,13 @@ hs_database_t *buildDB(const char *filename, unsigned int mode, ExpressionMap expressions; loadExpressionsFromFile(filename, expressions); - for (ExpressionMap::iterator it = expressions.begin(); - it != expressions.end(); ++it) { + for (const auto &expr : expressions) { unsigned int flags = 0; string regex; hs_expr_ext ext; bool must_be_ordered; - if (!readExpression(it->second, regex, &flags, &ext, 
&must_be_ordered)) { + if (!readExpression(expr.second, regex, &flags, &ext, + &must_be_ordered)) { return nullptr; } @@ -159,7 +160,7 @@ hs_database_t *buildDB(const char *filename, unsigned int mode, return nullptr; } - patterns.emplace_back(regex, flags, it->first, ext); + patterns.emplace_back(regex, flags, expr.first, ext); } return buildDB(patterns, mode); } diff --git a/unit/hyperscan/test_util.h b/unit/hyperscan/test_util.h index fad6137c1..efa0570c3 100644 --- a/unit/hyperscan/test_util.h +++ b/unit/hyperscan/test_util.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,13 +29,13 @@ #ifndef TEST_UTIL_H #define TEST_UTIL_H +#include "hs.h" + #include #include #include #include -#include "hs.h" - #ifndef UNUSED #if defined(_WIN32) || defined(_WIN64) #define UNUSED @@ -53,11 +53,10 @@ struct MatchRecord { int id; }; -std::ostream &operator<< (std::ostream &o, const MatchRecord &m); +std::ostream &operator<<(std::ostream &o, const MatchRecord &m); struct CallBackContext { - CallBackContext() : halt(false) {} - bool halt; + bool halt = false; std::vector matches; void clear() { @@ -79,22 +78,29 @@ int dummy_cb(unsigned, unsigned long long, unsigned long long, unsigned, struct pattern { std::string expression; - unsigned int flags; - unsigned int id; + unsigned int flags = 0; + unsigned int id = 0; hs_expr_ext ext; - pattern(const std::string &expression_in, unsigned int flags_in = 0, - unsigned int id_in = 0) : expression(expression_in), - flags(flags_in), id(id_in) { + // We need a default constructor for combining in parameterised tests. 
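The default constructor called out in the comment just below exists so that pattern can take part in the std::tuple parameters produced by googletest's Combine(), as the reworked serialize tests now require. A reduced sketch of that combination (hypothetical test names, standing in for the SerializeP tests above):

    #include "gtest/gtest.h"
    #include <tuple>

    // Parameter elements must be copyable, and the framework may
    // default-construct tuple members before the real values are copied in.
    class ComboP : public ::testing::TestWithParam<std::tuple<unsigned, int>> {};

    TEST_P(ComboP, CrossProduct) {
        const unsigned mode = std::get<0>(GetParam());
        const int id = std::get<1>(GetParam());
        ASSERT_GT(mode, 0u);
        ASSERT_GT(id, 0);
    }

    INSTANTIATE_TEST_CASE_P(Demo, ComboP,
                            ::testing::Combine(::testing::Values(1u, 2u),
                                               ::testing::Values(100, 200)));

Each test instance then receives one element of the cross product, which is how the serialize tests cover every mode against every pattern.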
+ pattern() { memset(&ext, 0, sizeof(ext)); } - pattern(const std::string &expression_in, unsigned int flags_in, - unsigned int id_in, const hs_expr_ext &ext_in) : - expression(expression_in), flags(flags_in), id(id_in), - ext(ext_in) { } + explicit pattern(std::string expression_in, + unsigned int flags_in = 0, unsigned int id_in = 0) + : expression(std::move(expression_in)), flags(flags_in), id(id_in) { + memset(&ext, 0, sizeof(ext)); + } + + pattern(std::string expression_in, unsigned int flags_in, + unsigned int id_in, hs_expr_ext ext_in) + : expression(std::move(expression_in)), flags(flags_in), id(id_in), + ext(std::move(ext_in)) {} }; +std::ostream &operator<<(std::ostream &o, const pattern &p); + hs_database_t *buildDB(const std::vector &patterns, unsigned int mode, hs_platform_info *plat = nullptr); hs_database_t *buildDB(const pattern &pat, unsigned int mode); diff --git a/unit/internal/bitutils.cpp b/unit/internal/bitutils.cpp index 31aaf17fc..3f7885449 100644 --- a/unit/internal/bitutils.cpp +++ b/unit/internal/bitutils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,32 +29,33 @@ #include "config.h" #include "gtest/gtest.h" +#include "util/arch.h" #include "util/bitutils.h" #include "util/popcount.h" // open coded implementations to test against static u32 our_clz(u32 x) { - u32 n; - - if (x == 0) return(32); - n = 0; - if (x <= 0x0000FFFF) { n = n + 16; x = x << 16; } - if (x <= 0x00FFFFFF) { n = n + 8; x = x << 8; } - if (x <= 0x0FFFFFFF) { n = n + 4; x = x << 4; } - if (x <= 0x3FFFFFFF) { n = n + 2; x = x << 2; } - if (x <= 0x7FFFFFFF) { n = n + 1; } - return n; + u32 n; + + if (x == 0) return(32); + n = 0; + if (x <= 0x0000FFFF) { n = n + 16; x = x << 16; } + if (x <= 0x00FFFFFF) { n = n + 8; x = x << 8; } + if (x <= 0x0FFFFFFF) { n = n + 4; x = x << 4; } + if (x <= 0x3FFFFFFF) { n = n + 2; x = x << 2; } + if (x <= 0x7FFFFFFF) { n = n + 1; } + return n; } static u32 our_clzll(u64a x) { - // Synthesise from 32-bit variant. - u32 high = x >> 32; - if (high) { - return our_clz(high); - } - return 32 + our_clz(x); + // Synthesise from 32-bit variant. 
+ u32 high = x >> 32; + if (high) { + return our_clz(high); + } + return 32 + our_clz(x); } @@ -437,7 +438,7 @@ TEST(BitUtils, rank_in_mask64) { ASSERT_EQ(31, rank_in_mask64(0xf0f0f0f0f0f0f0f0ULL, 63)); } -#if defined(HAVE_PEXT) && defined(ARCH_64_BIT) +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) TEST(BitUtils, pdep64) { u64a data = 0xF123456789ABCDEF; ASSERT_EQ(0xfULL, pdep64(data, 0xf)); diff --git a/unit/internal/database.cpp b/unit/internal/database.cpp index cb3e76b50..8f0c1a695 100644 --- a/unit/internal/database.cpp +++ b/unit/internal/database.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #include "crc32.h" #include "database.h" #include "ue2common.h" +#include "util/arch.h" #include "util/target_info.h" #include "gtest/gtest.h" @@ -47,10 +48,14 @@ TEST(DB, flagsToPlatform) { p.cpu_features = 0; -#if defined(__AVX2__) +#if defined(HAVE_AVX2) p.cpu_features |= HS_CPU_FEATURES_AVX2; #endif +#if defined(HAVE_AVX512) + p.cpu_features |= HS_CPU_FEATURES_AVX512; +#endif + platform_t pp = target_to_platform(target_t(p)); ASSERT_EQ(pp, hs_current_platform); } diff --git a/unit/internal/depth.cpp b/unit/internal/depth.cpp index a004643b5..ad9ffe388 100644 --- a/unit/internal/depth.cpp +++ b/unit/internal/depth.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -112,9 +112,10 @@ TEST(depth, add_finite) { ASSERT_EQ(depth(900), depth(1000) + s32{-100}); // overflow must throw + depth max_depth(depth::max_value()); depth d; - ASSERT_THROW(d = depth::max_value() + depth(1), DepthOverflowError); - ASSERT_THROW(d = depth::max_value() + 1, DepthOverflowError); + ASSERT_THROW(d = max_depth + depth(1), DepthOverflowError); + ASSERT_THROW(d = max_depth + 1, DepthOverflowError); // underflow must throw ASSERT_THROW(d = depth(0) + s32{-1}, DepthOverflowError); @@ -267,11 +268,11 @@ TEST(depth, unordered_set) { ue2::unordered_set depths; for (const auto &val : finite_values) { - depths.insert(val); + depths.emplace(val); } for (const auto &val : finite_values) { - ASSERT_TRUE(depths.find(val) != depths.end()); + ASSERT_TRUE(depths.find(depth(val)) != depths.end()); } ASSERT_TRUE(depths.find(depth::infinity()) == depths.end()); diff --git a/unit/internal/fdr.cpp b/unit/internal/fdr.cpp index 6116bfdb6..bd0bb4c0c 100644 --- a/unit/internal/fdr.cpp +++ b/unit/internal/fdr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -96,15 +96,6 @@ struct match { }; extern "C" { -static -hwlmcb_rv_t countCallback(UNUSED size_t start, UNUSED size_t end, u32, - void *ctxt) { - if (ctxt) { - ++*(u32 *)ctxt; - } - - return HWLM_CONTINUE_MATCHING; -} static hwlmcb_rv_t decentCallback(size_t start, size_t end, u32 id, void *ctxt) { @@ -231,42 +222,6 @@ TEST_P(FDRp, MultiLocation) { } } -TEST_P(FDRp, Flood) { - const u32 hint = GetParam(); - SCOPED_TRACE(hint); - - vector lits; - lits.push_back(hwlmLiteral("aaaa", 0, 1)); - lits.push_back(hwlmLiteral("aaaaaaaa", 0, 2)); 
- lits.push_back(hwlmLiteral("baaaaaaaa", 0, 3)); - lits.push_back(hwlmLiteral("aaaaaaaab", 0, 4)); - - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); - CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - - const u32 testSize = 1024; - vector data(testSize, 'a'); - - vector matches; - fdrExec(fdr.get(), data.data(), testSize, 0, decentCallback, &matches, - HWLM_ALL_GROUPS); - ASSERT_EQ(testSize - 3 + testSize - 7, matches.size()); - EXPECT_EQ(match(0, 3, 1), matches[0]); - EXPECT_EQ(match(1, 4, 1), matches[1]); - EXPECT_EQ(match(2, 5, 1), matches[2]); - EXPECT_EQ(match(3, 6, 1), matches[3]); - - u32 currentMatch = 4; - for (u32 i = 7; i < testSize; i++, currentMatch += 2) { - EXPECT_TRUE( - (match(i - 3, i, 1) == matches[currentMatch] && - match(i - 7, i, 2) == matches[currentMatch+1]) || - (match(i - 7, i, 2) == matches[currentMatch+1] && - match(i - 3, i, 1) == matches[currentMatch]) - ); - } -} - TEST_P(FDRp, NoRepeat1) { const u32 hint = GetParam(); SCOPED_TRACE(hint); @@ -414,36 +369,6 @@ TEST_P(FDRp, SmallStreaming2) { ASSERT_EQ(expected.size(), matches.size()); } -TEST_P(FDRp, LongLiteral) { - const u32 hint = GetParam(); - SCOPED_TRACE(hint); - size_t sz; - const u8 *data; - vector lits; - - string alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - string alpha4 = alpha+alpha+alpha+alpha; - lits.push_back(hwlmLiteral(alpha4.c_str(), 0,10)); - - auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); - CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - - u32 count = 0; - - data = (const u8 *)alpha4.c_str(); - sz = alpha4.size(); - - fdrExec(fdr.get(), data, sz, 0, countCallback, &count, HWLM_ALL_GROUPS); - EXPECT_EQ(1U, count); - count = 0; - fdrExec(fdr.get(), data, sz - 1, 0, countCallback, &count, HWLM_ALL_GROUPS); - EXPECT_EQ(0U, count); - count = 0; - fdrExec(fdr.get(), data + 1, sz - 1, 0, countCallback, &count, - HWLM_ALL_GROUPS); - EXPECT_EQ(0U, count); -} - TEST_P(FDRp, moveByteStream) { const u32 hint = GetParam(); SCOPED_TRACE(hint); @@ -458,7 +383,7 @@ TEST_P(FDRp, moveByteStream) { size_t size = fdrSize(fdrTable0.get()); - auto fdrTable = aligned_zmalloc_unique(size); + auto fdrTable = make_bytecode_ptr(size, 64); EXPECT_NE(nullptr, fdrTable); memcpy(fdrTable.get(), fdrTable0.get(), size); @@ -491,7 +416,7 @@ TEST_P(FDRp, Stream1) { vector lits; lits.push_back(hwlmLiteral("f", 0, 0)); - lits.push_back(hwlmLiteral("longsigislong", 0, 1)); + lits.push_back(hwlmLiteral("literal", 0, 1)); auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); @@ -514,7 +439,7 @@ INSTANTIATE_TEST_CASE_P(FDR, FDRp, ValuesIn(getValidFdrEngines())); typedef struct { string pattern; - unsigned char alien; + unsigned char alien; // character not present in pattern } pattern_alien_t; // gtest helper @@ -529,7 +454,6 @@ class FDRpp : public TestWithParam> {}; // not happen if literal is partially (from 1 character up to full literal // length) is out of searched buffer - "too early" and "too late" conditions TEST_P(FDRpp, AlignAndTooEarly) { - const size_t buf_alignment = 32; // Buffer should be big enough to hold two instances of matching literals // (up to 64 bytes each) and room for offset (up to 32 bytes) @@ -538,7 +462,7 @@ TEST_P(FDRpp, AlignAndTooEarly) { const u32 hint = get<0>(GetParam()); SCOPED_TRACE(hint); - // pattern which is used to generate literals of variable size - from 1 to 64 + // pattern which is used to generate literals of variable size - from 1 to 8 const 
string &pattern = get<1>(GetParam()).pattern; const size_t patLen = pattern.size(); const unsigned char alien = get<1>(GetParam()).alien; @@ -551,7 +475,7 @@ TEST_P(FDRpp, AlignAndTooEarly) { vector lits; for (size_t litLen = 1; litLen <= patLen; litLen++) { - // building literal from pattern substring of variable length 1-64 + // building literal from pattern substring of variable length 1-patLen lits.push_back(hwlmLiteral(string(pattern, 0, litLen), 0, 0)); auto fdr = fdrBuildTableHinted(lits, false, hint, get_current_target(), Grey()); @@ -596,9 +520,9 @@ TEST_P(FDRpp, AlignAndTooEarly) { } static const pattern_alien_t test_pattern[] = { - {"abaabaaabaaabbaaaaabaaaaabbaaaaaaabaabbaaaabaaaaaaaabbbbaaaaaaab", 'x'}, - {"zzzyyzyzyyyyzyyyyyzzzzyyyyyyyyzyyyyyyyzzzzzyzzzzzzzzzyzzyzzzzzzz", (unsigned char)'\x99'}, - {"abcdef lafjk askldfjklf alfqwei9rui 'gldgkjnooiuswfs138746453583", '\0'} + {"abaabaaa", 'x'}, + {"zzzyyzyz", (unsigned char)'\x99'}, + {"abcdef l", '\0'} }; INSTANTIATE_TEST_CASE_P(FDR, FDRpp, Combine(ValuesIn(getValidFdrEngines()), diff --git a/unit/internal/fdr_flood.cpp b/unit/internal/fdr_flood.cpp index 7b00ac4c8..952fffc19 100644 --- a/unit/internal/fdr_flood.cpp +++ b/unit/internal/fdr_flood.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -161,8 +161,8 @@ TEST_P(FDRFloodp, NoMask) { vector lits; // build literals of type "aaaa", "aaab", "baaa" - // of lengths 1, 2, 4, 8, 16, 32, both case-less and case-sensitive - for (int i = 0; i < 6 ; i++) { + // of lengths 1, 2, 4, 8, both case-less and case-sensitive + for (int i = 0; i < 4; i++) { string s(1 << i, c); lits.push_back(hwlmLiteral(s, false, i * 8 + 0)); s[0] = cAlt; @@ -183,13 +183,13 @@ TEST_P(FDRFloodp, NoMask) { Grey()); CHECK_WITH_TEDDY_OK_TO_FAIL(fdr, hint); - map matchesCounts; + map matchesCounts; hwlm_error_t fdrStatus = fdrExec(fdr.get(), &data[0], dataSize, 0, countCallback, (void *)&matchesCounts, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); - for (u8 i = 0; i < 6 ; i++) { + for (u8 i = 0; i < 4; i++) { u32 cnt = dataSize - (1 << i) + 1; ASSERT_EQ(cnt, matchesCounts[i * 8 + 0]); ASSERT_EQ(0, matchesCounts[i * 8 + 1]); @@ -214,7 +214,7 @@ TEST_P(FDRFloodp, NoMask) { 0, countCallback, (void *)&matchesCounts, HWLM_ALL_GROUPS); ASSERT_EQ(0, fdrStatus); - for (u8 i = 0; i < 6 ; i++) { + for (u8 i = 0; i < 4; i++) { u32 cnt = dataSize - (1 << i) + 1; ASSERT_EQ(0, matchesCounts[i * 8 + 0]); ASSERT_EQ(i == 0 ? cnt : 0, matchesCounts[i * 8 + 1]); diff --git a/unit/internal/fdr_loadval.cpp b/unit/internal/fdr_loadval.cpp index 22fee7704..bb5efb5f0 100644 --- a/unit/internal/fdr_loadval.cpp +++ b/unit/internal/fdr_loadval.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,7 +30,7 @@ #include "gtest/gtest.h" #include "fdr/fdr_loadval.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" using namespace std; using namespace testing; @@ -71,7 +71,7 @@ static void fillWithBytes(u8 *ptr, size_t len) { TYPED_TEST(FDR_Loadval, Normal) { // We should be able to do a normal load at any alignment. 
const size_t len = sizeof(TypeParam); - aligned_unique_ptr mem_p = aligned_zmalloc_unique(len + 15); + auto mem_p = make_bytecode_ptr(len + 15, 16); u8 * mem = mem_p.get(); ASSERT_TRUE(ISALIGNED_16(mem)); fillWithBytes(mem, len + 15); @@ -90,7 +90,7 @@ TYPED_TEST(FDR_Loadval, CautiousEverywhere) { // the 'lo' ptr or after the 'hi' ptr. const size_t len = sizeof(TypeParam); - aligned_unique_ptr mem_p = aligned_zmalloc_unique(len + 1); + auto mem_p = make_bytecode_ptr(len + 1, 16); u8 *mem = mem_p.get() + 1; // force unaligned fillWithBytes(mem, len); diff --git a/unit/internal/flat_map.cpp b/unit/internal/flat_map.cpp index 54372dece..6a81bbfed 100644 --- a/unit/internal/flat_map.cpp +++ b/unit/internal/flat_map.cpp @@ -211,6 +211,7 @@ TEST(flat_map, custom_compare) { ASSERT_EQ(10, f.rbegin()->second); ASSERT_TRUE(flat_map_is_sorted(f)); + ASSERT_TRUE(std::is_sorted(f.begin(), f.end(), f.value_comp())); ASSERT_TRUE(flat_map_is_sorted_cmp(f, std::greater())); } @@ -401,3 +402,41 @@ TEST(flat_map, max_size) { flat_map f; ASSERT_LE(1ULL << 24, f.max_size()); } + +TEST(flat_map, hash_value) { + const vector> input = { + {0, 0}, {3, 1}, {76, 2}, {132, 3}, {77, 4}, {99999, 5}, {100, 6}}; + for (size_t len = 0; len < input.size(); len++) { + flat_map f1(input.begin(), input.begin() + len); + flat_map f2(input.rbegin() + input.size() - len, + input.rend()); + EXPECT_EQ(hash_value(f1), hash_value(f2)); + + // Try removing an element. + auto f3 = f1; + EXPECT_EQ(hash_value(f1), hash_value(f3)); + EXPECT_EQ(hash_value(f2), hash_value(f3)); + if (!f3.empty()) { + f3.erase(f3.begin()); + EXPECT_NE(hash_value(f1), hash_value(f3)); + EXPECT_NE(hash_value(f2), hash_value(f3)); + } + + // Try adding an element. + f3 = f1; + EXPECT_EQ(hash_value(f1), hash_value(f3)); + EXPECT_EQ(hash_value(f2), hash_value(f3)); + f3.emplace(32767, 7); + EXPECT_NE(hash_value(f1), hash_value(f3)); + EXPECT_NE(hash_value(f2), hash_value(f3)); + + // Change a value, but not a key. + f3 = f1; + EXPECT_EQ(hash_value(f1), hash_value(f3)); + EXPECT_EQ(hash_value(f2), hash_value(f3)); + f3.erase(77); + f3.emplace(77, 10); + EXPECT_NE(hash_value(f1), hash_value(f3)); + EXPECT_NE(hash_value(f2), hash_value(f3)); + } +} diff --git a/unit/internal/flat_set.cpp b/unit/internal/flat_set.cpp index 7d45cbb20..3bee0edbe 100644 --- a/unit/internal/flat_set.cpp +++ b/unit/internal/flat_set.cpp @@ -392,3 +392,31 @@ TEST(flat_set, max_size) { flat_set f; ASSERT_LE(1ULL << 24, f.max_size()); } + +TEST(flat_set, hash_value) { + const vector input = {0, 15, 3, 1, 20, 32768, + 24000000, 17, 100, 101, 104, 99999}; + for (size_t len = 0; len < input.size(); len++) { + flat_set f1(input.begin(), input.begin() + len); + flat_set f2(input.rbegin() + input.size() - len, input.rend()); + EXPECT_EQ(hash_value(f1), hash_value(f2)); + + // Try removing an element. + auto f3 = f1; + EXPECT_EQ(hash_value(f1), hash_value(f3)); + EXPECT_EQ(hash_value(f2), hash_value(f3)); + if (!f3.empty()) { + f3.erase(f3.begin()); + EXPECT_NE(hash_value(f1), hash_value(f3)); + EXPECT_NE(hash_value(f2), hash_value(f3)); + } + + // Try adding an element. 
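The `hash_value` tests above hinge on one property: a flat container keeps its elements in sorted order, so hashing its underlying storage is independent of insertion order. A small self-contained illustration of that property, using `boost::hash_range` as a stand-in for the library's internal hasher:

```cpp
#include <boost/functional/hash.hpp>

#include <algorithm>
#include <cassert>
#include <vector>

// Hash the sorted contents, mimicking a flat_set's invariant that its
// backing vector is always kept sorted.
static std::size_t hash_sorted(std::vector<unsigned> v) {
    std::sort(v.begin(), v.end());
    return boost::hash_range(v.begin(), v.end());
}

int main() {
    const std::vector<unsigned> fwd = {0, 15, 3, 1, 20, 32768};
    const std::vector<unsigned> rev(fwd.rbegin(), fwd.rend());

    // Same elements, different insertion order: the hashes must agree.
    assert(hash_sorted(fwd) == hash_sorted(rev));
    return 0;
}
```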
+ f3 = f1; + EXPECT_EQ(hash_value(f1), hash_value(f3)); + EXPECT_EQ(hash_value(f2), hash_value(f3)); + f3.insert(32767); + EXPECT_NE(hash_value(f1), hash_value(f3)); + EXPECT_NE(hash_value(f2), hash_value(f3)); + } +} diff --git a/unit/internal/lbr.cpp b/unit/internal/lbr.cpp index e40bda02d..d32f7e8fa 100644 --- a/unit/internal/lbr.cpp +++ b/unit/internal/lbr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,20 +29,20 @@ #include "config.h" #include "gtest/gtest.h" -#include "util/target_info.h" -#include "util/charreach.h" +#include "grey.h" +#include "hs_compile.h" /* for controlling ssse3 usage */ +#include "compiler/compiler.h" #include "nfa/lbr.h" #include "nfa/nfa_api.h" -#include "nfa/nfa_internal.h" #include "nfa/nfa_api_util.h" +#include "nfa/nfa_internal.h" +#include "nfagraph/ng.h" #include "nfagraph/ng_lbr.h" #include "nfagraph/ng_util.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" +#include "util/charreach.h" #include "util/compile_context.h" -#include "grey.h" -#include "nfagraph/ng.h" -#include "compiler/compiler.h" -#include "hs_compile.h" /* for controlling ssse3 usage */ +#include "util/target_info.h" #include @@ -96,7 +96,8 @@ class LbrTest : public TestWithParam { const CompileContext cc(true, false, target, grey); ReportManager rm(cc.grey); ParsedExpression parsed(0, pattern.c_str(), flags, 0); - unique_ptr g = buildWrapper(rm, cc, parsed); + auto built_expr = buildGraph(rm, cc, parsed); + const auto &g = built_expr.g; ASSERT_TRUE(g != nullptr); clearReports(*g); @@ -109,8 +110,8 @@ class LbrTest : public TestWithParam { nfa = constructLBR(*g, triggers, cc, rm); ASSERT_TRUE(nfa != nullptr); - full_state = aligned_zmalloc_unique(nfa->scratchStateSize); - stream_state = aligned_zmalloc_unique(nfa->streamStateSize); + full_state = make_bytecode_ptr(nfa->scratchStateSize, 64); + stream_state = make_bytecode_ptr(nfa->streamStateSize); } virtual void initQueue() { @@ -151,13 +152,13 @@ class LbrTest : public TestWithParam { unsigned matches; // Compiled NFA structure. - aligned_unique_ptr nfa; + bytecode_ptr nfa; - // Space for full state. - aligned_unique_ptr full_state; + // Aligned space for full state. + bytecode_ptr full_state; // Space for stream state. - aligned_unique_ptr stream_state; + bytecode_ptr stream_state; // Queue structure. 
struct mq q; diff --git a/unit/internal/limex_nfa.cpp b/unit/internal/limex_nfa.cpp index 804fcb1f2..c70ceeae1 100644 --- a/unit/internal/limex_nfa.cpp +++ b/unit/internal/limex_nfa.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,7 +38,7 @@ #include "nfagraph/ng.h" #include "nfagraph/ng_limex.h" #include "nfagraph/ng_util.h" -#include "util/alloc.h" +#include "util/bytecode_ptr.h" #include "util/target_info.h" using namespace std; @@ -73,7 +73,8 @@ class LimExModelTest : public TestWithParam { CompileContext cc(false, false, target, Grey()); ReportManager rm(cc.grey); ParsedExpression parsed(0, expr.c_str(), flags, 0); - unique_ptr g = buildWrapper(rm, cc, parsed); + auto built_expr = buildGraph(rm, cc, parsed); + const auto &g = built_expr.g; ASSERT_TRUE(g != nullptr); clearReports(*g); @@ -87,8 +88,8 @@ class LimExModelTest : public TestWithParam { type, cc); ASSERT_TRUE(nfa != nullptr); - full_state = aligned_zmalloc_unique(nfa->scratchStateSize); - stream_state = aligned_zmalloc_unique(nfa->streamStateSize); + full_state = make_bytecode_ptr(nfa->scratchStateSize, 64); + stream_state = make_bytecode_ptr(nfa->streamStateSize); } virtual void initQueue() { @@ -115,13 +116,13 @@ class LimExModelTest : public TestWithParam { unsigned matches; // Compiled NFA structure. - aligned_unique_ptr nfa; + bytecode_ptr nfa; // Space for full state. - aligned_unique_ptr full_state; + bytecode_ptr full_state; // Space for stream state. - aligned_unique_ptr stream_state; + bytecode_ptr stream_state; // Queue structure. struct mq q; @@ -186,8 +187,7 @@ TEST_P(LimExModelTest, CompressExpand) { // Expand state into a new copy and check that it matches the original // uncompressed state. - aligned_unique_ptr state_copy = - aligned_zmalloc_unique(nfa->scratchStateSize); + auto state_copy = make_bytecode_ptr(nfa->scratchStateSize, 64); char *dest = state_copy.get(); memset(dest, 0xff, nfa->scratchStateSize); nfaExpandState(nfa.get(), dest, q.streamState, q.offset, @@ -306,7 +306,8 @@ class LimExReverseTest : public TestWithParam { CompileContext cc(false, false, get_current_target(), Grey()); ReportManager rm(cc.grey); ParsedExpression parsed(0, expr.c_str(), flags, 0); - unique_ptr g = buildWrapper(rm, cc, parsed); + auto built_expr = buildGraph(rm, cc, parsed); + const auto &g = built_expr.g; ASSERT_TRUE(g != nullptr); clearReports(*g); @@ -329,7 +330,7 @@ class LimExReverseTest : public TestWithParam { unsigned matches; // Compiled NFA structure. 
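`make_bytecode_ptr<T>(size, align)` appears throughout these hunks as the replacement for `aligned_zmalloc_unique`. The sketch below shows the general shape of such an owner; the names and the POSIX `posix_memalign` allocator are illustrative assumptions, and ue2's real `bytecode_ptr` in `util/bytecode_ptr.h` differs in detail (it also records size and alignment):

```cpp
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <memory>

struct aligned_deleter {
    // free() is the matching deallocator for posix_memalign() blocks.
    void operator()(void *p) const { free(p); }
};

template <typename T>
using aligned_block_ptr = std::unique_ptr<T, aligned_deleter>;

template <typename T>
aligned_block_ptr<T> make_aligned_block(size_t bytes, size_t align) {
    // posix_memalign requires a power-of-two multiple of sizeof(void *).
    align = std::max(align, sizeof(void *));
    void *p = nullptr;
    if (posix_memalign(&p, align, bytes) != 0) {
        return nullptr;
    }
    memset(p, 0, bytes); // the old zmalloc variants returned zeroed memory
    return aligned_block_ptr<T>(static_cast<T *>(p));
}

// Usage mirroring the tests above, e.g. 64-byte aligned NFA scratch state:
//   auto full_state = make_aligned_block<char>(scratch_size, 64);
```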
- aligned_unique_ptr nfa; + bytecode_ptr nfa; }; INSTANTIATE_TEST_CASE_P(LimExReverse, LimExReverseTest, @@ -365,7 +366,8 @@ class LimExZombieTest : public TestWithParam { CompileContext cc(true, false, get_current_target(), Grey()); ParsedExpression parsed(0, expr.c_str(), flags, 0); ReportManager rm(cc.grey); - unique_ptr g = buildWrapper(rm, cc, parsed); + auto built_expr = buildGraph(rm, cc, parsed); + const auto &g = built_expr.g; ASSERT_TRUE(g != nullptr); clearReports(*g); @@ -379,8 +381,8 @@ class LimExZombieTest : public TestWithParam { type, cc); ASSERT_TRUE(nfa != nullptr); - full_state = aligned_zmalloc_unique(nfa->scratchStateSize); - stream_state = aligned_zmalloc_unique(nfa->streamStateSize); + full_state = make_bytecode_ptr(nfa->scratchStateSize, 64); + stream_state = make_bytecode_ptr(nfa->streamStateSize); } virtual void initQueue() { @@ -407,13 +409,13 @@ class LimExZombieTest : public TestWithParam { unsigned matches; // Compiled NFA structure. - aligned_unique_ptr nfa; + bytecode_ptr nfa; // Space for full state. - aligned_unique_ptr full_state; + bytecode_ptr full_state; // Space for stream state. - aligned_unique_ptr stream_state; + bytecode_ptr stream_state; // Queue structure. struct mq q; diff --git a/unit/internal/main.cpp b/unit/internal/main.cpp index 566ae1a54..15e41d0bf 100644 --- a/unit/internal/main.cpp +++ b/unit/internal/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,9 +27,10 @@ */ #include "gtest/gtest.h" +#include "hs_common.h" // Driver: run all the tests (defined in other source files in this directory) -int main(int argc, char **argv) { +int HS_CDECL main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/unit/internal/masked_move.cpp b/unit/internal/masked_move.cpp index 6a2d742db..7bd78c504 100644 --- a/unit/internal/masked_move.cpp +++ b/unit/internal/masked_move.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -31,11 +31,12 @@ #include #include "gtest/gtest.h" +#include "util/arch.h" #include "util/masked_move.h" namespace { -#if defined(__AVX2__) +#if defined(HAVE_AVX2) bool try_mask_len(const u8 *buf, u8 *target, size_t len) { memset(target, 0, 32); diff --git a/unit/internal/multi_bit.cpp b/unit/internal/multi_bit.cpp index 38da1d8ac..2b0c7c797 100644 --- a/unit/internal/multi_bit.cpp +++ b/unit/internal/multi_bit.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ #include "gtest/gtest.h" #include "ue2common.h" +#include "rose/rose_build_scatter.h" #include "util/compile_error.h" #include "util/make_unique.h" #include "util/multibit.h" @@ -698,7 +699,9 @@ TEST_P(MultiBitTest, InitRangeChunked) { for (u32 n = 2; n <= 10; n++) { u32 chunk_size = test_size / n; - if (chunk_size == 0) break; + if (chunk_size == 0) { + break; + } for (u32 k = 0; k < n; k++) { u32 chunk_begin = k * chunk_size; @@ -723,9 +726,62 @@ TEST_P(MultiBitTest, 
InitRangeChunked) { } } +static +void apply(const scatter_plan_raw &sp, u8 *out) { + for (const auto &e : sp.p_u64a) { + memcpy(out + e.offset, &e.val, sizeof(e.val)); + } + for (const auto &e : sp.p_u32) { + memcpy(out + e.offset, &e.val, sizeof(e.val)); + } + for (const auto &e : sp.p_u16) { + memcpy(out + e.offset, &e.val, sizeof(e.val)); + } + for (const auto &e : sp.p_u8) { + memcpy(out + e.offset, &e.val, sizeof(e.val)); + } +} + +TEST_P(MultiBitTest, InitRangePlanChunked) { + SCOPED_TRACE(test_size); + ASSERT_TRUE(ba != nullptr); + + // Init ranges chunk by chunk. + + for (u32 n = 2; n <= 10; n++) { + u32 chunk_size = test_size / n; + if (chunk_size == 0) { + break; + } + + for (u32 k = 0; k < n; k++) { + u32 chunk_begin = k * chunk_size; + u32 chunk_end = min(test_size, (k + 1) * chunk_size); + + scatter_plan_raw sp; + mmbBuildInitRangePlan(test_size, chunk_begin, chunk_end, &sp); + memset(ba, 0xaa, mmbit_size(test_size)); + apply(sp, ba); + + // First bit set should be chunk_begin. + ASSERT_EQ(chunk_begin, mmbit_iterate(ba, test_size, MMB_INVALID)); + + // All bits in the chunk should be on. + for (u64a i = chunk_begin; i < chunk_end; i += stride) { + SCOPED_TRACE(i); + ASSERT_TRUE(mmbit_isset(ba, test_size, i)); + } + + // Last bit on is chunk_end - 1. + if (chunk_end) { + ASSERT_EQ(MMB_INVALID, mmbit_iterate(ba, test_size, chunk_end - 1)); + } + } + } +} + TEST(MultiBit, SparseIteratorBegin1) { const u32 test_size = 100; - vector it; vector bits; bits.push_back(1); @@ -734,7 +790,7 @@ TEST(MultiBit, SparseIteratorBegin1) { bits.push_back(35); bits.push_back(68); - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); //ASSERT_EQ(4U, it.size()); // Trivial initial test: all bits in 'bits' are on, all others are off @@ -763,7 +819,6 @@ TEST(MultiBit, SparseIteratorBegin1) { TEST(MultiBit, SparseIteratorBegin2) { const u32 test_size = 40000; - vector it; vector bits; bits.push_back(1); @@ -773,7 +828,7 @@ TEST(MultiBit, SparseIteratorBegin2) { bits.push_back(8920); bits.push_back(37000); - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); //ASSERT_EQ(12U, it.size()); // Trivial initial test: all bits in 'bits' are on, all others are off @@ -802,7 +857,6 @@ TEST(MultiBit, SparseIteratorBegin2) { TEST(MultiBit, SparseIteratorNext1) { const u32 test_size = 100; - vector it; vector bits; bits.push_back(1); @@ -811,7 +865,7 @@ TEST(MultiBit, SparseIteratorNext1) { bits.push_back(35); bits.push_back(68); - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Trivial initial test: all bits in 'bits' are on, all others are off mmbit_holder ba(test_size); @@ -867,7 +921,6 @@ TEST(MultiBit, SparseIteratorNext1) { TEST(MultiBit, SparseIteratorNext2) { const u32 test_size = 40000; - vector it; vector bits; bits.push_back(1); @@ -882,7 +935,7 @@ TEST(MultiBit, SparseIteratorNext2) { bits.push_back(37000); bits.push_back(39999); - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Trivial initial test: all bits in 'bits' are on, all others are off mmbit_holder ba(test_size); @@ -938,7 +991,6 @@ TEST(MultiBit, SparseIteratorNext2) { TEST(MultiBit, SparseIteratorNextSmall) { const u32 test_size = 15; - vector it; vector bits; bits.push_back(1); @@ -948,7 +1000,7 @@ TEST(MultiBit, SparseIteratorNextSmall) { bits.push_back(12); bits.push_back(14); - mmbBuildSparseIterator(it, bits, test_size); + auto it = 
mmbBuildSparseIterator(bits, test_size); // Trivial initial test: all bits in 'bits' are on, all others are off mmbit_holder ba(test_size); @@ -1007,13 +1059,12 @@ TEST_P(MultiBitTest, SparseIteratorBeginAll) { ASSERT_TRUE(ba != nullptr); // Put all our bits into the sparse iterator. - vector it; vector bits; bits.reserve(test_size / stride); for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Switch all bits on in state. mmbit_clear(ba, test_size); @@ -1047,12 +1098,11 @@ TEST_P(MultiBitTest, SparseIteratorBeginThirds) { } // Put all our bits into the sparse iterator - vector it; vector bits(test_size); for (u32 i = 0; i != test_size; i++) { bits[i] = i; } - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Switch every third bits on in state mmbit_clear(ba, test_size); @@ -1082,13 +1132,12 @@ TEST_P(MultiBitTest, SparseIteratorNextAll) { ASSERT_TRUE(ba != nullptr); // Put all our bits into the sparse iterator. - vector it; vector bits; bits.reserve(test_size / stride); for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Switch all bits on in state mmbit_clear(ba, test_size); @@ -1125,14 +1174,13 @@ TEST_P(MultiBitTest, SparseIteratorNextExactStrided) { // Put all our bits into the sparse iterator and switch them on in the // state. mmbit_clear(ba, test_size); - vector it; vector bits; bits.reserve(test_size / stride); for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); mmbit_set(ba, test_size, i); } - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Iterate over all bits. vector state(mmbit_sparse_iter_state_size(test_size)); @@ -1157,13 +1205,12 @@ TEST_P(MultiBitTest, SparseIteratorNextNone) { ASSERT_TRUE(ba != nullptr); // Put all our bits into the sparse iterator. 
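The repeated `auto it = mmbBuildSparseIterator(bits, test_size);` change swaps an out-parameter for a return value; with C++11 move semantics (and copy elision) this costs nothing extra and lets the result be declared where it is initialised. The same refactor in miniature, with hypothetical names:

```cpp
#include <cstdint>
#include <vector>

struct iter_rec {
    uint32_t key;
    uint32_t offset;
};

// Old style: the caller supplies an empty vector to be filled in.
void build_iter(std::vector<iter_rec> &out, const std::vector<uint32_t> &bits);

// New style: the container is simply returned; it is moved or elided,
// never deep-copied.
std::vector<iter_rec> build_iter(const std::vector<uint32_t> &bits) {
    std::vector<iter_rec> out;
    out.reserve(bits.size());
    for (uint32_t b : bits) {
        out.push_back({b, 0}); // placeholder records
    }
    return out;
}
```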
- vector it; vector bits; bits.reserve(test_size / stride); for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Switch only the first bit on mmbit_clear(ba, test_size); @@ -1186,13 +1233,12 @@ TEST_P(MultiBitTest, SparseIteratorUnsetAll) { ASSERT_TRUE(ba != nullptr); // Put all our bits into the sparse iterator - vector it; vector bits; bits.reserve(test_size / stride); for (u64a i = 0; i < test_size; i += stride) { bits.push_back(i); } - mmbBuildSparseIterator(it, bits, test_size); + auto it = mmbBuildSparseIterator(bits, test_size); // Switch all bits on mmbit_clear(ba, test_size); @@ -1226,9 +1272,8 @@ TEST_P(MultiBitTest, SparseIteratorUnsetHalves) { odd.push_back(i); } - vector it_even, it_odd; - mmbBuildSparseIterator(it_even, even, test_size); - mmbBuildSparseIterator(it_odd, odd, test_size); + auto it_even = mmbBuildSparseIterator(even, test_size); + auto it_odd = mmbBuildSparseIterator(odd, test_size); // Switch all bits on mmbit_clear(ba, test_size); diff --git a/unit/internal/multiaccel_matcher.cpp b/unit/internal/multiaccel_matcher.cpp deleted file mode 100644 index bdf56ff91..000000000 --- a/unit/internal/multiaccel_matcher.cpp +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2015-2016, Intel Corporation - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * * Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of Intel Corporation nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - - -extern "C" { -#include "nfa/accel.h" // wrapping in extern C to make sure run_accel works -} - -#include "config.h" -#include "src/ue2common.h" - -#include "gtest/gtest.h" -#include "nfagraph/ng_limex_accel.h" -#include "nfa/accelcompile.h" -#include "nfa/multivermicelli.h" -#include "nfa/multishufti.h" -#include "nfa/multitruffle.h" -#include "util/alloc.h" -#include "util/charreach.h" - -#include -#include -#include -#include -#include - -using namespace ue2; -using namespace std; -using namespace testing; - -// test parameters structure -struct MultiaccelTestParam { - string match_pattern; - u32 match_pattern_start_idx; - u32 match_idx; - bool test_all_offsets; - u8 match_len1; - u8 match_len2; - MultibyteAccelInfo::multiaccel_type type; -}; - -// buffer size is constant -static const u32 BUF_SIZE = 200; - -// strings, out of which CharReach will be generated -static const string VERM_CR = "a"; -static const string V_NC_CR = "aA"; -static const string SHUF_CR = "abcdefghijklmnopqrstuvwxyz"; -static const string TRUF_CR = "\x11\x22\x33\x44\x55\x66\x77\x88\x99"; - -// Parameterized test case for multiaccel patterns. -class MultiaccelTest : public TestWithParam { -protected: - virtual void SetUp() { - // set up is deferred until the actual test, since we can't compile - // any accel schemes unless we know CharReach - const MultiaccelTestParam &p = GetParam(); - - // reserve space in our buffer - buffer = (u8 *)aligned_zmalloc(BUF_SIZE); - - // store the index where we expect to see the match. note that it may - // be different from where the match pattern has started since we may - // have a flooded match (i.e. a match preceded by almost-match) or a - // no-match (in which case "match" index is at the end of the buffer). - match_idx = p.match_idx; - - // make note if we need to test all offsets - sometimes we don't, for - // example when testing partial or no-match. - test_all_offsets = p.test_all_offsets; - } - - char getChar(const CharReach &cr) { - assert(cr.count() > 0); - auto dist = uniform_int_distribution(0, cr.count() - 1); - size_t result = cr.find_nth(dist(prng)); - assert(result != CharReach::npos); - return (char)result; - } - - // char generator - char getChar(const CharReach &cr, bool match) { - return getChar(match ? 
cr : ~cr); - } - - // appends a string with matches/unmatches according to input match pattern - void getMatch(u8 *result, u32 start, const string &pattern, - const CharReach &cr) { - for (const auto &c : pattern) { - result[start++] = getChar(cr, c == '1'); - } - } - - // appends non-matching noise of certain lengths - void getNoise(u8 *result, u32 start, u32 len, const CharReach &cr) { - for (unsigned i = 0; i < len; i++) { - result[start + i] = getChar(cr, false); - } - } - - // deferred buffer generation, as we don't know CharReach before we run the test - void GenerateBuffer(const CharReach &cr) { - const MultiaccelTestParam &p = GetParam(); - - // step 1: fill prefix with non-matching noise - u32 start = 0; - getNoise(buffer, start, p.match_pattern_start_idx, cr); - - // step 2: add a match - start += p.match_pattern_start_idx; - getMatch(buffer, start, p.match_pattern, cr); - - // step 3: fill in the rest of the buffer with non-matching noise - start += p.match_pattern.size(); - getNoise(buffer, start, BUF_SIZE - p.match_pattern.size() - - p.match_pattern_start_idx, cr); - } - - // deferred accel scheme generation, as we don't know CharReach before we run the test - void CompileAccelScheme(const CharReach &cr, AccelAux *aux) { - const MultiaccelTestParam &p = GetParam(); - - AccelInfo ai; - ai.single_stops = cr; // dummy CharReach to prevent red tape accel - ai.ma_len1 = p.match_len1; - ai.ma_len2 = p.match_len2; - ai.multiaccel_stops = cr; - ai.ma_type = p.type; - - buildAccelAux(ai, aux); - - // now, verify we've successfully built our accel scheme, *and* that it's - // a multibyte scheme - ASSERT_TRUE(aux->accel_type >= ACCEL_MLVERM && - aux->accel_type <= ACCEL_MDSGTRUFFLE); - } - - virtual void TearDown() { - aligned_free(buffer); - } - - // We want our tests to be deterministic, so we use a PRNG in the test - // fixture. 
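The deleted fixture's point about determinism is worth keeping in miniature: `std::mt19937` default-constructs with a fixed seed (`mt19937::default_seed`, 5489u), so "random" buffers built this way are identical on every run and failures stay reproducible. A compilable illustration:

```cpp
#include <cassert>
#include <random>

int main() {
    std::mt19937 a; // both engines start from mt19937::default_seed (5489u)
    std::mt19937 b;
    std::uniform_int_distribution<int> dist('a', 'z');

    for (int i = 0; i < 100; i++) {
        assert(dist(a) == dist(b)); // identical streams on every run
    }
    return 0;
}
```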
- mt19937 prng; - - u32 match_idx; - u8 *buffer; - bool test_all_offsets; -}; - -static -void runTest(const u8 *buffer, AccelAux *aux, unsigned match_idx, - bool test_all_offsets) { - const u8 *start = buffer; - const u8 *end = start + BUF_SIZE; - const u8 *match = start + match_idx; - - // comparing indexes into the buffer is easier to understand than pointers - if (test_all_offsets) { - // run_accel can only scan >15 byte buffers - u32 end_offset = min(match_idx, BUF_SIZE - 15); - - for (unsigned offset = 0; offset < end_offset; offset++) { - const u8 *ptr = run_accel(aux, (start + offset), end); - unsigned idx = ptr - start; - ASSERT_EQ(match_idx, idx); - } - } else { - const u8 *ptr = run_accel(aux, start, end); - unsigned idx = ptr - start; - ASSERT_EQ(match_idx, idx); - } -} - -TEST_P(MultiaccelTest, TestVermicelli) { - AccelAux aux = {0}; - CharReach cr(VERM_CR); - - GenerateBuffer(cr); - - CompileAccelScheme(cr, &aux); - - runTest(buffer, &aux, match_idx, test_all_offsets); -} - -TEST_P(MultiaccelTest, TestVermicelliNocase) { - AccelAux aux = {0}; - CharReach cr(V_NC_CR); - - GenerateBuffer(cr); - - CompileAccelScheme(cr, &aux); - - runTest(buffer, &aux, match_idx, test_all_offsets); -} - -TEST_P(MultiaccelTest, TestShufti) { - AccelAux aux = {0}; - CharReach cr(SHUF_CR); - - GenerateBuffer(cr); - - CompileAccelScheme(cr, &aux); - - runTest(buffer, &aux, match_idx, test_all_offsets); -} - -TEST_P(MultiaccelTest, TestTruffle) { - AccelAux aux = {0}; - CharReach cr(TRUF_CR); - - GenerateBuffer(cr); - - CompileAccelScheme(cr, &aux); - - runTest(buffer, &aux, match_idx, test_all_offsets); -} - -static const MultiaccelTestParam multiaccelTests[] = { - // long matcher - - // full, partial, flooded, nomatch - {"11111", 180, 180, true, 5, 0, MultibyteAccelInfo::MAT_LONG}, - {"111", 197, 197, true, 5, 0, MultibyteAccelInfo::MAT_LONG}, - {"1111011111", 177, 182, false, 5, 0, MultibyteAccelInfo::MAT_LONG}, - {"1111011110", 177, 200, false, 5, 0, MultibyteAccelInfo::MAT_LONG}, - - // long-grab matcher - - // full, partial, flooded, nomatch - {"111110", 180, 180, true, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB}, - {"111", 197, 197, true, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB}, - {"11111111110", 177, 182, false, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB}, - {"11110111101", 177, 200, false, 5, 0, MultibyteAccelInfo::MAT_LONGGRAB}, - - // shift matcher - - // full, partial, flooded, nomatch - {"11001", 180, 180, true, 4, 0, MultibyteAccelInfo::MAT_SHIFT}, - {"110", 197, 197, true, 4, 0, MultibyteAccelInfo::MAT_SHIFT}, - {"1001011001", 177, 182, false, 4, 0, MultibyteAccelInfo::MAT_SHIFT}, - {"1101001011", 177, 200, false, 4, 0, MultibyteAccelInfo::MAT_SHIFT}, - - // shift-grab matcher - - // full, partial, flooded, nomatch - {"10111", 180, 180, true, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB}, - {"101", 197, 197, true, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB}, - {"1110010111", 177, 182, false, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB}, - {"1100101100", 177, 200, false, 4, 0, MultibyteAccelInfo::MAT_SHIFTGRAB}, - - // doubleshift matcher - - // full, partial (one and two shifts), flooded, nomatch - {"110111", 180, 180, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFT}, - {"110", 197, 197, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFT}, - {"1101", 196, 196, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFT}, - {"1100100101", 178, 182, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFT}, - {"1101001101", 177, 200, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFT}, - - // doubleshift-grab matcher - - // full, partial (one and two 
shifts), flooded, nomatch - {"100101", 180, 180, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB}, - {"100", 197, 197, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB}, - {"1011", 196, 196, true, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB}, - {"11111101101", 177, 182, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB}, - {"1111110111", 177, 200, false, 3, 2, MultibyteAccelInfo::MAT_DSHIFTGRAB}, -}; - -INSTANTIATE_TEST_CASE_P(Multiaccel, MultiaccelTest, ValuesIn(multiaccelTests)); - -// boring stuff for google test -void PrintTo(const MultiaccelTestParam &p, ::std::ostream *os) { - *os << "MultiaccelTestParam: " << p.match_pattern; -} diff --git a/unit/internal/nfagraph_common.h b/unit/internal/nfagraph_common.h index d3aafc99f..ca5554c44 100644 --- a/unit/internal/nfagraph_common.h +++ b/unit/internal/nfagraph_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -40,18 +40,19 @@ namespace ue2 { // Helper function: construct a graph from an expression, flags and context. inline -std::unique_ptr constructGraphWithCC(const std::string &expr, - CompileContext &cc, - unsigned flags) { +std::unique_ptr constructGraphWithCC(const std::string &expr, + CompileContext &cc, + unsigned flags) { ReportManager rm(cc.grey); ParsedExpression parsed(0, expr.c_str(), flags, 0); - return buildWrapper(rm, cc, parsed); + auto built_expr = buildGraph(rm, cc, parsed); + return std::move(built_expr.g); } // Helper function: construct a graph from an expression and its flags. inline -std::unique_ptr constructGraph(const std::string &expr, - unsigned flags) { +std::unique_ptr constructGraph(const std::string &expr, + unsigned flags) { CompileContext cc(false, false, get_current_target(), Grey()); return constructGraphWithCC(expr, cc, flags); } diff --git a/unit/internal/nfagraph_comp.cpp b/unit/internal/nfagraph_comp.cpp index 41af3f0ca..61b05a465 100644 --- a/unit/internal/nfagraph_comp.cpp +++ b/unit/internal/nfagraph_comp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,13 +33,8 @@ #include "config.h" #include "gtest/gtest.h" #include "nfagraph_common.h" -#include "grey.h" -#include "hs.h" -#include "compiler/compiler.h" #include "nfagraph/ng.h" -#include "nfagraph/ng_builder.h" #include "nfagraph/ng_calc_components.h" -#include "util/target_info.h" using namespace std; using namespace ue2; @@ -48,7 +43,9 @@ TEST(NFAGraph, CalcComp1) { auto graph = constructGraph("abc|def|ghi", 0); ASSERT_TRUE(graph != nullptr); - deque> comps = calcComponents(*graph); + Grey grey; + grey.calcComponents = true; + auto comps = calcComponents(std::move(graph), grey); ASSERT_EQ(3, comps.size()); } @@ -56,7 +53,9 @@ TEST(NFAGraph, CalcComp2) { auto graph = constructGraph("a|b|c|d|e|f|g|h|i", 0); ASSERT_TRUE(graph != nullptr); - deque> comps = calcComponents(*graph); + Grey grey; + grey.calcComponents = true; + auto comps = calcComponents(std::move(graph), grey); // We should be identifying this as a trivial case and not splitting it. 
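The new `calcComponents(std::move(graph), grey)` calls in the tests below reflect a signature that takes the graph as a by-value `unique_ptr`: a sink parameter that consumes the caller's graph rather than borrowing it. A generic sketch of that convention (the types are placeholders, not the ue2 API):

```cpp
#include <deque>
#include <memory>

struct Graph {};

// Sink parameter: the callee takes ownership, so the caller must move.
std::deque<std::unique_ptr<Graph>> split_components(std::unique_ptr<Graph> g) {
    std::deque<std::unique_ptr<Graph>> comps;
    comps.push_back(std::move(g)); // trivially one component here
    return comps;
}

void demo() {
    std::unique_ptr<Graph> g(new Graph);
    auto comps = split_components(std::move(g));
    // g is now null; the graph lives on inside comps.
}
```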
ASSERT_EQ(1, comps.size()); @@ -67,7 +66,9 @@ TEST(NFAGraph, RecalcComp1) { comps.push_back(constructGraph("abc|def|ghi", 0)); ASSERT_TRUE(comps.back() != nullptr); - recalcComponents(comps); + Grey grey; + grey.calcComponents = true; + recalcComponents(comps, grey); ASSERT_EQ(3, comps.size()); } diff --git a/unit/internal/nfagraph_equivalence.cpp b/unit/internal/nfagraph_equivalence.cpp index 8fda92231..73aec1d7e 100644 --- a/unit/internal/nfagraph_equivalence.cpp +++ b/unit/internal/nfagraph_equivalence.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -54,7 +54,7 @@ TEST(NFAGraph, RemoveEquivalence1) { // The graph should be merged into: a(b|c) CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("(ab|ac)", cc, 0)); + auto graph(constructGraphWithCC("(ab|ac)", cc, 0)); ASSERT_TRUE(graph != nullptr); NGHolder &g = *graph; g.kind = NFA_SUFFIX; @@ -115,7 +115,7 @@ TEST(NFAGraph, RemoveEquivalence2) { // The graph should be merged into: (b|c)a CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("(ba|ca)", cc, 0)); + auto graph(constructGraphWithCC("(ba|ca)", cc, 0)); ASSERT_TRUE(graph != nullptr); NGHolder &g = *graph; g.kind = NFA_SUFFIX; @@ -176,8 +176,7 @@ TEST(NFAGraph, RemoveEquivalence3) { // The graph should be merged into: a(..)+(X|Y) CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("a(..)+X|a(..)+Y", cc, - HS_FLAG_DOTALL)); + auto graph(constructGraphWithCC("a(..)+X|a(..)+Y", cc, HS_FLAG_DOTALL)); ASSERT_TRUE(graph != nullptr); NGHolder &g = *graph; g.kind = NFA_SUFFIX; @@ -266,8 +265,7 @@ TEST(NFAGraph, RemoveEquivalence4) { // The graph should be merged into: (X|Y)(..)+a CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("X(..)+a|Y(..)+a", cc, - HS_FLAG_DOTALL)); + auto graph(constructGraphWithCC("X(..)+a|Y(..)+a", cc, HS_FLAG_DOTALL)); ASSERT_TRUE(graph != nullptr); NGHolder &g = *graph; g.kind = NFA_SUFFIX; @@ -363,8 +361,7 @@ TEST(NFAGraph, RemoveEquivalence5) { // The graph should be merged into: [^\x00]*[\x00] CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("[^\\x00][^\\x00]*[\\x00]", - cc, 0)); + auto graph(constructGraphWithCC("[^\\x00][^\\x00]*[\\x00]", cc, 0)); ASSERT_TRUE(graph != nullptr); NGHolder &g = *graph; g.kind = NFA_PREFIX; @@ -420,7 +417,7 @@ TEST(NFAGraph, RemoveEquivalence5) { TEST(NFAGraph, RemoveEquivalence6) { // Build a small graph with two redundant vertices: ^(.*|.*)a // The graph should be merged into: a - unique_ptr graph(constructGraph("^(.*|.*)a", HS_FLAG_DOTALL)); + auto graph(constructGraph("^(.*|.*)a", HS_FLAG_DOTALL)); ASSERT_TRUE(graph != nullptr); NGHolder &g = *graph; @@ -458,7 +455,7 @@ TEST(NFAGraph, RemoveEquivalence6) { TEST(NFAGraph, RemoveEquivalence7) { // Build a small graph with no redundant vertices: ^.+a // Make sure we don't merge anything - unique_ptr graph(constructGraph("^.+a", HS_FLAG_DOTALL)); + auto graph(constructGraph("^.+a", HS_FLAG_DOTALL)); ASSERT_TRUE(graph != nullptr); NGHolder &g = *graph; diff --git a/unit/internal/nfagraph_find_matches.cpp b/unit/internal/nfagraph_find_matches.cpp index 553d6dc54..cd0cd796e 100644 --- 
a/unit/internal/nfagraph_find_matches.cpp +++ b/unit/internal/nfagraph_find_matches.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -76,7 +76,7 @@ class MatchesTest: public TestWithParam { static const MatchesTestParams matchesTests[] = { // EOD and anchored patterns - // these should produce no matches + // these should produce no matches { "^foobar", "foolish", {}, 0, false, true}, { "^foobar$", "ze foobar", {}, 0, false, true}, { "^foobar$", "foobar ", {}, 0, false, true}, @@ -208,14 +208,25 @@ TEST_P(MatchesTest, Check) { CompileContext cc(false, false, get_current_target(), Grey()); ReportManager rm(cc.grey); ParsedExpression parsed(0, t.pattern.c_str(), t.flags, 0); - auto g = buildWrapper(rm, cc, parsed); + auto built_expr = buildGraph(rm, cc, parsed); + const auto &g = built_expr.g; bool utf8 = (t.flags & HS_FLAG_UTF8) > 0; set> matches; - findMatches(*g, rm, t.input, matches, t.notEod, t.som, utf8); + bool success = findMatches(*g, rm, t.input, matches, 0, t.notEod, utf8); + ASSERT_TRUE(success); set> expected(begin(t.matches), end(t.matches)); + // findMatches returns matches with SOM, so zero them out if not SOM + if (!t.som) { + set> new_matches; + for (auto &m : matches) { + new_matches.emplace(0, m.second); + } + matches.swap(new_matches); + } + ASSERT_EQ(expected, matches) << "Pattern '" << t.pattern << "' against input '" << t.input << "'"; } diff --git a/unit/internal/nfagraph_redundancy.cpp b/unit/internal/nfagraph_redundancy.cpp index be9527fd3..c77045e02 100644 --- a/unit/internal/nfagraph_redundancy.cpp +++ b/unit/internal/nfagraph_redundancy.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -53,7 +53,7 @@ TEST(NFAGraph, RemoveRedundancy1) { // The character reachability should be merged into: [ab]c CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("(a|b)c", cc, 0)); + auto graph(constructGraphWithCC("(a|b)c", cc, 0)); ASSERT_TRUE(graph.get() != nullptr); NGHolder &g = *graph; @@ -95,8 +95,7 @@ TEST(NFAGraph, RemoveRedundancy2) { // Build a small graph with a redundant vertex: a.*b?c // The dot-star should swallow the 'b?', leaving a.*c CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("a.*b?c", cc, - HS_FLAG_DOTALL)); + auto graph(constructGraphWithCC("a.*b?c", cc, HS_FLAG_DOTALL)); ASSERT_TRUE(graph.get() != nullptr); NGHolder &g = *graph; @@ -152,8 +151,7 @@ TEST(NFAGraph, RemoveRedundancy2) { TEST(NFAGraph, RemoveRedundancy3) { CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("foobar.*(a|b)?teakettle", - cc, 0)); + auto graph(constructGraphWithCC("foobar.*(a|b)?teakettle", cc, 0)); ASSERT_TRUE(graph.get() != nullptr); unsigned countBefore = num_vertices(*graph); @@ -166,7 +164,7 @@ TEST(NFAGraph, RemoveRedundancy3) { TEST(NFAGraph, RemoveRedundancy4) { CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("foo([A-Z]|a|b|q)", cc, 0)); + auto graph(constructGraphWithCC("foo([A-Z]|a|b|q)", cc, 0)); ASSERT_TRUE(graph.get() != nullptr); 
unsigned countBefore = num_vertices(*graph); @@ -178,8 +176,7 @@ TEST(NFAGraph, RemoveRedundancy4) { TEST(NFAGraph, RemoveRedundancy5) { CompileContext cc(false, false, get_current_target(), Grey()); - unique_ptr graph(constructGraphWithCC("[0-9]?badgerbrush", - cc, 0)); + auto graph(constructGraphWithCC("[0-9]?badgerbrush", cc, 0)); ASSERT_TRUE(graph.get() != nullptr); unsigned countBefore = num_vertices(*graph); diff --git a/unit/internal/nfagraph_repeat.cpp b/unit/internal/nfagraph_repeat.cpp index b34d12717..941873ece 100644 --- a/unit/internal/nfagraph_repeat.cpp +++ b/unit/internal/nfagraph_repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -66,28 +66,28 @@ struct PureRepeatTest { class NFAPureRepeatTest : public TestWithParam { }; static const PureRepeatTest pureRepeatTests[] = { - { "^.*", 0, depth::infinity() }, - { "^.+", 1, depth::infinity() }, - { "^.", 1, 1 }, - { "^..", 2, 2 }, - { "^.?.", 1, 2 }, - { "^.{1,2}", 1, 2 }, - { "^.{1,3}", 1, 3 }, - { "^.{1,10}", 1, 10 }, - { "^.{1,200}", 1, 200 }, - { "^.{200}", 200, 200 }, - { "^.{0,}", 0, depth::infinity() }, - { "^.{1,}", 1, depth::infinity() }, - { "^.{2,}", 2, depth::infinity() }, - { "^.{10,}", 10, depth::infinity() }, - { "^.{200,}", 200, depth::infinity() }, - { "^.{5000,}", 5000, depth::infinity() }, - { "^.{0,1}", 0, 1 }, - { "^.{0,2}", 0, 2 }, - { "^.{0,100}", 0, 100 }, - { "^.{0,5000}", 0, 5000 }, - { "^x{10}x{20,30}", 30, 40 }, - { "^..?..?..?..?..?", 5, 10 } + { "^.*", depth(0), depth::infinity() }, + { "^.+", depth(1), depth::infinity() }, + { "^.", depth(1), depth(1) }, + { "^..", depth(2), depth(2) }, + { "^.?.", depth(1), depth(2) }, + { "^.{1,2}", depth(1), depth(2) }, + { "^.{1,3}", depth(1), depth(3) }, + { "^.{1,10}", depth(1), depth(10) }, + { "^.{1,200}", depth(1), depth(200) }, + { "^.{200}", depth(200), depth(200) }, + { "^.{0,}", depth(0), depth::infinity() }, + { "^.{1,}", depth(1), depth::infinity() }, + { "^.{2,}", depth(2), depth::infinity() }, + { "^.{10,}", depth(10), depth::infinity() }, + { "^.{200,}", depth(200), depth::infinity() }, + { "^.{5000,}", depth(5000), depth::infinity() }, + { "^.{0,1}", depth(0), depth(1) }, + { "^.{0,2}", depth(0), depth(2) }, + { "^.{0,100}", depth(0), depth(100) }, + { "^.{0,5000}", depth(0), depth(5000) }, + { "^x{10}x{20,30}", depth(30), depth(40) }, + { "^..?..?..?..?..?", depth(5), depth(10) } }; INSTANTIATE_TEST_CASE_P(PureRepeat, NFAPureRepeatTest, diff --git a/unit/internal/nfagraph_width.cpp b/unit/internal/nfagraph_width.cpp index 03508ea84..7ccdca37f 100644 --- a/unit/internal/nfagraph_width.cpp +++ b/unit/internal/nfagraph_width.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -52,26 +52,26 @@ struct WidthTest { class NFAWidthTest : public TestWithParam { }; static const WidthTest widthTests[] = { - { "()", 0, 0 }, - { "a", 1, 1 }, - { "a?b", 1, 2 }, - { "foobar", 6, 6 }, - { "foo(bar)?", 3, 6 }, - { "(a|ab|abc|abcd)", 1, 4 }, - { "foo.*bar", 6, depth::infinity() }, - { "foo(bar)*", 3, depth::infinity() }, - { "foo(bar)+", 6, depth::infinity() }, - { "foo(bar){1,3}", 6, 12 }, - { "(abcd)+", 4, depth::infinity() }, - { 
"foo\\z", 3, 3 }, - { "^foo", 3, 3 }, - { "^foo|bar.*baz", 3, depth::infinity() }, - { "^foobar.*|baz", 3, depth::infinity() }, - { "foo(\\z|bar)", 3, 6 }, - { "foo(|bar\\z)", 3, 6 }, - { "foo.{0,15}bar", 6, 21 }, - { "foo.{0,15}.*bar", 6, depth::infinity() }, - { "(?smi)^(aa[^a]aa$|a|a+\\Z|a)", 1, depth::infinity() } + { "()", depth(0), depth(0) }, + { "a", depth(1), depth(1) }, + { "a?b", depth(1), depth(2) }, + { "foobar", depth(6), depth(6) }, + { "foo(bar)?", depth(3), depth(6) }, + { "(a|ab|abc|abcd)", depth(1), depth(4) }, + { "foo.*bar", depth(6), depth::infinity() }, + { "foo(bar)*", depth(3), depth::infinity() }, + { "foo(bar)+", depth(6), depth::infinity() }, + { "foo(bar){1,3}", depth(6), depth(12) }, + { "(abcd)+", depth(4), depth::infinity() }, + { "foo\\z", depth(3), depth(3) }, + { "^foo", depth(3), depth(3) }, + { "^foo|bar.*baz", depth(3), depth::infinity() }, + { "^foobar.*|baz", depth(3), depth::infinity() }, + { "foo(\\z|bar)", depth(3), depth(6) }, + { "foo(|bar\\z)", depth(3), depth(6) }, + { "foo.{0,15}bar", depth(6), depth(21) }, + { "foo.{0,15}.*bar", depth(6), depth::infinity() }, + { "(?smi)^(aa[^a]aa$|a|a+\\Z|a)", depth(1), depth::infinity() } }; INSTANTIATE_TEST_CASE_P(NFAWidth, NFAWidthTest, ValuesIn(widthTests)); @@ -79,10 +79,10 @@ INSTANTIATE_TEST_CASE_P(NFAWidth, NFAWidthTest, ValuesIn(widthTests)); TEST_P(NFAWidthTest, Check) { const WidthTest &t = GetParam(); SCOPED_TRACE(testing::Message() << "Pattern: " << t.pattern); - unique_ptr w(constructGraph(t.pattern, 0)); + auto g = constructGraph(t.pattern, 0); - ASSERT_EQ(t.minWidth, findMinWidth(*w)); - ASSERT_EQ(t.maxWidth, findMaxWidth(*w)); + ASSERT_EQ(t.minWidth, findMinWidth(*g)); + ASSERT_EQ(t.maxWidth, findMaxWidth(*g)); } // for google test diff --git a/unit/internal/repeat.cpp b/unit/internal/repeat.cpp index 7f245e62f..546d7d4f8 100644 --- a/unit/internal/repeat.cpp +++ b/unit/internal/repeat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -106,96 +106,96 @@ class RepeatTest : public TestWithParam { static const RepeatTestInfo repeatTests[] = { // Fixed repeats -- ring model - { REPEAT_RING, 2, 2 }, - { REPEAT_RING, 4, 4 }, - { REPEAT_RING, 10, 10 }, - { REPEAT_RING, 16, 16 }, - { REPEAT_RING, 20, 20 }, - { REPEAT_RING, 30, 30 }, - { REPEAT_RING, 50, 50 }, - { REPEAT_RING, 64, 64 }, - { REPEAT_RING, 65, 65 }, - { REPEAT_RING, 100, 100 }, - { REPEAT_RING, 200, 200 }, - { REPEAT_RING, 1000, 1000 }, - { REPEAT_RING, 4100, 4100 }, - { REPEAT_RING, 16000, 16000 }, + { REPEAT_RING, depth(2), depth(2) }, + { REPEAT_RING, depth(4), depth(4) }, + { REPEAT_RING, depth(10), depth(10) }, + { REPEAT_RING, depth(16), depth(16) }, + { REPEAT_RING, depth(20), depth(20) }, + { REPEAT_RING, depth(30), depth(30) }, + { REPEAT_RING, depth(50), depth(50) }, + { REPEAT_RING, depth(64), depth(64) }, + { REPEAT_RING, depth(65), depth(65) }, + { REPEAT_RING, depth(100), depth(100) }, + { REPEAT_RING, depth(200), depth(200) }, + { REPEAT_RING, depth(1000), depth(1000) }, + { REPEAT_RING, depth(4100), depth(4100) }, + { REPEAT_RING, depth(16000), depth(16000) }, // {0, N} repeats -- last model - { REPEAT_LAST, 0, 4 }, - { REPEAT_LAST, 0, 10 }, - { REPEAT_LAST, 0, 20 }, - { REPEAT_LAST, 0, 30 }, - { REPEAT_LAST, 0, 50 }, - { REPEAT_LAST, 0, 100 }, - { REPEAT_LAST, 0, 200 }, - { REPEAT_LAST, 0, 1000 }, - { 
REPEAT_LAST, 0, 16000 }, + { REPEAT_LAST, depth(0), depth(4) }, + { REPEAT_LAST, depth(0), depth(10) }, + { REPEAT_LAST, depth(0), depth(20) }, + { REPEAT_LAST, depth(0), depth(30) }, + { REPEAT_LAST, depth(0), depth(50) }, + { REPEAT_LAST, depth(0), depth(100) }, + { REPEAT_LAST, depth(0), depth(200) }, + { REPEAT_LAST, depth(0), depth(1000) }, + { REPEAT_LAST, depth(0), depth(16000) }, // {0, N} repeats -- ring model (though we use 'last' model in practice) - { REPEAT_RING, 0, 2 }, - { REPEAT_RING, 0, 4 }, - { REPEAT_RING, 0, 10 }, - { REPEAT_RING, 0, 20 }, - { REPEAT_RING, 0, 30 }, - { REPEAT_RING, 0, 50 }, - { REPEAT_RING, 0, 64 }, - { REPEAT_RING, 0, 65 }, - { REPEAT_RING, 0, 100 }, - { REPEAT_RING, 0, 200 }, - { REPEAT_RING, 0, 1000 }, - { REPEAT_RING, 0, 16000 }, + { REPEAT_RING, depth(0), depth(2) }, + { REPEAT_RING, depth(0), depth(4) }, + { REPEAT_RING, depth(0), depth(10) }, + { REPEAT_RING, depth(0), depth(20) }, + { REPEAT_RING, depth(0), depth(30) }, + { REPEAT_RING, depth(0), depth(50) }, + { REPEAT_RING, depth(0), depth(64) }, + { REPEAT_RING, depth(0), depth(65) }, + { REPEAT_RING, depth(0), depth(100) }, + { REPEAT_RING, depth(0), depth(200) }, + { REPEAT_RING, depth(0), depth(1000) }, + { REPEAT_RING, depth(0), depth(16000) }, // {N, M} repeats -- ring model - { REPEAT_RING, 2, 3 }, - { REPEAT_RING, 1, 4 }, - { REPEAT_RING, 5, 10 }, - { REPEAT_RING, 10, 20 }, - { REPEAT_RING, 10, 50 }, - { REPEAT_RING, 50, 60 }, - { REPEAT_RING, 100, 200 }, - { REPEAT_RING, 1, 200 }, - { REPEAT_RING, 10, 16000 }, - { REPEAT_RING, 10000, 16000 }, + { REPEAT_RING, depth(2), depth(3) }, + { REPEAT_RING, depth(1), depth(4) }, + { REPEAT_RING, depth(5), depth(10) }, + { REPEAT_RING, depth(10), depth(20) }, + { REPEAT_RING, depth(10), depth(50) }, + { REPEAT_RING, depth(50), depth(60) }, + { REPEAT_RING, depth(100), depth(200) }, + { REPEAT_RING, depth(1), depth(200) }, + { REPEAT_RING, depth(10), depth(16000) }, + { REPEAT_RING, depth(10000), depth(16000) }, // {N, M} repeats -- range model - { REPEAT_RANGE, 1, 4 }, - { REPEAT_RANGE, 5, 10 }, - { REPEAT_RANGE, 10, 20 }, - { REPEAT_RANGE, 10, 50 }, - { REPEAT_RANGE, 50, 60 }, - { REPEAT_RANGE, 100, 200 }, - { REPEAT_RANGE, 1, 200 }, - { REPEAT_RANGE, 10, 16000 }, - { REPEAT_RANGE, 10000, 16000 }, + { REPEAT_RANGE, depth(1), depth(4) }, + { REPEAT_RANGE, depth(5), depth(10) }, + { REPEAT_RANGE, depth(10), depth(20) }, + { REPEAT_RANGE, depth(10), depth(50) }, + { REPEAT_RANGE, depth(50), depth(60) }, + { REPEAT_RANGE, depth(100), depth(200) }, + { REPEAT_RANGE, depth(1), depth(200) }, + { REPEAT_RANGE, depth(10), depth(16000) }, + { REPEAT_RANGE, depth(10000), depth(16000) }, // {N,M} repeats -- small bitmap model - { REPEAT_BITMAP, 1, 2 }, - { REPEAT_BITMAP, 5, 10 }, - { REPEAT_BITMAP, 10, 20 }, - { REPEAT_BITMAP, 20, 40 }, - { REPEAT_BITMAP, 1, 63 }, - { REPEAT_BITMAP, 50, 63 }, + { REPEAT_BITMAP, depth(1), depth(2) }, + { REPEAT_BITMAP, depth(5), depth(10) }, + { REPEAT_BITMAP, depth(10), depth(20) }, + { REPEAT_BITMAP, depth(20), depth(40) }, + { REPEAT_BITMAP, depth(1), depth(63) }, + { REPEAT_BITMAP, depth(50), depth(63) }, // {N,M} repeats -- trailer model - { REPEAT_TRAILER, 1, 2 }, - { REPEAT_TRAILER, 8, 8 }, - { REPEAT_TRAILER, 0, 8 }, - { REPEAT_TRAILER, 10, 20 }, - { REPEAT_TRAILER, 1, 32 }, - { REPEAT_TRAILER, 64, 64 }, - { REPEAT_TRAILER, 1, 64 }, - { REPEAT_TRAILER, 1, 100 }, - { REPEAT_TRAILER, 1, 2000 }, - { REPEAT_TRAILER, 50, 200 }, - { REPEAT_TRAILER, 50, 1000 }, - { REPEAT_TRAILER, 64, 1024 }, + { REPEAT_TRAILER, depth(1), 
depth(2) }, + { REPEAT_TRAILER, depth(8), depth(8) }, + { REPEAT_TRAILER, depth(0), depth(8) }, + { REPEAT_TRAILER, depth(10), depth(20) }, + { REPEAT_TRAILER, depth(1), depth(32) }, + { REPEAT_TRAILER, depth(64), depth(64) }, + { REPEAT_TRAILER, depth(1), depth(64) }, + { REPEAT_TRAILER, depth(1), depth(100) }, + { REPEAT_TRAILER, depth(1), depth(2000) }, + { REPEAT_TRAILER, depth(50), depth(200) }, + { REPEAT_TRAILER, depth(50), depth(1000) }, + { REPEAT_TRAILER, depth(64), depth(1024) }, // {N,} repeats -- first model - { REPEAT_FIRST, 0, depth::infinity() }, - { REPEAT_FIRST, 1, depth::infinity() }, - { REPEAT_FIRST, 4, depth::infinity() }, - { REPEAT_FIRST, 10, depth::infinity() }, - { REPEAT_FIRST, 50, depth::infinity() }, - { REPEAT_FIRST, 100, depth::infinity() }, - { REPEAT_FIRST, 1000, depth::infinity() }, - { REPEAT_FIRST, 3000, depth::infinity() }, - { REPEAT_FIRST, 10000, depth::infinity() }, + { REPEAT_FIRST, depth(0), depth::infinity() }, + { REPEAT_FIRST, depth(1), depth::infinity() }, + { REPEAT_FIRST, depth(4), depth::infinity() }, + { REPEAT_FIRST, depth(10), depth::infinity() }, + { REPEAT_FIRST, depth(50), depth::infinity() }, + { REPEAT_FIRST, depth(100), depth::infinity() }, + { REPEAT_FIRST, depth(1000), depth::infinity() }, + { REPEAT_FIRST, depth(3000), depth::infinity() }, + { REPEAT_FIRST, depth(10000), depth::infinity() }, // {,} repeats -- always - { REPEAT_ALWAYS, 0, depth::infinity() }, + { REPEAT_ALWAYS, depth(0), depth::infinity() }, }; INSTANTIATE_TEST_CASE_P(Repeat, RepeatTest, ValuesIn(repeatTests)); @@ -508,55 +508,55 @@ const u32 sparsePeriods[] = { static const RepeatTestInfo sparseRepeats[] = { // Fixed repeats - { REPEAT_SPARSE_OPTIMAL_P, 10, 10 }, - { REPEAT_SPARSE_OPTIMAL_P, 20, 20 }, - { REPEAT_SPARSE_OPTIMAL_P, 40, 40 }, - { REPEAT_SPARSE_OPTIMAL_P, 80, 80 }, - { REPEAT_SPARSE_OPTIMAL_P, 100, 100 }, - { REPEAT_SPARSE_OPTIMAL_P, 150, 150 }, - { REPEAT_SPARSE_OPTIMAL_P, 200, 200 }, - { REPEAT_SPARSE_OPTIMAL_P, 250, 250 }, - { REPEAT_SPARSE_OPTIMAL_P, 300, 300 }, - { REPEAT_SPARSE_OPTIMAL_P, 350, 350 }, - { REPEAT_SPARSE_OPTIMAL_P, 400, 400 }, - { REPEAT_SPARSE_OPTIMAL_P, 500, 500 }, - { REPEAT_SPARSE_OPTIMAL_P, 600, 600 }, - { REPEAT_SPARSE_OPTIMAL_P, 800, 800 }, - { REPEAT_SPARSE_OPTIMAL_P, 1000, 1000 }, - { REPEAT_SPARSE_OPTIMAL_P, 1500, 1500 }, - { REPEAT_SPARSE_OPTIMAL_P, 2000, 2000 }, - { REPEAT_SPARSE_OPTIMAL_P, 2500, 2500 }, - { REPEAT_SPARSE_OPTIMAL_P, 3000, 3000 }, - { REPEAT_SPARSE_OPTIMAL_P, 3500, 3500 }, - { REPEAT_SPARSE_OPTIMAL_P, 4000, 4000 }, - { REPEAT_SPARSE_OPTIMAL_P, 4500, 4500 }, - { REPEAT_SPARSE_OPTIMAL_P, 5000, 5000 }, - { REPEAT_SPARSE_OPTIMAL_P, 65534, 65534 }, + { REPEAT_SPARSE_OPTIMAL_P, depth(10), depth(10) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(20), depth(20) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(40), depth(40) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(80), depth(80) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(100), depth(100) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(150), depth(150) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(200), depth(200) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(250), depth(250) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(300), depth(300) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(350), depth(350) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(400), depth(400) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(500), depth(500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(600), depth(600) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(800), depth(800) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(1000), depth(1000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(1500), depth(1500) }, 
+ { REPEAT_SPARSE_OPTIMAL_P, depth(2000), depth(2000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(2500), depth(2500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(3000), depth(3000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(3500), depth(3500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(4000), depth(4000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(4500), depth(4500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(5000), depth(5000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(65534), depth(65534) }, // {N, M} repeats - { REPEAT_SPARSE_OPTIMAL_P, 10, 20 }, - { REPEAT_SPARSE_OPTIMAL_P, 20, 40 }, - { REPEAT_SPARSE_OPTIMAL_P, 40, 80 }, - { REPEAT_SPARSE_OPTIMAL_P, 80, 100 }, - { REPEAT_SPARSE_OPTIMAL_P, 100, 120 }, - { REPEAT_SPARSE_OPTIMAL_P, 150, 180 }, - { REPEAT_SPARSE_OPTIMAL_P, 200, 400 }, - { REPEAT_SPARSE_OPTIMAL_P, 250, 500 }, - { REPEAT_SPARSE_OPTIMAL_P, 300, 400 }, - { REPEAT_SPARSE_OPTIMAL_P, 350, 500 }, - { REPEAT_SPARSE_OPTIMAL_P, 400, 500 }, - { REPEAT_SPARSE_OPTIMAL_P, 500, 600 }, - { REPEAT_SPARSE_OPTIMAL_P, 600, 700 }, - { REPEAT_SPARSE_OPTIMAL_P, 800, 1000 }, - { REPEAT_SPARSE_OPTIMAL_P, 1000, 1200 }, - { REPEAT_SPARSE_OPTIMAL_P, 1500, 1800 }, - { REPEAT_SPARSE_OPTIMAL_P, 2000, 4000 }, - { REPEAT_SPARSE_OPTIMAL_P, 2500, 3000 }, - { REPEAT_SPARSE_OPTIMAL_P, 3000, 3500 }, - { REPEAT_SPARSE_OPTIMAL_P, 3500, 4000 }, - { REPEAT_SPARSE_OPTIMAL_P, 4000, 8000 }, - { REPEAT_SPARSE_OPTIMAL_P, 4500, 8000 }, - { REPEAT_SPARSE_OPTIMAL_P, 5000, 5001 }, - { REPEAT_SPARSE_OPTIMAL_P, 60000, 65534 } + { REPEAT_SPARSE_OPTIMAL_P, depth(10), depth(20) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(20), depth(40) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(40), depth(80) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(80), depth(100) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(100), depth(120) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(150), depth(180) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(200), depth(400) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(250), depth(500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(300), depth(400) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(350), depth(500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(400), depth(500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(500), depth(600) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(600), depth(700) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(800), depth(1000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(1000), depth(1200) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(1500), depth(1800) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(2000), depth(4000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(2500), depth(3000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(3000), depth(3500) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(3500), depth(4000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(4000), depth(8000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(4500), depth(8000) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(5000), depth(5001) }, + { REPEAT_SPARSE_OPTIMAL_P, depth(60000), depth(65534) } }; static diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp index a4632c36c..b2316babd 100644 --- a/unit/internal/shuffle.cpp +++ b/unit/internal/shuffle.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ #include "gtest/gtest.h" +#include "util/arch.h" #include "util/simd_utils.h" #include "nfa/limex_shuffle.h" @@ -164,14 +165,15 @@ TEST(Shuffle, PackedExtract64_3) { template static void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) { - static_assert(sizeof(T) == 
sizeof(m128) || sizeof(T) == sizeof(m256), + static_assert(sizeof(T) == sizeof(m128) || sizeof(T) == sizeof(m256) || + sizeof(T) == sizeof(m512), "should be valid type"); // permute mask has 0x80 in all bytes except the one we care about memset(permute, 0x80, sizeof(*permute)); memset(compare, 0, sizeof(*compare)); char *pmsk = (char *)permute; char *cmsk = (char *)compare; - u8 off = (bit >= 128) ? 0x10 : 0; + u8 off = (bit >= 128) ? (bit >= 256) ? (bit >= 384) ? 0x30 : 0x20 : 0x10 : 0; pmsk[off] = bit/8; cmsk[off] = ~(1 << (bit % 8)); } @@ -194,7 +196,7 @@ TEST(Shuffle, PackedExtract128_1) { } } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) TEST(Shuffle, PackedExtract256_1) { // Try all possible one-bit masks for (unsigned int i = 0; i < 256; i++) { @@ -213,4 +215,24 @@ TEST(Shuffle, PackedExtract256_1) { } } #endif + +#if defined(HAVE_AVX512) +TEST(Shuffle, PackedExtract512_1) { + // Try all possible one-bit masks + for (unsigned int i = 0; i < 512; i++) { + // shuffle a single 1 bit to the front + m512 permute, compare; + build_pshufb_masks_onebit(i, &permute, &compare); + EXPECT_EQ(1U, packedExtract512(setbit(i), permute, compare)); + EXPECT_EQ(1U, packedExtract512(ones512(), permute, compare)); + // we should get zero out of these cases + EXPECT_EQ(0U, packedExtract512(zeroes512(), permute, compare)); + EXPECT_EQ(0U, packedExtract512(not512(setbit(i)), permute, compare)); + // we should get zero out of all the other bit positions + for (unsigned int j = 0; (j != i && j < 512); j++) { + EXPECT_EQ(0U, packedExtract512(setbit(j), permute, compare)); + } + } +} +#endif } // namespace diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 7b34d92e2..623c2c998 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015-2016, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +29,8 @@ #include "config.h" #include "gtest/gtest.h" -#include "util/alloc.h" +#include "util/arch.h" +#include "util/bytecode_ptr.h" #include "util/make_unique.h" #include "util/simd_utils.h" @@ -142,6 +143,10 @@ void simd_loadbytes(m128 *a, const void *ptr, unsigned i) { *a = loadbytes128(pt void simd_loadbytes(m256 *a, const void *ptr, unsigned i) { *a = loadbytes256(ptr, i); } void simd_loadbytes(m384 *a, const void *ptr, unsigned i) { *a = loadbytes384(ptr, i); } void simd_loadbytes(m512 *a, const void *ptr, unsigned i) { *a = loadbytes512(ptr, i); } +m128 simd_lshift64(const m128 &a, unsigned i) { return lshift64_m128(a, i); } +m256 simd_lshift64(const m256 &a, unsigned i) { return lshift64_m256(a, i); } +m384 simd_lshift64(const m384 &a, unsigned i) { return lshift64_m384(a, i); } +m512 simd_lshift64(const m512 &a, unsigned i) { return lshift64_m512(a, i); } template class SimdUtilsTest : public testing::Test { @@ -539,8 +544,9 @@ TYPED_TEST(SimdUtilsTest, load_store) { a.bytes[i] = (char)(i % 256); } - aligned_unique_ptr mem_ptr = aligned_zmalloc_unique(sizeof(a)); + auto mem_ptr = make_bytecode_ptr(sizeof(a), alignof(TypeParam)); char *mem = mem_ptr.get(); + ASSERT_EQ(0, (size_t)mem % 16U); memset(mem, 0, sizeof(a)); @@ -584,6 +590,65 @@ TYPED_TEST(SimdUtilsTest, loadbytes_storebytes) { } } +TYPED_TEST(SimdUtilsTest, lshift64) { + TypeParam a; + memset(&a, 0x5a, sizeof(a)); + + static constexpr u64a exp_val = 0x5a5a5a5a5a5a5a5aULL; + + union { + TypeParam 
simd; + u64a qword[sizeof(TypeParam) / 8]; + } c; + + for (unsigned s = 0; s < 64; s++) { + c.simd = simd_lshift64(a, s); + + const u64a expected = exp_val << s; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + } + + /* Clang 3.4 on FreeBSD 10 crashes on the following - disable for now */ +#if !(defined(__FreeBSD__) && defined(__clang__) && __clang_major__ == 3) + + // test immediates + u64a expected; + + c.simd = simd_lshift64(a, 1); + expected = exp_val << 1; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + + c.simd = simd_lshift64(a, 2); + expected = exp_val << 2; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + + c.simd = simd_lshift64(a, 7); + expected = exp_val << 7; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } + + c.simd = simd_lshift64(a, 31); + expected = exp_val << 31; + for (size_t i = 0; i < sizeof(c) / 8; i++) { + EXPECT_EQ(expected, c.qword[i]); + } +#endif +} + +TEST(SimdUtilsTest, alignment) { + ASSERT_EQ(16, alignof(m128)); + ASSERT_EQ(32, alignof(m256)); + ASSERT_EQ(16, alignof(m384)); + ASSERT_EQ(64, alignof(m512)); +} + TEST(SimdUtilsTest, movq) { m128 simd; @@ -620,7 +685,7 @@ TEST(SimdUtilsTest, set4x32) { ASSERT_EQ(0, memcmp(cmp, &simd, sizeof(simd))); } -#if defined(__AVX2__) +#if defined(HAVE_AVX2) TEST(SimdUtilsTest, set32x8) { char cmp[sizeof(m256)]; diff --git a/unit/internal/utf8_validate.cpp b/unit/internal/utf8_validate.cpp index 6649e6fe9..f570e6b02 100644 --- a/unit/internal/utf8_validate.cpp +++ b/unit/internal/utf8_validate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -117,6 +117,6 @@ INSTANTIATE_TEST_CASE_P(ValidUtf8, ValidUtf8Test, ValuesIn(valid_utf8_tests)); TEST_P(ValidUtf8Test, check) { const auto &info = GetParam(); - ASSERT_EQ(info.is_valid, isValidUtf8(info.str.c_str())) - << "String is: " << printable(info.str) << std::endl; + SCOPED_TRACE(testing::Message() << "String is: " << printable(info.str)); + ASSERT_EQ(info.is_valid, isValidUtf8(info.str.c_str())); } diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt index c0a6bc212..ea942ef1a 100644 --- a/util/CMakeLists.txt +++ b/util/CMakeLists.txt @@ -2,7 +2,7 @@ CHECK_FUNCTION_EXISTS(mmap HAVE_MMAP) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CXX_FLAGS} ${HS_CXX_FLAGS}") include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${PROJECT_SOURCE_DIR}) diff --git a/util/ExpressionParser.h b/util/ExpressionParser.h index 992304484..c97c114e7 100644 --- a/util/ExpressionParser.h +++ b/util/ExpressionParser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,12 +29,14 @@ #ifndef EXPRESSIONPARSER_H #define EXPRESSIONPARSER_H +#include "hs_common.h" + #include struct hs_expr_ext; -bool readExpression(const std::string &line, std::string &expr, - unsigned int *flags, hs_expr_ext *ext, - bool *must_be_ordered = nullptr); +bool HS_CDECL readExpression(const std::string &line, std::string &expr, + unsigned int *flags, hs_expr_ext *ext, + bool 
*must_be_ordered = nullptr); #endif diff --git a/util/ExpressionParser.rl b/util/ExpressionParser.rl index 98ed8daa9..233b70c18 100644 --- a/util/ExpressionParser.rl +++ b/util/ExpressionParser.rl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -48,7 +48,8 @@ enum ParamKey { PARAM_NONE, PARAM_MIN_OFFSET, PARAM_MAX_OFFSET, - PARAM_MIN_LENGTH + PARAM_MIN_LENGTH, + PARAM_EDIT_DISTANCE }; %%{ @@ -92,6 +93,10 @@ enum ParamKey { ext->flags |= HS_EXT_FLAG_MIN_LENGTH; ext->min_length = num; break; + case PARAM_EDIT_DISTANCE: + ext->flags |= HS_EXT_FLAG_EDIT_DISTANCE; + ext->edit_distance = num; + break; case PARAM_NONE: default: // No key specified, syntax invalid. @@ -110,9 +115,9 @@ void initExt(hs_expr_ext *ext) { ext->max_offset = MAX_OFFSET; } -bool readExpression(const std::string &input, std::string &expr, - unsigned int *flags, hs_expr_ext *ext, - bool *must_be_ordered) { +bool HS_CDECL readExpression(const std::string &input, std::string &expr, + unsigned int *flags, hs_expr_ext *ext, + bool *must_be_ordered) { assert(flags); assert(ext); @@ -151,8 +156,9 @@ bool readExpression(const std::string &input, std::string &expr, %%{ single_flag = [ismW8HPLVO]; param = ('min_offset' @{ key = PARAM_MIN_OFFSET; } | - 'max_offset' @{ key = PARAM_MAX_OFFSET; } | - 'min_length' @{ key = PARAM_MIN_LENGTH; } ); + 'max_offset' @{ key = PARAM_MAX_OFFSET; } | + 'min_length' @{ key = PARAM_MIN_LENGTH; } | + 'edit_distance' @{ key = PARAM_EDIT_DISTANCE; }); value = (digit @accumulateNum)+ >{num = 0;}; param_spec = (' '* param '=' value ' '*) >{ key = PARAM_NONE; } diff --git a/util/expressions.cpp b/util/expressions.cpp index 944c74772..a81e0cd58 100644 --- a/util/expressions.cpp +++ b/util/expressions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Intel Corporation + * Copyright (c) 2015-2017, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,6 +27,10 @@ */ #include "config.h" +#include "expressions.h" + +#include "hs.h" +#include "string_util.h" #include #include @@ -34,7 +38,6 @@ #include #include -#include #include #include #if !defined(_WIN32) @@ -45,9 +48,7 @@ #include #endif -#include "expressions.h" -#include "hs.h" -#include "string_util.h" +#include using namespace std; @@ -90,7 +91,7 @@ void processLine(string &line, unsigned lineNum, //cout << "Inserting expr: id=" << id << ", pcre=" << pcre_str << endl; - bool ins = exprMap.insert(ExpressionMap::value_type(id, pcre_str)).second; + bool ins = exprMap.emplace(id, pcre_str).second; if (!ins) { failLine(lineNum, file, line, "Duplicate ID found."); } @@ -101,7 +102,7 @@ void processLine(string &line, unsigned lineNum, #define S_ISDIR(st_m) (_S_IFDIR & (st_m)) #define S_ISREG(st_m) (_S_IFREG & (st_m)) #endif -void loadExpressionsFromFile(const string &fname, ExpressionMap &exprMap) { +void HS_CDECL loadExpressionsFromFile(const string &fname, ExpressionMap &exprMap) { struct stat st; if (stat(fname.c_str(), &st) != 0) { return; @@ -194,7 +195,7 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) { } } #else // windows TODO: improve -void loadExpressions(const string &inPath, ExpressionMap &exprMap) { +void HS_CDECL loadExpressions(const string &inPath, ExpressionMap &exprMap) { // Is our input path a 
file or a directory?
     struct stat st;
     if (stat(inPath.c_str(), &st) != 0) {
@@ -250,8 +251,8 @@ void loadExpressions(const string &inPath, ExpressionMap &exprMap) {
 }
 #endif
 
-void loadSignatureList(const string &inFile,
-                       SignatureSet &signatures) {
+void HS_CDECL loadSignatureList(const string &inFile,
+                                SignatureSet &signatures) {
     ifstream f(inFile.c_str());
     if (!f.good()) {
         cerr << "Can't open file: '" << inFile << "'" << endl;
@@ -278,20 +279,19 @@ void loadSignatureList(const string &inFile,
     }
 }
 
-void limitBySignature(ExpressionMap &exprMap,
-                      const SignatureSet &signatures) {
+ExpressionMap limitToSignatures(const ExpressionMap &exprMap,
+                                const SignatureSet &signatures) {
     ExpressionMap keepers;
-    SignatureSet::const_iterator it, ite;
-    for (it = signatures.begin(), ite = signatures.end(); it != ite; ++it) {
-        ExpressionMap::const_iterator match = exprMap.find(*it);
+    for (auto id : signatures) {
+        auto match = exprMap.find(id);
         if (match == exprMap.end()) {
-            cerr << "Unable to find signature " << *it
+            cerr << "Unable to find signature " << id
                  << " in expression set!" << endl;
             exit(1);
         }
         keepers.insert(*match);
     }
-    exprMap.swap(keepers);
+    return keepers;
 }
diff --git a/util/expressions.h b/util/expressions.h
index 949c9201f..078b99722 100644
--- a/util/expressions.h
+++ b/util/expressions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -29,27 +29,26 @@
 #ifndef EXPRESSIONS_H
 #define EXPRESSIONS_H
 
+#include "hs_common.h"
+
 #include <map>
 #include <string>
-#include <list>
+#include <vector>
 
-typedef std::map<unsigned, std::string> ExpressionMap;
-typedef std::list<unsigned> SignatureSet;
+using ExpressionMap = std::map<unsigned, std::string>;
+using SignatureSet = std::vector<unsigned>;
 
 // load all of the expressions from the given directory into the given
 // expression map. Exits on failure.
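 //
 // for illustration, a hypothetical driver would compose these helpers as
 // follows (the file names here are made up):
 //
 //     ExpressionMap exprs;
 //     loadExpressions("patterns/", exprs);
 //     SignatureSet sigs;
 //     loadSignatureList("sigids.txt", sigs);
 //     exprs = limitToSignatures(exprs, sigs);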
-void loadExpressions(const std::string &inDir, ExpressionMap &exprMap);
+void HS_CDECL loadExpressions(const std::string &inDir, ExpressionMap &exprMap);
 
-void loadExpressionsFromFile(const std::string &fname, ExpressionMap &exprMap);
+void HS_CDECL loadExpressionsFromFile(const std::string &fname, ExpressionMap &exprMap);
 
 // load a list of signature IDs
-void loadSignatureList(const std::string &inFile, SignatureSet &signatures);
+void HS_CDECL loadSignatureList(const std::string &inFile, SignatureSet &signatures);
 
-// produce a new expression map only containing those signatures in the
-// expression list
-void generateExprMap(const SignatureSet &signatures,
-                     const ExpressionMap &allExprs, ExpressionMap &out);
+// trim expression map to only the given signatures, returning result
+ExpressionMap limitToSignatures(const ExpressionMap &exprMap,
+                                const SignatureSet &signatures);
 
-// trim expression map to only the given signatures (in-place)
-void limitBySignature(ExpressionMap &exprMap, const SignatureSet &signatures);
 
 #endif
diff --git a/util/ng_corpus_generator.cpp b/util/ng_corpus_generator.cpp
index ca7c413ab..19ab7edf2 100644
--- a/util/ng_corpus_generator.cpp
+++ b/util/ng_corpus_generator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -35,6 +35,7 @@
 #include "ng_corpus_generator.h"
 
 #include "ng_corpus_editor.h"
+#include "compiler/compiler.h"
 #include "nfagraph/ng.h"
 #include "nfagraph/ng_util.h"
 #include "ue2common.h"
@@ -48,15 +49,15 @@
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <random>
 #include <set>
 #include <string>
-#include <boost/ptr_container/ptr_vector.hpp>
+#include <vector>
 
 using namespace std;
 using namespace ue2;
-using boost::ptr_vector;
 
 typedef vector<NFAVertex> VertexPath;
 
@@ -139,8 +140,8 @@ void findPaths(const NGHolder &g, CorpusProperties &cProps,
     // limit will evict a random existing one.
     const size_t MAX_OPEN = min((size_t)1000, corpusLimit * 10);
 
-    ptr_vector<VertexPath> open;
-    open.push_back(new VertexPath(1, g.start));
+    vector<unique_ptr<VertexPath>> open;
+    open.push_back(ue2::make_unique<VertexPath>(1, g.start));
 
     ue2::unordered_set<NFAVertex> one_way_in;
     for (const auto &v : vertices_range(g)) {
@@ -152,7 +153,8 @@
     while (!open.empty()) {
         u32 slot = cProps.rand(0, open.size() - 1);
         swap(open.at(slot), open.back());
-        ptr_vector<VertexPath>::auto_type p = open.pop_back();
+        auto p = std::move(open.back());
+        open.pop_back();
         NFAVertex u = p->back();
 
         DEBUG_PRINTF("dequeuing path %s, back %zu\n",
@@ -194,19 +196,19 @@
             // If we've got no further adjacent vertices, re-use p rather than
             // copying it for the next path.
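             // (ownership: below, the final successor takes over p itself via
             // std::move(), standing in for ptr_vector's release(); earlier
             // successors get their own deep copy of the path.)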
-            VertexPath *new_path;
+            unique_ptr<VertexPath> new_path;
             if (boost::next(ai) == ae) {
-                new_path = p.release();
+                new_path = std::move(p);
             } else {
-                new_path = new VertexPath(*p);
+                new_path = make_unique<VertexPath>(*p);
             }
             new_path->push_back(v);
             if (open.size() < MAX_OPEN) {
-                open.push_back(new_path);
+                open.push_back(std::move(new_path));
             } else {
                 u32 victim = cProps.rand(0, open.size() - 1);
-                open.replace(victim, new_path);
+                open[victim] = std::move(new_path);
             }
         }
     }
@@ -218,8 +220,9 @@ namespace {
 /** \brief Concrete implementation */
 class CorpusGeneratorImpl : public CorpusGenerator {
 public:
-    CorpusGeneratorImpl(const NGHolder &graph_in, CorpusProperties &props);
-    ~CorpusGeneratorImpl() {}
+    CorpusGeneratorImpl(const NGHolder &graph_in, const ExpressionInfo &expr_in,
+                        CorpusProperties &props);
+    ~CorpusGeneratorImpl() = default;
 
     void generateCorpus(vector<string> &data);
 
@@ -236,6 +239,9 @@ class CorpusGeneratorImpl : public CorpusGenerator {
      * bytes in length. */
     void addRandom(const min_max &mm, string *out);
 
+    /** \brief Info about this expression. */
+    const ExpressionInfo &expr;
+
     /** \brief The NFA graph we operate over. */
     const NGHolder &graph;
 
@@ -245,9 +251,13 @@
 CorpusGeneratorImpl::CorpusGeneratorImpl(const NGHolder &graph_in,
+                                         const ExpressionInfo &expr_in,
                                          CorpusProperties &props)
-    : graph(graph_in), cProps(props) {
-    // empty
+    : expr(expr_in), graph(graph_in), cProps(props) {
+    // if this pattern is to be matched approximately
+    if (expr.edit_distance && !props.editDistance) {
+        props.editDistance = props.rand(0, expr.edit_distance + 1);
+    }
 }
 
 void CorpusGeneratorImpl::generateCorpus(vector<string> &data) {
@@ -388,8 +398,9 @@ void CorpusGeneratorImpl::newGenerator(vector<string> &outdata) {
 
 /** \brief Concrete implementation for UTF-8 */
 class CorpusGeneratorUtf8 : public CorpusGenerator {
 public:
-    CorpusGeneratorUtf8(const NGHolder &graph_in, CorpusProperties &props);
-    ~CorpusGeneratorUtf8() {}
+    CorpusGeneratorUtf8(const NGHolder &graph_in, const ExpressionInfo &expr_in,
+                        CorpusProperties &props);
+    ~CorpusGeneratorUtf8() = default;
 
     void generateCorpus(vector<string> &data);
 
@@ -406,6 +417,9 @@ class CorpusGeneratorUtf8 : public CorpusGenerator {
      * length. */
     void addRandom(const min_max &mm, vector<unichar> *out);
 
+    /** \brief Info about this expression. */
+    const ExpressionInfo &expr;
+
     /** \brief The NFA graph we operate over. */
     const NGHolder &graph;
 
@@ -415,9 +429,14 @@
 CorpusGeneratorUtf8::CorpusGeneratorUtf8(const NGHolder &graph_in,
+                                         const ExpressionInfo &expr_in,
                                          CorpusProperties &props)
-    : graph(graph_in), cProps(props) {
-    // empty
+    : expr(expr_in), graph(graph_in), cProps(props) {
+    // we do not support UTF-8 for approximate matching
+    if (expr.edit_distance) {
+        throw CorpusGenerationFailure("UTF-8 for edited patterns is not "
+                                      "supported.");
+    }
 }
 
 void CorpusGeneratorUtf8::generateCorpus(vector<string> &data) {
@@ -673,11 +692,12 @@ CorpusGenerator::~CorpusGenerator() {
 }
 
 // External entry point
-unique_ptr<CorpusGenerator> makeCorpusGenerator(const NGWrapper &graph,
+unique_ptr<CorpusGenerator> makeCorpusGenerator(const NGHolder &graph,
+                                                const ExpressionInfo &expr,
                                                 CorpusProperties &props) {
-    if (graph.utf8) {
-        return ue2::make_unique<CorpusGeneratorUtf8>(graph, props);
+    if (expr.utf8) {
+        return ue2::make_unique<CorpusGeneratorUtf8>(graph, expr, props);
     } else {
-        return ue2::make_unique<CorpusGeneratorImpl>(graph, props);
+        return ue2::make_unique<CorpusGeneratorImpl>(graph, expr, props);
     }
 }
diff --git a/util/ng_corpus_generator.h b/util/ng_corpus_generator.h
index a7445ab64..f230a10d0 100644
--- a/util/ng_corpus_generator.h
+++ b/util/ng_corpus_generator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -41,10 +41,17 @@
 namespace ue2 {
 
-class NGWrapper;
+class ExpressionInfo;
+class NGHolder;
 
 } // namespace ue2
 
+struct CorpusGenerationFailure {
+    explicit CorpusGenerationFailure(std::string s) :
+        message(std::move(s)) {}
+    std::string message;
+};
+
 /** \brief Abstract interface to corpus generator tool. */
 class CorpusGenerator {
 public:
@@ -62,6 +69,7 @@ class CorpusGenerator {
 
 /** \brief Build a concrete impl conforming to the \ref CorpusGenerator
 * interface. */
 std::unique_ptr<CorpusGenerator>
-makeCorpusGenerator(const ue2::NGWrapper &graph, CorpusProperties &props);
+makeCorpusGenerator(const ue2::NGHolder &g, const ue2::ExpressionInfo &expr,
+                    CorpusProperties &props);
 
 #endif
diff --git a/util/ng_find_matches.cpp b/util/ng_find_matches.cpp
index 2b3373653..0a1f796f0 100644
--- a/util/ng_find_matches.cpp
+++ b/util/ng_find_matches.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -47,55 +47,744 @@
 using namespace std;
 using namespace ue2;
 
+using MatchSet = set<pair<size_t, size_t>>;
+using StateBitSet = boost::dynamic_bitset<>;
+
 namespace {
 
+/** \brief Max number of states (taking edit distance into account). */
+static constexpr size_t STATE_COUNT_MAX = 15000;
+
+// returns all successors up to a given depth in a vector of sets, indexed by
+// zero-based depth from source vertex
+static
+vector<flat_set<NFAVertex>>
+gatherSuccessorsByDepth(const NGHolder &g, const NFAVertex &src, u32 depth) {
+    assert(depth > 0);
+
+    vector<flat_set<NFAVertex>> result(depth);
+
+    // populate current set of successors
+    for (auto v : adjacent_vertices_range(src, g)) {
+        // ignore self-loops
+        if (src == v) {
+            continue;
+        }
+        DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
+        result[0].insert(v);
+    }
+
+    for (u32 d = 1; d < depth; d++) {
+        // collect all successors for all current level vertices
+        const auto &cur = result[d - 1];
+        auto &next = result[d];
+        for (auto u : cur) {
+            // don't go past special nodes
+            if (is_special(u, g)) {
+                continue;
+            }
+
+            for (auto v : adjacent_vertices_range(u, g)) {
+                // ignore self-loops
+                if (u == v) {
+                    continue;
+                }
+                DEBUG_PRINTF("Node %zu depth %u\n", g[v].index, d + 1);
+                next.insert(v);
+            }
+        }
+    }
+
+    return result;
+}
+
+// returns all predecessors up to a given depth in a vector of sets, indexed by
+// zero-based depth from source vertex
+static
+vector<flat_set<NFAVertex>>
+gatherPredecessorsByDepth(const NGHolder &g, NFAVertex src, u32 depth) {
+    assert(depth > 0);
+
+    vector<flat_set<NFAVertex>> result(depth);
+
+    // populate current set of predecessors
+    for (auto v : inv_adjacent_vertices_range(src, g)) {
+        // ignore self-loops
+        if (src == v) {
+            continue;
+        }
+        DEBUG_PRINTF("Node %zu depth 1\n", g[v].index);
+        result[0].insert(v);
+    }
+
+    for (u32 d = 1; d < depth; d++) {
+        // collect all predecessors of all current level vertices
+        const auto &cur = result[d - 1];
+        auto &next = result[d];
+        for (auto v : cur) {
+            for (auto u : inv_adjacent_vertices_range(v, g)) {
+                // ignore self-loops
+                if (v == u) {
+                    continue;
+                }
+                DEBUG_PRINTF("Node %zu depth %u\n", g[u].index, d + 1);
+                next.insert(u);
+            }
+        }
+    }
+
+    return result;
+}
+
+// this is a per-vertex, per-shadow level state transition table
+struct GraphCache {
+    GraphCache(u32 dist_in, const NGHolder &g) :
+        size(num_vertices(g)), edit_distance(dist_in)
+    {
+        auto dist_max = edit_distance + 1;
+
+        allocateStateTransitionTable(dist_max);
+        populateTransitionCache(g, dist_max);
+        populateAcceptCache(g, dist_max);
+    }
+
+    void allocateStateTransitionTable(u32 dist_max) {
+        // resize level 1 - per vertex
+        shadow_transitions.resize(size);
+        helper_transitions.resize(size);
+
+        // resize level 2 - per shadow level
+        for (u32 i = 0; i < size; i++) {
+            shadow_transitions[i].resize(dist_max);
+            helper_transitions[i].resize(dist_max);
+
+            // resize level 3 - per vertex
+            for (u32 d = 0; d < dist_max; d++) {
+                shadow_transitions[i][d].resize(size);
+                helper_transitions[i][d].resize(size);
+            }
+        }
+
+        // accept states are indexed by edit distance
+        accept_states.resize(dist_max);
+        accept_eod_states.resize(dist_max);
+
+        // vertex report maps are indexed by edit distance
+        vertex_reports_by_level.resize(dist_max);
+        vertex_eod_reports_by_level.resize(dist_max);
+    }
+
+    /*
+     * certain transitions to helpers are disallowed:
+     * 1. transitions from accept/acceptEod
+     * 2. transitions to accept/acceptEod
+     * 3. from start to startDs
+     * 4. to a virtual/multiline start
+     *
+     * everything else is allowed.
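+     *
+     * (as populated below, moving one shadow level deeper always costs one
+     * edit: helper transitions model inserted or replaced characters, while
+     * cross-level shadow transitions model removals.)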
+ */ + bool canTransitionToHelper(NFAVertex u, NFAVertex v, const NGHolder &g) const { + if (is_any_accept(u, g)) { + return false; + } + if (is_any_accept(v, g)) { + return false; + } + if (u == g.start && v == g.startDs) { + return false; + } + if (is_virtual_start(v, g)) { + return false; + } + return true; + } + + void populateTransitionCache(const NGHolder &g, u32 dist_max) { + // populate mapping of vertex index to vertex + vector idx_to_v(size); + for (auto v : vertices_range(g)) { + idx_to_v[g[v].index] = v; + } + + for (u32 i = 0; i < size; i++) { + auto cur_v = idx_to_v[i]; + + // set up transition tables + auto succs = gatherSuccessorsByDepth(g, cur_v, dist_max); + + assert(succs.size() == dist_max); + + for (u32 d = 0; d < dist_max; d++) { + auto &v_shadows = shadow_transitions[i][d]; + auto cur_v_bit = i; + + // enable transition to next level helper (this handles insertion) + if (d < edit_distance && !is_any_accept(cur_v, g)) { + auto &next_v_helpers = helper_transitions[i][d + 1]; + + next_v_helpers.set(cur_v_bit); + } + + // if vertex has a self-loop, we can also transition to it, + // but only if we're at shadow level 0 + if (edge(cur_v, cur_v, g).second && d == 0) { + v_shadows.set(cur_v_bit); + } + + // populate state transition tables + for (auto v : succs[d]) { + auto v_bit = g[v].index; + + // we cannot transition to startDs on any level other than + // level 0 + if (v != g.startDs || d == 0) { + // this handles direct transitions as well as removals + v_shadows.set(v_bit); + } + + // we can also transition to next-level helper (handles + // replace), provided we meet the criteria + if (d < edit_distance && canTransitionToHelper(cur_v, v, g)) { + auto &next_v_helpers = helper_transitions[i][d + 1]; + + next_v_helpers.set(v_bit); + } + } + } + } + } + + void populateAcceptCache(const NGHolder &g, u32 dist_max) { + // set up accept states masks + StateBitSet accept(size); + accept.set(g[g.accept].index); + StateBitSet accept_eod(size); + accept_eod.set(g[g.acceptEod].index); + + // gather accept and acceptEod states + for (u32 base_dist = 0; base_dist < dist_max; base_dist++) { + auto &states = accept_states[base_dist]; + auto &eod_states = accept_eod_states[base_dist]; + + states.resize(size); + eod_states.resize(size); + + // inspect each vertex + for (u32 i = 0; i < size; i++) { + // inspect all shadow levels from base_dist to dist_max + for (u32 d = 0; d < dist_max - base_dist; d++) { + auto &shadows = shadow_transitions[i][d]; + + // if this state transitions to accept, set its bit + if ((shadows & accept).any()) { + states.set(i); + } + if ((shadows & accept_eod).any()) { + eod_states.set(i); + } + } + } + } + + // populate accepts cache + for (auto v : inv_adjacent_vertices_range(g.accept, g)) { + const auto &rs = g[v].reports; + + for (u32 d = 0; d <= edit_distance; d++) { + // add self to report list at all levels + vertex_reports_by_level[d][v].insert(rs.begin(), rs.end()); + } + if (edit_distance == 0) { + // if edit distance is 0, no predecessors will have reports + continue; + } + + auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); + for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { + const auto &preds = preds_by_depth[pd]; + // for each predecessor, add reports up to maximum edit distance + // for current depth from source vertex + for (auto pred : preds) { + for (u32 d = 0; d < edit_distance - pd; d++) { + vertex_reports_by_level[d][pred].insert(rs.begin(), rs.end()); + } + } + } + } + for (auto v : 
inv_adjacent_vertices_range(g.acceptEod, g)) { + const auto &rs = g[v].reports; + + if (v == g.accept) { + continue; + } + + for (u32 d = 0; d <= edit_distance; d++) { + // add self to report list at all levels + vertex_eod_reports_by_level[d][v].insert(rs.begin(), rs.end()); + } + if (edit_distance == 0) { + // if edit distance is 0, no predecessors will have reports + continue; + } + + auto preds_by_depth = gatherPredecessorsByDepth(g, v, edit_distance); + for (u32 pd = 0; pd < preds_by_depth.size(); pd++) { + const auto &preds = preds_by_depth[pd]; + // for each predecessor, add reports up to maximum edit distance + // for current depth from source vertex + for (auto pred : preds) { + for (u32 d = 0; d < edit_distance - pd; d++) { + vertex_eod_reports_by_level[d][pred].insert(rs.begin(), rs.end()); + } + } + } + } + } + +#ifdef DEBUG + void dumpStateTransitionTable(const NGHolder &g) { + StateBitSet accept(size); + accept.set(g[g.accept].index); + StateBitSet accept_eod(size); + accept_eod.set(g[g.acceptEod].index); + + DEBUG_PRINTF("Dumping state transition tables\n"); + DEBUG_PRINTF("Shadows:\n"); + for (u32 i = 0; i < num_vertices(g); i++) { + DEBUG_PRINTF("%-7s %3u:", "Vertex", i); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getShadowTransitions(i, d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + } + + DEBUG_PRINTF("Helpers:\n"); + for (u32 i = 0; i < num_vertices(g); i++) { + DEBUG_PRINTF("%-7s %3u:", "Vertex", i); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getHelperTransitions(i, d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + } + + DEBUG_PRINTF("Accept transitions:\n"); + DEBUG_PRINTF("%-12s", "Vertex idx:"); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getAcceptTransitions(d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + + DEBUG_PRINTF("Accept EOD transitions:\n"); + DEBUG_PRINTF("%-12s", "Vertex idx:"); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", j); + } + printf("\n"); + for (u32 d = 0; d <= edit_distance; d++) { + DEBUG_PRINTF("%-7s %3u:", "Level", d); + const auto &s = getAcceptEodTransitions(d); + for (u32 j = 0; j < num_vertices(g); j++) { + printf("%3i", s.test(j)); + } + printf("\n"); + } + DEBUG_PRINTF("\n"); + + DEBUG_PRINTF("%-12s ", "Accepts:"); + for (u32 i = 0; i < num_vertices(g); i++) { + printf("%3i", accept.test(i)); + } + printf("\n"); + + DEBUG_PRINTF("%-12s ", "EOD Accepts:"); + for (u32 i = 0; i < num_vertices(g); i++) { + printf("%3i", accept_eod.test(i)); + } + printf("\n"); + + DEBUG_PRINTF("Reports\n"); + for (auto v : vertices_range(g)) { + for (u32 d = 0; d <= edit_distance; d++) { + const auto &r = vertex_reports_by_level[d][v]; + const auto &e = vertex_eod_reports_by_level[d][v]; + DEBUG_PRINTF("%-7s %3zu %-8s %3zu %-8s %3zu\n", + "Vertex", g[v].index, "rs:", r.size(), "eod:", e.size()); + } + } + printf("\n"); + } +#endif + + const StateBitSet& getShadowTransitions(u32 idx, 
u32 level) const {
+        assert(idx < size);
+        assert(level <= edit_distance);
+        return shadow_transitions[idx][level];
+    }
+    const StateBitSet& getHelperTransitions(u32 idx, u32 level) const {
+        assert(idx < size);
+        assert(level <= edit_distance);
+        return helper_transitions[idx][level];
+    }
+    const StateBitSet& getAcceptTransitions(u32 level) const {
+        assert(level <= edit_distance);
+        return accept_states[level];
+    }
+    const StateBitSet& getAcceptEodTransitions(u32 level) const {
+        assert(level <= edit_distance);
+        return accept_eod_states[level];
+    }
+
+    /*
+     * the bitsets are indexed by vertex and shadow level. each bitset's
+     * length is equal to the total number of vertices in the graph.
+     *
+     * for convenience, helper functions are provided.
+     */
+    vector<vector<StateBitSet>> shadow_transitions;
+    vector<vector<StateBitSet>> helper_transitions;
+
+    // accept states masks, indexed by shadow level
+    vector<StateBitSet> accept_states;
+    vector<StateBitSet> accept_eod_states;
+
+    // map of all reports associated with any vertex, indexed by shadow level
+    vector<map<NFAVertex, flat_set<ReportID>>> vertex_reports_by_level;
+    vector<map<NFAVertex, flat_set<ReportID>>> vertex_eod_reports_by_level;
+
+    u32 size;
+    u32 edit_distance;
+};
+
+
+/*
+ * SOM workflow is expected to be the following:
+ * - Caller calls getActiveStates, which reports SOM for each active state
+ * - Caller calls getSuccessors on each of the active states, which *doesn't*
+ *   report SOM
+ * - Caller decides if the successor state should be activated, and calls
+ *   activateState with SOM set to that of the previous active state (not the
+ *   successor!)
+ * - activateState then resolves any conflicts between SOMs that may arise
+ *   from multiple active states progressing to the same successor
+ */
 struct StateSet {
-    explicit StateSet(size_t sz) : s(sz), som(sz, 0) {}
-    boost::dynamic_bitset<> s; // bitset of states that are on
-    vector<size_t> som; // som value for each state
+    struct State {
+        enum node_type {
+            NODE_SHADOW = 0,
+            NODE_HELPER
+        };
+        State(size_t idx_in, u32 level_in, size_t som_in, node_type type_in) :
+            idx(idx_in), level(level_in), som(som_in), type(type_in) {}
+        size_t idx;
+        u32 level;
+        size_t som;
+        node_type type;
+    };
+
+    // Temporary working data used for step() which we want to keep around
+    // (rather than reallocating vectors all the time).
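+    // active is refilled via getActiveStates() once per scanned byte, and
+    // succ_list is refilled via getSuccessors() for each active state in
+    // turn; see step() below.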
+ struct WorkingData { + vector active; + vector succ_list; + }; + + StateSet(size_t sz, u32 dist_in) : + shadows(dist_in + 1), helpers(dist_in + 1), + shadows_som(dist_in + 1), helpers_som(dist_in + 1), + edit_distance(dist_in) { + for (u32 dist = 0; dist <= dist_in; dist++) { + shadows[dist].resize(sz, false); + helpers[dist].resize(sz, false); + shadows_som[dist].resize(sz, 0); + helpers_som[dist].resize(sz, 0); + } + } + + void reset() { + for (u32 dist = 0; dist <= edit_distance; dist++) { + shadows[dist].reset(); + helpers[dist].reset(); + fill(shadows_som[dist].begin(), shadows_som[dist].end(), 0); + fill(helpers_som[dist].begin(), helpers_som[dist].end(), 0); + } + } + + bool empty() const { + for (u32 dist = 0; dist <= edit_distance; dist++) { + if (shadows[dist].any()) { + return false; + } + if (helpers[dist].any()) { + return false; + } + } + return true; + } + + size_t count() const { + size_t result = 0; + + for (u32 dist = 0; dist <= edit_distance; dist++) { + result += shadows[dist].count(); + result += helpers[dist].count(); + } + + return result; + } + + bool setActive(const State &s) { + switch (s.type) { + case State::NODE_HELPER: + return helpers[s.level].test_set(s.idx); + case State::NODE_SHADOW: + return shadows[s.level].test_set(s.idx); + } + assert(0); + return false; + } + + size_t getCachedSom(const State &s) const { + switch (s.type) { + case State::NODE_HELPER: + return helpers_som[s.level][s.idx]; + case State::NODE_SHADOW: + return shadows_som[s.level][s.idx]; + } + assert(0); + return 0; + } + + void setCachedSom(const State &s, const size_t som_val) { + switch (s.type) { + case State::NODE_HELPER: + helpers_som[s.level][s.idx] = som_val; + break; + case State::NODE_SHADOW: + shadows_som[s.level][s.idx] = som_val; + break; + default: + assert(0); + } + } + +#ifdef DEBUG + void dumpActiveStates() const { + vector states; + getActiveStates(states); + + DEBUG_PRINTF("Dumping active states\n"); + + for (const auto &state : states) { + DEBUG_PRINTF("type: %s idx: %zu level: %u som: %zu\n", + state.type == State::NODE_HELPER ? 
"HELPER" : "SHADOW", + state.idx, state.level, state.som); + } + } +#endif + + void getActiveStates(vector &result) const { + result.clear(); + + for (u32 dist = 0; dist <= edit_distance; dist++) { + // get all shadow vertices (including original graph) + const auto &cur_shadow_vertices = shadows[dist]; + for (size_t id = cur_shadow_vertices.find_first(); + id != cur_shadow_vertices.npos; + id = cur_shadow_vertices.find_next(id)) { + result.emplace_back(id, dist, shadows_som[dist][id], + State::NODE_SHADOW); + } + + // the rest is only valid for edited graphs + if (dist == 0) { + continue; + } + + // get all helper vertices + const auto &cur_helper_vertices = helpers[dist]; + for (size_t id = cur_helper_vertices.find_first(); + id != cur_helper_vertices.npos; + id = cur_helper_vertices.find_next(id)) { + result.emplace_back(id, dist, helpers_som[dist][id], + State::NODE_HELPER); + } + } + + sort_and_unique(result); + } + + // does not return SOM + void getSuccessors(const State &state, const GraphCache &gc, + vector &result) const { + result.clear(); + + // maximum shadow depth that we can go from current level + u32 max_depth = edit_distance - state.level + 1; + + for (u32 d = 0; d < max_depth; d++) { + const auto &shadow_succ = gc.getShadowTransitions(state.idx, d); + for (size_t id = shadow_succ.find_first(); + id != shadow_succ.npos; + id = shadow_succ.find_next(id)) { + auto new_level = state.level + d; + result.emplace_back(id, new_level, 0, State::NODE_SHADOW); + } + + const auto &helper_succ = gc.getHelperTransitions(state.idx, d); + for (size_t id = helper_succ.find_first(); + id != helper_succ.npos; + id = helper_succ.find_next(id)) { + auto new_level = state.level + d; + result.emplace_back(id, new_level, 0, State::NODE_HELPER); + } + } + + sort_and_unique(result); + } + + void getAcceptStates(const GraphCache &gc, vector &result) const { + result.clear(); + + for (u32 dist = 0; dist <= edit_distance; dist++) { + // get all shadow vertices (including original graph) + auto cur_shadow_vertices = shadows[dist]; + cur_shadow_vertices &= gc.getAcceptTransitions(dist); + for (size_t id = cur_shadow_vertices.find_first(); + id != cur_shadow_vertices.npos; + id = cur_shadow_vertices.find_next(id)) { + result.emplace_back(id, dist, shadows_som[dist][id], + State::NODE_SHADOW); + } + auto cur_helper_vertices = helpers[dist]; + cur_helper_vertices &= gc.getAcceptTransitions(dist); + for (size_t id = cur_helper_vertices.find_first(); + id != cur_helper_vertices.npos; + id = cur_helper_vertices.find_next(id)) { + result.emplace_back(id, dist, helpers_som[dist][id], + State::NODE_HELPER); + } + } + + sort_and_unique(result); + } + + void getAcceptEodStates(const GraphCache &gc, vector &result) const { + result.clear(); + + for (u32 dist = 0; dist <= edit_distance; dist++) { + // get all shadow vertices (including original graph) + auto cur_shadow_vertices = shadows[dist]; + cur_shadow_vertices &= gc.getAcceptEodTransitions(dist); + for (size_t id = cur_shadow_vertices.find_first(); + id != cur_shadow_vertices.npos; + id = cur_shadow_vertices.find_next(id)) { + result.emplace_back(id, dist, shadows_som[dist][id], + State::NODE_SHADOW); + } + auto cur_helper_vertices = helpers[dist]; + cur_helper_vertices &= gc.getAcceptEodTransitions(dist); + for (size_t id = cur_helper_vertices.find_first(); + id != cur_helper_vertices.npos; + id = cur_helper_vertices.find_next(id)) { + result.emplace_back(id, dist, helpers_som[dist][id], + State::NODE_HELPER); + } + } + + sort_and_unique(result); + } + + // 
the caller must specify SOM at current offset, and must not attempt to + // resolve SOM inheritance conflicts + void activateState(const State &state) { + size_t cur_som = state.som; + if (setActive(state)) { + size_t cached_som = getCachedSom(state); + cur_som = min(cur_som, cached_som); + } + setCachedSom(state, cur_som); + } + + vector shadows; + vector helpers; + vector> shadows_som; + vector> helpers_som; + u32 edit_distance; }; -using MatchSet = set>; +// for flat_set +bool operator<(const StateSet::State &a, const StateSet::State &b) { + ORDER_CHECK(idx); + ORDER_CHECK(level); + ORDER_CHECK(type); + ORDER_CHECK(som); + return false; +} + +bool operator==(const StateSet::State &a, const StateSet::State &b) { + return a.idx == b.idx && a.level == b.level && a.type == b.type && + a.som == b.som; +} struct fmstate { const size_t num_states; // number of vertices in graph StateSet states; // currently active states StateSet next; // states on after this iteration + GraphCache &gc; vector vertices; // mapping from index to vertex size_t offset = 0; unsigned char cur = 0; unsigned char prev = 0; - const bool som; const bool utf8; const bool allowStartDs; const ReportManager &rm; - boost::dynamic_bitset<> accept; // states leading to accept - boost::dynamic_bitset<> accept_with_eod; // states leading to accept or eod - - fmstate(const NGHolder &g, bool som_in, bool utf8_in, bool aSD_in, - const ReportManager &rm_in) - : num_states(num_vertices(g)), states(num_states), next(num_states), - vertices(num_vertices(g), NGHolder::null_vertex()), som(som_in), - utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in), accept(num_states), - accept_with_eod(num_states) { + fmstate(const NGHolder &g, GraphCache &gc_in, bool utf8_in, bool aSD_in, + const u32 edit_distance, const ReportManager &rm_in) + : num_states(num_vertices(g)), + states(num_states, edit_distance), + next(num_states, edit_distance), + gc(gc_in), vertices(num_vertices(g), NGHolder::null_vertex()), + utf8(utf8_in), allowStartDs(aSD_in), rm(rm_in) { // init states - states.s.set(g[g.start].index); + states.activateState( + StateSet::State {g[g.start].index, 0, 0, + StateSet::State::NODE_SHADOW}); if (allowStartDs) { - states.s.set(g[g.startDs].index); + states.activateState( + StateSet::State {g[g.startDs].index, 0, 0, + StateSet::State::NODE_SHADOW}); } // fill vertex mapping - for (const auto &v : vertices_range(g)) { + for (auto v : vertices_range(g)) { vertices[g[v].index] = v; } - // init accept states - for (const auto &u : inv_adjacent_vertices_range(g.accept, g)) { - accept.set(g[u].index); - } - accept_with_eod = accept; - for (const auto &u : inv_adjacent_vertices_range(g.acceptEod, g)) { - accept_with_eod.set(g[u].index); - } } }; @@ -140,8 +829,7 @@ bool isUtf8CodePoint(const char c) { } static -bool canReach(const NGHolder &g, const NFAEdge &e, - struct fmstate &state) { +bool canReach(const NGHolder &g, const NFAEdge &e, struct fmstate &state) { auto flags = g[e].assert_flags; if (!flags) { return true; @@ -175,97 +863,149 @@ bool canReach(const NGHolder &g, const NFAEdge &e, } static -void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state, - bool allowEodMatches) { - auto acc_states = state.states.s; - acc_states &= allowEodMatches ? 
state.accept_with_eod : state.accept; +void getAcceptMatches(const NGHolder &g, MatchSet &matches, + struct fmstate &state, NFAVertex accept_vertex, + vector &active_states) { + assert(accept_vertex == g.accept || accept_vertex == g.acceptEod); + + const bool eod = accept_vertex == g.acceptEod; + if (eod) { + state.states.getAcceptEodStates(state.gc, active_states); + } else { + state.states.getAcceptStates(state.gc, active_states); + } - for (size_t i = acc_states.find_first(); i != acc_states.npos; - i = acc_states.find_next(i)) { - const NFAVertex u = state.vertices[i]; - const size_t &som_offset = state.states.som[i]; + DEBUG_PRINTF("Number of active states: %zu\n", active_states.size()); + + for (const auto &cur : active_states) { + auto u = state.vertices[cur.idx]; // we can't accept anything from startDs in between UTF-8 codepoints if (state.utf8 && u == g.startDs && !isUtf8CodePoint(state.cur)) { continue; } - for (const auto &e : out_edges_range(u, g)) { - NFAVertex v = target(e, g); - if (v == g.accept || (v == g.acceptEod && allowEodMatches)) { - // check edge assertions if we are allowed to reach accept - if (!canReach(g, e, state)) { - continue; - } - DEBUG_PRINTF("match found at %zu\n", state.offset); + const auto &reports = + eod ? state.gc.vertex_eod_reports_by_level[cur.level][u] + : state.gc.vertex_reports_by_level[cur.level][u]; - assert(!g[u].reports.empty()); - for (const auto &report_id : g[u].reports) { - const Report &ri = state.rm.getReport(report_id); + NFAEdge e = edge(u, accept_vertex, g); - DEBUG_PRINTF("report %u has offset adjustment %d\n", - report_id, ri.offsetAdjust); - matches.emplace(som_offset, state.offset + ri.offsetAdjust); - } - } + // we assume edge assertions only exist at level 0 + if (e && !canReach(g, e, state)) { + continue; + } + + DEBUG_PRINTF("%smatch found at %zu\n", eod ? "eod " : "", state.offset); + + assert(!reports.empty()); + for (const auto &report_id : reports) { + const Report &ri = state.rm.getReport(report_id); + + DEBUG_PRINTF("report %u has offset adjustment %d\n", report_id, + ri.offsetAdjust); + DEBUG_PRINTF("match from (i:%zu,l:%u,t:%u): (%zu,%zu)\n", cur.idx, + cur.level, cur.type, cur.som, + state.offset + ri.offsetAdjust); + matches.emplace(cur.som, state.offset + ri.offsetAdjust); } } } static -void step(const NGHolder &g, struct fmstate &state) { - state.next.s.reset(); - - for (size_t i = state.states.s.find_first(); i != state.states.s.npos; - i = state.states.s.find_next(i)) { - const NFAVertex &u = state.vertices[i]; - const size_t &u_som_offset = state.states.som[i]; - - for (const auto &e : out_edges_range(u, g)) { - NFAVertex v = target(e, g); - if (v == g.acceptEod) { - // can't know the future: we don't know if we're at EOD. 
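+// gather matches for g.accept, plus g.acceptEod when EOD matches are allowed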
+void getMatches(const NGHolder &g, MatchSet &matches, struct fmstate &state,
+                StateSet::WorkingData &wd, bool allowEodMatches) {
+    getAcceptMatches(g, matches, state, g.accept, wd.active);
+    if (allowEodMatches) {
+        getAcceptMatches(g, matches, state, g.acceptEod, wd.active);
+    }
+}
+
+static
+void step(const NGHolder &g, fmstate &state, StateSet::WorkingData &wd) {
+    state.next.reset();
+
+    state.states.getActiveStates(wd.active);
+
+    for (const auto &cur : wd.active) {
+        auto u = state.vertices[cur.idx];
+        state.states.getSuccessors(cur, state.gc, wd.succ_list);
+
+        for (auto succ : wd.succ_list) {
+            auto v = state.vertices[succ.idx];
+
+            if (is_any_accept(v, g)) {
                 continue;
             }
-            if (v == g.accept) {
+
+            if (!state.allowStartDs && v == g.startDs) {
                 continue;
             }
-            if (!state.allowStartDs && v == g.startDs) {
+            // GraphCache doesn't differentiate between successors for shadows
+            // and helpers, and StateSet does not know anything about the graph,
+            // so the only place we can do it is here. we can't self-loop on a
+            // startDs if we're startDs's helper, so disallow it.
+            if (u == g.startDs && v == g.startDs &&
+                succ.level != 0 && succ.level == cur.level) {
                 continue;
             }
-            const CharReach &cr = g[v].char_reach;
-            const size_t v_idx = g[v].index;
+            // for the reasons outlined above, also putting this here.
+            // disallow transitions from start to startDs on levels other than zero
+            if (u == g.start && v == g.startDs &&
+                cur.level != 0 && succ.level != 0) {
+                continue;
+            }
-            // check reachability and edge assertions
-            if (cr.test(state.cur) && canReach(g, e, state)) {
-                // if we aren't in SOM mode, just set every SOM to 0
-                if (!state.som) {
-                    state.next.s.set(v_idx);
-                    state.next.som[v_idx] = 0;
-                    continue;
+            bool can_reach = false;
+
+            if (succ.type == StateSet::State::NODE_HELPER) {
+                can_reach = true;
+            } else {
+                // we assume edge assertions only exist on level 0
+                const CharReach &cr = g[v].char_reach;
+                NFAEdge e = edge(u, v, g);
+
+                if (cr.test(state.cur) &&
+                    (!e || canReach(g, e, state))) {
+                    can_reach = true;
                 }
+            }
-                // if this is first vertex since start, use current offset as SOM
+            // log the attempted transition and whether it can be taken
+            DEBUG_PRINTF("reaching %zu->%zu ('%c'->'%c'): %s\n",
+                         g[u].index, g[v].index,
+                         ourisprint(state.prev) ? state.prev : '?',
+                         ourisprint(state.cur) ? state.cur : '?',
+                         can_reach ? "yes" : "no");
+
+            if (can_reach) {
+                // we should use current offset as SOM if:
+                // - we're at level 0 and we're a start vertex
+                // - we're a fake start shadow
                 size_t next_som;
-                if (u == g.start || u == g.startDs || is_virtual_start(u, g)) {
+                bool reset = is_any_start(u, g) && cur.level == 0;
+                reset |= is_virtual_start(u, g) &&
+                         cur.type == StateSet::State::NODE_SHADOW;
+
+                if (reset) {
                     next_som = state.offset;
                 } else {
                     // else, inherit SOM from predecessor
-                    next_som = u_som_offset;
+                    next_som = cur.som;
                 }
+                succ.som = next_som;
-                // check if the vertex is already active
-                // if this vertex is not yet active, use current SOM
-                if (!state.next.s.test(v_idx)) {
-                    state.next.s.set(v_idx);
-                    state.next.som[v_idx] = next_som;
-                } else {
-                    // else, work out leftmost SOM
-                    state.next.som[v_idx] =
-                        min(next_som, state.next.som[v_idx]);
-                }
+                DEBUG_PRINTF("src: idx %zu level: %u som: %zu type: %s\n",
+                             cur.idx, cur.level, cur.som,
+                             cur.type == StateSet::State::NODE_HELPER ? "H" : "S");
+                DEBUG_PRINTF("dst: idx %zu level: %u som: %zu type: %s\n",
+                             succ.idx, succ.level, succ.som,
+                             succ.type == StateSet::State::NODE_HELPER ?
+                                     "H" : "S");
+
+                // activate successor (SOM will be handled by activateState)
+                state.next.activateState(succ);
             }
         }
     }
@@ -311,43 +1051,64 @@ void filterMatches(MatchSet &matches) {
  *
  * Fills \a matches with offsets into the data stream where a match is found.
  */
-void findMatches(const NGHolder &g, const ReportManager &rm,
-                 const string &input, MatchSet &matches, const bool notEod,
-                 const bool som, const bool utf8) {
+bool findMatches(const NGHolder &g, const ReportManager &rm,
+                 const string &input, MatchSet &matches,
+                 const u32 edit_distance, const bool notEod, const bool utf8) {
     assert(hasCorrectlyNumberedVertices(g));
+    // cannot match fuzzy utf8 patterns, this should've been filtered out at
+    // compile time, so make it an assert
+    assert(!edit_distance || !utf8);
+
+    const size_t total_states = num_vertices(g) * (3 * edit_distance + 1);
+    DEBUG_PRINTF("Finding matches (%zu total states)\n", total_states);
+    if (total_states > STATE_COUNT_MAX) {
+        DEBUG_PRINTF("too big\n");
+        return false;
+    }
+
+    GraphCache gc(edit_distance, g);
+#ifdef DEBUG
+    gc.dumpStateTransitionTable(g);
+#endif
 
     const bool allowStartDs = (proper_out_degree(g.startDs, g) > 0);
 
-    struct fmstate state(g, som, utf8, allowStartDs, rm);
+    struct fmstate state(g, gc, utf8, allowStartDs, edit_distance, rm);
+
+    StateSet::WorkingData wd;
 
     for (auto it = input.begin(), ite = input.end(); it != ite; ++it) {
+#ifdef DEBUG
+        state.states.dumpActiveStates();
+#endif
         state.offset = distance(input.begin(), it);
         state.cur = *it;
 
-        step(g, state);
+        step(g, state, wd);
 
-        getMatches(g, matches, state, false);
+        getMatches(g, matches, state, wd, false);
 
-        DEBUG_PRINTF("index %zu, %zu states on\n", state.offset,
-                     state.next.s.count());
-        if (state.next.s.empty()) {
-            if (state.som) {
-                filterMatches(matches);
-            }
-            return;
+        DEBUG_PRINTF("offset %zu, %zu states on\n", state.offset,
+                     state.next.count());
+        if (state.next.empty()) {
+            filterMatches(matches);
+            return true;
        }
         state.states = state.next;
         state.prev = state.cur;
     }
+#ifdef DEBUG
+    state.states.dumpActiveStates();
+#endif
     state.offset = input.size();
     state.cur = 0;
 
     // do additional step to get matches after stream end, this time count eod
     // matches also (or not, if we're in notEod mode)
-    getMatches(g, matches, state, !notEod);
+    DEBUG_PRINTF("Looking for EOD matches\n");
+    getMatches(g, matches, state, wd, !notEod);
-    if (state.som) {
-        filterMatches(matches);
-    }
+    filterMatches(matches);
+    return true;
 }
diff --git a/util/ng_find_matches.h b/util/ng_find_matches.h
index e9e47010f..9860c202e 100644
--- a/util/ng_find_matches.h
+++ b/util/ng_find_matches.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -44,13 +44,18 @@ struct BoundaryReports;
 
 } // namespace ue2
 
-/** \brief Find all matches for a given graph when executed against \a input.
+/**
+ * \brief Find all matches for a given graph when executed against \a input.
  *
- * Fills \a matches with offsets into the data stream where a match is found.
+ * Fills \a matches with offsets into the data stream where a match is found.
+ *
+ * Returns false if this graph is too large to find its matches in reasonable
+ * time.
 */
-void findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm,
+bool findMatches(const ue2::NGHolder &g, const ue2::ReportManager &rm,
                  const std::string &input, std::set<std::pair<size_t, size_t>> &matches,
-                 const bool notEod, const bool som, const bool utf8);
+                 const unsigned int max_edit_distance, const bool notEod,
+                 const bool utf8);
 
 #endif // NG_FIND_MATCHES_H
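
The new step() above threads a start-of-match (SOM) offset through every transition: SOM is reset to the current scan offset when leaving a start vertex at level 0 (or a virtual-start shadow), inherited from the predecessor otherwise, and activateState() is relied on to keep the leftmost SOM when the same state is activated twice in one step. The following is a minimal sketch of that merge rule in isolation; NextStates and its map are stand-ins, not the real StateSet machinery:

    #include <cstddef>
    #include <unordered_map>

    // Stand-in for the set of states activated during one step: maps a
    // state index to the leftmost start-of-match offset seen for it.
    struct NextStates {
        std::unordered_map<size_t, size_t> som_by_idx;

        // Mirrors the intent of state.next.activateState(succ): the first
        // activation records the successor's SOM; a repeat activation in
        // the same step keeps the smaller (earlier) offset.
        void activateState(size_t idx, size_t som) {
            auto it = som_by_idx.find(idx);
            if (it == som_by_idx.end()) {
                som_by_idx.emplace(idx, som);
            } else if (som < it->second) {
                it->second = som;
            }
        }
    };

This is the same leftmost-SOM rule the removed bitset code expressed as state.next.som[v_idx] = min(next_som, state.next.som[v_idx]).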
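
findMatches() now also rejects oversized simulations up front: with approximate matching, every graph vertex is replicated (3 * edit_distance + 1) times, so the total state count is checked against STATE_COUNT_MAX before any scanning begins and the function returns false rather than running an unbounded simulation. A sketch of that guard, with a hypothetical cap value standing in for the real STATE_COUNT_MAX constant defined in ng_find_matches.cpp:

    #include <cstddef>

    // Hypothetical cap; the real STATE_COUNT_MAX lives in the .cpp file.
    static constexpr size_t kStateCountMax = 1u << 18;

    // Per the formula in findMatches(): each vertex expands into
    // (3 * edit_distance + 1) simulation states. Comparing by division
    // rather than multiplication avoids size_t overflow on huge graphs.
    bool withinStateBudget(size_t num_vertices, unsigned edit_distance) {
        const size_t per_vertex = 3 * static_cast<size_t>(edit_distance) + 1;
        return num_vertices <= kStateCountMax / per_vertex;
    }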
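
With the header change, callers must treat findMatches() as fallible and pass the new max_edit_distance argument. A hypothetical call site is sketched below; constructing the NGHolder and ReportManager is assumed to happen elsewhere in the compiler front end and is out of scope here:

    #include <cstdio>
    #include <set>
    #include <string>
    #include <utility>

    #include "util/ng_find_matches.h"

    // 'g' and 'rm' are assumed to have been built by existing compile-side
    // code; this only demonstrates the updated calling convention.
    void scanOne(const ue2::NGHolder &g, const ue2::ReportManager &rm,
                 const std::string &input) {
        std::set<std::pair<size_t, size_t>> matches;
        if (!findMatches(g, rm, input, matches, /*max_edit_distance=*/1,
                         /*notEod=*/false, /*utf8=*/false)) {
            std::printf("graph too large to simulate, skipping\n");
            return;
        }
        for (const auto &m : matches) {
            // each entry pairs a start-of-match offset with an (adjusted)
            // end offset into the input
            std::printf("match: (%zu, %zu)\n", m.first, m.second);
        }
    }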