From 8aa48d2eb6366738141aa6548e4d4b7660833eff Mon Sep 17 00:00:00 2001 From: Pramod Kumbhar Date: Wed, 5 Oct 2016 14:13:18 +0200 Subject: [PATCH] CoreNeuron development changes for BBP repository for 2nd open source release of CoreNeuron. --- .clang-format | 94 + .gitreview | 6 + CMake/ClangFormatUtils.cmake | 5 + CMake/Findmod2c.cmake | 72 + CMake/TestScriptUtils.cmake | 1 + CMake/config/CompilerFlagsHelpers.cmake | 20 +- CMake/config/DefineInstallationPaths.cmake | 2 +- CMake/config/ReleaseDebugAutoFlags.cmake | 1 + CMake/config/TestHelpers.cmake | 2 +- CMake/packages/FindBBPTestData.cmake | 2 + CMake/packages/FindClangFormat.cmake | 40 + CMake/packages/FindSLURM.cmake | 1 + CMake/packages/Findmod2c.cmake | 8 + CMake/packages/Findreportinglib.cmake | 71 + CMake/portability/BlueGenePortability.cmake | 1 + CMakeLists.txt | 122 +- README.md | 102 +- apps/CMakeLists.txt | 7 +- apps/main.cpp | 1 + coreneuron/CMakeLists.txt | 76 +- coreneuron/coreneuron.h | 78 + coreneuron/kinderiv.py | 91 + coreneuron/mech/cfile/cabvars.h | 25 +- coreneuron/mech/mod2c_core_thread.h | 129 + coreneuron/mech/mod_func.c.pl | 77 +- coreneuron/mech/mod_func_ptrs.c.pl | 103 + coreneuron/mech/modfile/hh.mod | 3 +- coreneuron/mech/modfile/netstim.mod | 151 +- coreneuron/mech/modfile/stim.mod | 5 +- coreneuron/nrnconf.h | 13 +- coreneuron/nrniv/balance.cpp | 105 + coreneuron/nrniv/cellorder.cpp | 633 +++++ coreneuron/nrniv/cellorder.h | 47 + coreneuron/nrniv/cellorder1.cpp | 671 +++++ coreneuron/nrniv/cellorder2.cpp | 532 ++++ coreneuron/nrniv/coreneuron_main.cpp | 2 +- coreneuron/nrniv/cuda_profile.cu | 31 + coreneuron/nrniv/cuda_profile.h | 8 + coreneuron/nrniv/cvodestb.cpp | 83 +- coreneuron/nrniv/global_vars.cpp | 99 + coreneuron/nrniv/have2want.h | 262 ++ coreneuron/nrniv/ivocvect.cpp | 13 +- coreneuron/nrniv/ivocvect.h | 67 +- coreneuron/nrniv/lpt.cpp | 80 + coreneuron/nrniv/lpt.h | 9 + coreneuron/nrniv/main1.cpp | 255 +- coreneuron/nrniv/memory.h | 35 +- coreneuron/nrniv/mk_mech.cpp | 196 +- coreneuron/nrniv/netcon.h | 150 +- coreneuron/nrniv/netcvode.cpp | 899 ++++--- coreneuron/nrniv/netcvode.h | 35 +- coreneuron/nrniv/netpar.cpp | 1259 +++++---- coreneuron/nrniv/node_permute.cpp | 310 +++ coreneuron/nrniv/node_permute.h | 17 + coreneuron/nrniv/nrn_acc_manager.cpp | 982 +++++++ coreneuron/nrniv/nrn_acc_manager.h | 31 + coreneuron/nrniv/nrn_assert.h | 9 +- coreneuron/nrniv/nrn_datareader.cpp | 55 +- coreneuron/nrniv/nrn_datareader.h | 104 +- coreneuron/nrniv/nrn_setup.cpp | 2284 +++++++++++------ coreneuron/nrniv/nrn_setup.h | 85 +- coreneuron/nrniv/nrn_stats.cpp | 195 +- coreneuron/nrniv/nrn_stats.h | 3 +- coreneuron/nrniv/nrniv_decl.h | 49 +- coreneuron/nrniv/nrniv_mf.h | 27 +- coreneuron/nrniv/nrnmutdec.h | 112 +- coreneuron/nrniv/nrnoptarg.cpp | 215 +- coreneuron/nrniv/nrnoptarg.h | 56 +- coreneuron/nrniv/output_spikes.cpp | 58 +- coreneuron/nrniv/output_spikes.h | 8 +- coreneuron/nrniv/partrans.cpp | 159 ++ coreneuron/nrniv/partrans.h | 61 + coreneuron/nrniv/partrans_setup.cpp | 289 +++ coreneuron/nrniv/patternstim.cpp | 291 ++- coreneuron/nrniv/prcellstate.cpp | 393 +-- coreneuron/nrniv/profiler_interface.cpp | 53 + coreneuron/nrniv/profiler_interface.h | 7 + coreneuron/nrniv/tnode.h | 40 + coreneuron/nrniv/tqueue.cpp | 522 ++-- coreneuron/nrniv/tqueue.h | 189 +- coreneuron/nrniv/tqueue.ipp | 362 +++ coreneuron/nrniv/vrecitem.h | 100 +- coreneuron/nrniv/vrecord.cpp | 158 +- coreneuron/nrnmpi/mpispike.c | 680 ++--- coreneuron/nrnmpi/mpispike.h | 19 +- coreneuron/nrnmpi/nrnmpi.c | 246 +- 
coreneuron/nrnmpi/nrnmpi.h | 30 +- coreneuron/nrnmpi/nrnmpi_def_cinc.h | 6 +- coreneuron/nrnmpi/nrnmpi_impl.h | 4 + coreneuron/nrnmpi/nrnmpidec.h | 23 +- coreneuron/nrnmpi/nrnmpiuse.h | 7 +- coreneuron/nrnoc/capac.c | 167 +- coreneuron/nrnoc/eion.c | 460 ++-- coreneuron/nrnoc/fadvance_core.c | 267 +- coreneuron/nrnoc/finitialize.c | 108 +- coreneuron/nrnoc/md1redef.h | 6 + coreneuron/nrnoc/md2redef.h | 2 + coreneuron/nrnoc/membdef.h | 54 +- coreneuron/nrnoc/membfunc.h | 144 +- coreneuron/nrnoc/multicore.c | 764 +++--- coreneuron/nrnoc/multicore.h | 125 +- coreneuron/nrnoc/nrnoc_aux.c | 117 +- coreneuron/nrnoc/nrnoc_decl.h | 12 +- coreneuron/nrnoc/nrnoc_ml.h | 69 +- coreneuron/nrnoc/nrntimeout.c | 68 +- coreneuron/nrnoc/register_mech.c | 584 +++-- coreneuron/nrnoc/solve_core.c | 89 +- coreneuron/nrnoc/treeset_core.c | 162 +- coreneuron/nrnomp/nrnomp.c | 10 +- coreneuron/scopmath_core/abort.c | 94 + coreneuron/scopmath_core/crout_thread.c | 235 ++ coreneuron/scopmath_core/dimplic.c | 28 + coreneuron/scopmath_core/errcodes.h | 45 + coreneuron/scopmath_core/newton_struct.h | 60 + coreneuron/scopmath_core/newton_thread.c | 246 ++ coreneuron/scopmath_core/sparse_thread.c | 853 ++++++ coreneuron/scopmath_core/ssimplic_thread.c | 79 + coreneuron/utils/endianness.h | 26 +- coreneuron/utils/memory_utils.cpp | 74 +- coreneuron/utils/memory_utils.h | 4 +- coreneuron/utils/randoms/Random123/aes.h | 279 +- coreneuron/utils/randoms/Random123/array.h | 455 ++-- coreneuron/utils/randoms/Random123/ars.h | 161 +- .../Random123/features/clangfeatures.h | 6 +- .../Random123/features/compilerfeatures.h | 45 +- .../randoms/Random123/features/crayfeatures.h | 2 +- .../Random123/features/fujitsufeatures.h | 96 + .../randoms/Random123/features/gccfeatures.h | 28 +- .../randoms/Random123/features/iccfeatures.h | 10 +- .../randoms/Random123/features/msvcfeatures.h | 10 +- .../randoms/Random123/features/nvccfeatures.h | 13 +- .../Random123/features/openclfeatures.h | 4 +- .../randoms/Random123/features/pgccfeatures.h | 12 +- .../utils/randoms/Random123/features/sse.h | 162 +- .../Random123/features/sunprofeatures.h | 2 +- .../randoms/Random123/features/xlcfeatures.h | 12 +- coreneuron/utils/randoms/Random123/philox.h | 441 ++-- coreneuron/utils/randoms/Random123/threefry.h | 1754 ++++++++----- coreneuron/utils/randoms/nrnran123.c | 146 +- coreneuron/utils/randoms/nrnran123.cu | 160 ++ coreneuron/utils/randoms/nrnran123.h | 107 +- coreneuron/utils/reports/nrnreport.cpp | 238 ++ coreneuron/utils/reports/nrnreport.h | 149 ++ coreneuron/utils/sdprintf.cpp | 51 +- coreneuron/utils/sdprintf.h | 64 +- coreneuron/utils/swap_endian.h | 387 +-- default.nix | 27 + tests/CMakeLists.txt | 4 +- tests/integration/CMakeLists.txt | 34 + tests/integration/README | 1 + tests/integration/ring/ring_ref_solution.h | 1 + tests/regression/CMakeLists.txt | 32 + tests/regression/README | 1 + tests/regression/test_header.hpp | 20 + tests/unit/alignment/CMakeLists.txt | 2 + tests/unit/alignment/alignment.cpp | 2 + tests/unit/cmdline_interface/CMakeLists.txt | 4 +- .../test_cmdline_interface.cpp | 20 +- tests/unit/endian/CMakeLists.txt | 1 + tests/unit/endian/endianness_test.cpp | 2 + tests/unit/endian/swap_endian_default.cpp | 2 + tests/unit/endian/swap_endian_noasm.cpp | 2 + tests/unit/endian/swap_endian_nounroll.cpp | 2 + tests/unit/endian/swap_endian_oddunroll.cpp | 2 + tests/unit/mechbuild/CMakeLists.txt | 1 + tests/unit/omp/CMakeLists.txt | 3 +- tests/unit/omp/test_omp.cpp | 2 + tests/unit/queueing/CMakeLists.txt | 33 + 
tests/unit/queueing/test_header.hpp | 37 + tests/unit/queueing/test_queueing.cpp | 159 ++ tests/unit/sdprintf/CMakeLists.txt | 4 +- tests/unit/sdprintf/test_sdprintf.cpp | 1 + 172 files changed, 18126 insertions(+), 7038 deletions(-) create mode 100644 .clang-format create mode 100644 .gitreview create mode 100644 CMake/ClangFormatUtils.cmake create mode 100644 CMake/Findmod2c.cmake create mode 100644 CMake/packages/FindClangFormat.cmake create mode 100644 CMake/packages/Findreportinglib.cmake create mode 100644 coreneuron/coreneuron.h create mode 100644 coreneuron/kinderiv.py create mode 100644 coreneuron/mech/mod2c_core_thread.h create mode 100755 coreneuron/mech/mod_func_ptrs.c.pl create mode 100644 coreneuron/nrniv/balance.cpp create mode 100644 coreneuron/nrniv/cellorder.cpp create mode 100644 coreneuron/nrniv/cellorder.h create mode 100644 coreneuron/nrniv/cellorder1.cpp create mode 100644 coreneuron/nrniv/cellorder2.cpp create mode 100644 coreneuron/nrniv/cuda_profile.cu create mode 100644 coreneuron/nrniv/cuda_profile.h create mode 100644 coreneuron/nrniv/global_vars.cpp create mode 100644 coreneuron/nrniv/have2want.h create mode 100644 coreneuron/nrniv/lpt.cpp create mode 100644 coreneuron/nrniv/lpt.h create mode 100644 coreneuron/nrniv/node_permute.cpp create mode 100644 coreneuron/nrniv/node_permute.h create mode 100644 coreneuron/nrniv/nrn_acc_manager.cpp create mode 100644 coreneuron/nrniv/nrn_acc_manager.h create mode 100644 coreneuron/nrniv/partrans.cpp create mode 100644 coreneuron/nrniv/partrans.h create mode 100644 coreneuron/nrniv/partrans_setup.cpp create mode 100644 coreneuron/nrniv/profiler_interface.cpp create mode 100644 coreneuron/nrniv/profiler_interface.h create mode 100644 coreneuron/nrniv/tnode.h create mode 100644 coreneuron/nrniv/tqueue.ipp create mode 100644 coreneuron/scopmath_core/abort.c create mode 100644 coreneuron/scopmath_core/crout_thread.c create mode 100644 coreneuron/scopmath_core/dimplic.c create mode 100644 coreneuron/scopmath_core/errcodes.h create mode 100644 coreneuron/scopmath_core/newton_struct.h create mode 100644 coreneuron/scopmath_core/newton_thread.c create mode 100644 coreneuron/scopmath_core/sparse_thread.c create mode 100644 coreneuron/scopmath_core/ssimplic_thread.c create mode 100644 coreneuron/utils/randoms/Random123/features/fujitsufeatures.h create mode 100644 coreneuron/utils/randoms/nrnran123.cu create mode 100644 coreneuron/utils/reports/nrnreport.cpp create mode 100644 coreneuron/utils/reports/nrnreport.h create mode 100644 default.nix create mode 100644 tests/integration/README create mode 100644 tests/regression/CMakeLists.txt create mode 100644 tests/regression/README create mode 100644 tests/regression/test_header.hpp create mode 100644 tests/unit/queueing/CMakeLists.txt create mode 100644 tests/unit/queueing/test_header.hpp create mode 100644 tests/unit/queueing/test_queueing.cpp diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..07887181e --- /dev/null +++ b/.clang-format @@ -0,0 +1,94 @@ +--- +Language: Cpp +# BasedOnStyle: Chromium +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false 
+AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + - Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentWidth: 4 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: All +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true +SortIncludes: false +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Never +JavaScriptQuotes: Leave +... + diff --git a/.gitreview b/.gitreview new file mode 100644 index 000000000..704c40dd1 --- /dev/null +++ b/.gitreview @@ -0,0 +1,6 @@ +[gerrit] +host=bbpcode.epfl.ch +port=22 +project=sim/coreneuron +defaultbranch=master +defaultremote=origin diff --git a/CMake/ClangFormatUtils.cmake b/CMake/ClangFormatUtils.cmake new file mode 100644 index 000000000..b966e81b4 --- /dev/null +++ b/CMake/ClangFormatUtils.cmake @@ -0,0 +1,5 @@ +string(REPLACE " " ";" FILES_TO_FORMAT ${SOURCE_FILES}) + +FOREACH(SRC_FILE ${FILES_TO_FORMAT}) + execute_process(COMMAND ${CLANG_FORMAT_EXECUTABLE} -i -style=file -fallback-style=none ${SRC_FILE}) +ENDFOREACH() diff --git a/CMake/Findmod2c.cmake b/CMake/Findmod2c.cmake new file mode 100644 index 000000000..b65790591 --- /dev/null +++ b/CMake/Findmod2c.cmake @@ -0,0 +1,72 @@ +# Copyright (c) 2016, Blue Brain Project +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. 
Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. + + +# Findmod2c +# ------------- +# +# Find mod2c +# +# Find the mod2c Blue Brain HPC utils library +# +# Using mod2c: +# +# :: +# +# find_package(mod2c REQUIRED) +# include_directories(${mod2c_INCLUDE_DIRS}) +# target_link_libraries(foo ${mod2c_LIBRARIES}) +# +# This module sets the following variables: +# +# :: +# +# mod2c_FOUND - set to true if the library is found +# mod2c_INCLUDE_DIRS - list of required include directories +# mod2c_LIBRARIES - list of libraries to be linked + +#============================================================================= +# Copyright 2015 Adrien Devresse +# +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + + +# UNIX paths are standard, no need to write. +find_program(mod2c_BINARY + NAMES mod2c_core + PATHS "/usr/bin" + ) + + +# Checks 'REQUIRED', 'QUIET' and versions. +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(mod2c + FOUND_VAR mod2c_FOUND + REQUIRED_VARS mod2c_BINARY) + diff --git a/CMake/TestScriptUtils.cmake b/CMake/TestScriptUtils.cmake index 45605db85..24bcd20c6 100644 --- a/CMake/TestScriptUtils.cmake +++ b/CMake/TestScriptUtils.cmake @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + # Utility functions for manipulating test labels and producing # tests from scripts: # diff --git a/CMake/config/CompilerFlagsHelpers.cmake b/CMake/config/CompilerFlagsHelpers.cmake index 4d3b27599..f6917d604 100644 --- a/CMake/config/CompilerFlagsHelpers.cmake +++ b/CMake/config/CompilerFlagsHelpers.cmake @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
+ # CompilerFlagsHelpers.cmake # # set of Convenience functions for portable compiler flags @@ -33,7 +34,6 @@ set(SUPPORTED_COMPILER_LANGUAGE_LIST "C;CXX") ## detect compiler foreach(COMPILER_LANGUAGE ${SUPPORTED_COMPILER_LANGUAGE_LIST}) - if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "XL") set(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_XLC ON) elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_ID STREQUAL "Intel") @@ -72,6 +72,8 @@ foreach(COMPILER_LANGUAGE ${SUPPORTED_COMPILER_LANGUAGE_LIST}) set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDANT "-qpic=small") set(CMAKE_${COMPILER_LANGUAGE}_VECTORIZE "-qhot") + set(ADDITIONAL_THREADSAFE_FLAGS "-qthreaded") + set(IGNORE_UNKNOWN_PRAGMA_FLAGS "-qsuppress=1506-224") # Microsoft compiler elseif(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_MSVC) @@ -104,6 +106,7 @@ foreach(COMPILER_LANGUAGE ${SUPPORTED_COMPILER_LANGUAGE_LIST}) set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDANT "-fPIC") set(CMAKE_${COMPILER_LANGUAGE}_VECTORIZE "-ftree-vectorize") + set(IGNORE_UNKNOWN_PRAGMA_FLAGS "-Wno-unknown-pragmas") if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_GCC AND ( CMAKE_${COMPILER_LANGUAGE}_COMPILER_VERSION VERSION_GREATER "4.7.0") ) set(CMAKE_${COMPILER_LANGUAGE}_LINK_TIME_OPT "-flto") @@ -119,14 +122,19 @@ foreach(COMPILER_LANGUAGE ${SUPPORTED_COMPILER_LANGUAGE_LIST}) ## rest of the world else() + ## unknown compiler flags produce error on Cray and hence just set this for intel now + if(CMAKE_${COMPILER_LANGUAGE}_COMPILER_IS_ICC) + set(IGNORE_UNKNOWN_PRAGMA_FLAGS "-Wno-unknown-pragmas") + endif() + set(CMAKE_${COMPILER_LANGUAGE}_WARNING_ALL "") - set(CMAKE_${COMPILER_LANGUAGE}_DEBUGINFO_FLAGS "-g") + set(CMAKE_${COMPILER_LANGUAGE}_DEBUGINFO_FLAGS "") - set(CMAKE_${COMPILER_LANGUAGE}_OPT_NONE "-O0") - set(CMAKE_${COMPILER_LANGUAGE}_OPT_NORMAL "-O2") - set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "-O3") - set(CMAKE_${COMPILER_LANGUAGE}_OPT_FASTEST "-O3") + set(CMAKE_${COMPILER_LANGUAGE}_OPT_NONE "") + set(CMAKE_${COMPILER_LANGUAGE}_OPT_NORMAL "") + set(CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE "") + set(CMAKE_${COMPILER_LANGUAGE}_OPT_FASTEST "") set(CMAKE_${COMPILER_LANGUAGE}_STACK_PROTECTION "") set(CMAKE_${COMPILER_LANGUAGE}_POSITION_INDEPENDANT "") diff --git a/CMake/config/DefineInstallationPaths.cmake b/CMake/config/DefineInstallationPaths.cmake index e252db10b..14883ed65 100644 --- a/CMake/config/DefineInstallationPaths.cmake +++ b/CMake/config/DefineInstallationPaths.cmake @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + IF (UNIX) IF (NOT APPLICATION_NAME) MESSAGE(STATUS "${PROJECT_NAME} is used as APPLICATION_NAME") @@ -32,7 +33,6 @@ IF (UNIX) # detect lib suffix - GET_PROPERTY(LIB64_SUFFIX GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS) IF(LIB64_SUFFIX) diff --git a/CMake/config/ReleaseDebugAutoFlags.cmake b/CMake/config/ReleaseDebugAutoFlags.cmake index c5edd0e77..c1030c259 100644 --- a/CMake/config/ReleaseDebugAutoFlags.cmake +++ b/CMake/config/ReleaseDebugAutoFlags.cmake @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + # ReleaseDebugAutoFlags.cmake # # Release / Debug configuration helper diff --git a/CMake/config/TestHelpers.cmake b/CMake/config/TestHelpers.cmake index 381cd7bd1..916baaf45 100644 --- a/CMake/config/TestHelpers.cmake +++ b/CMake/config/TestHelpers.cmake @@ -24,11 +24,11 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
+ # TestHelpers.cmake # # set of Convenience functions for unit testing with cmake - ## # enable or disable detection of SLURM and MPIEXEC option(AUTO_TEST_WITH_SLURM "automatically add srun as test prefix in a SLURM environment" TRUE) diff --git a/CMake/packages/FindBBPTestData.cmake b/CMake/packages/FindBBPTestData.cmake index 62480f3a0..edc344638 100644 --- a/CMake/packages/FindBBPTestData.cmake +++ b/CMake/packages/FindBBPTestData.cmake @@ -23,6 +23,8 @@ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + + ## ## Find package file for the Blue Brain Functional test data files ## diff --git a/CMake/packages/FindClangFormat.cmake b/CMake/packages/FindClangFormat.cmake new file mode 100644 index 000000000..d77d514a4 --- /dev/null +++ b/CMake/packages/FindClangFormat.cmake @@ -0,0 +1,40 @@ +# Copyright (c) 2016, Blue Brain Project +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. + +string(REPLACE ":" ";" _PATH $ENV{PATH}) +foreach(p ${_PATH}) + file(GLOB cand ${p}/clang-format*) + if(cand) + set(CLANG_FORMAT_EXECUTABLE ${cand}) + set(CLANG_FORMAT_FOUND ON) + execute_process(COMMAND ${CLANG_FORMAT_EXECUTABLE} -version OUTPUT_VARIABLE clang_out ) + string(REGEX MATCH .*\(version[^\n]*\)\n version ${clang_out}) + set(CLANG_FORMAT_VERSION ${CMAKE_MATCH_1}) + break() + else() + set(CLANG_FORMAT_FOUND OFF) + endif() +endforeach() diff --git a/CMake/packages/FindSLURM.cmake b/CMake/packages/FindSLURM.cmake index ee2cc2218..a9a281d58 100644 --- a/CMake/packages/FindSLURM.cmake +++ b/CMake/packages/FindSLURM.cmake @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
+ find_program(SLURM_SBATCH_COMMAND sbatch DOC "Path to the SLURM sbatch executable") find_program(SLURM_SRUN_COMMAND srun DOC "Path to the SLURM srun executable") find_program(SLURM_SACCTMGR_COMMAND sacctmgr DOC "Path to the SLURM sacctmgr executable") diff --git a/CMake/packages/Findmod2c.cmake b/CMake/packages/Findmod2c.cmake index e452b9ee9..b65790591 100644 --- a/CMake/packages/Findmod2c.cmake +++ b/CMake/packages/Findmod2c.cmake @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + # Findmod2c # ------------- # @@ -47,6 +48,13 @@ # mod2c_INCLUDE_DIRS - list of required include directories # mod2c_LIBRARIES - list of libraries to be linked +#============================================================================= +# Copyright 2015 Adrien Devresse +# +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + # UNIX paths are standard, no need to write. find_program(mod2c_BINARY diff --git a/CMake/packages/Findreportinglib.cmake b/CMake/packages/Findreportinglib.cmake new file mode 100644 index 000000000..59db6a109 --- /dev/null +++ b/CMake/packages/Findreportinglib.cmake @@ -0,0 +1,71 @@ +# Copyright (c) 2016, Blue Brain Project +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+ + +# Findreportinglib +# ------------- +# +# Find reportinglib +# +# Find the reportinglib Blue Brain HPC utils library +# +# Using reportinglib: +# +# :: +# +# find_package(reportinglib REQUIRED) +# include_directories(${reportinglib_INCLUDE_DIRS}) +# target_link_libraries(foo ${reportinglib_LIBRARIES}) +# +# This module sets the following variables: +# +# :: +# +# reportinglib_FOUND - set to true if the library is found +# reportinglib_INCLUDE_DIRS - list of required include directories +# reportinglib_LIBRARIES - list of libraries to be linked + +#============================================================================= +# Copyright 2015 Adrien Devresse +# +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + + +# UNIX paths are standard, no need to write. +find_path(reportinglib_INCLUDE_DIR reportinglib/Report.h) + +find_library(reportinglib_LIBRARY reportinglib) +get_filename_component(reportinglib_LIB_DIR ${reportinglib_LIBRARY} DIRECTORY) + +# Checks 'REQUIRED', 'QUIET' and versions. +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(reportinglib + FOUND_VAR reportinglib_FOUND + REQUIRED_VARS reportinglib_INCLUDE_DIR reportinglib_LIBRARY reportinglib_LIB_DIR) + diff --git a/CMake/portability/BlueGenePortability.cmake b/CMake/portability/BlueGenePortability.cmake index 779e651fa..6fbd3b65b 100644 --- a/CMake/portability/BlueGenePortability.cmake +++ b/CMake/portability/BlueGenePortability.cmake @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + ## ## Portability check on Blue Gene Q environment diff --git a/CMakeLists.txt b/CMakeLists.txt index 6aa5b9f51..89a985958 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,15 @@ # THE POSSIBILITY OF SUCH DAMAGE. +# Ecole Polytechnique Federale de Lausanne +# Brain Mind Institute, +# Blue Brain Project +# (c) 2006-2016. All rights reserved. 
+# +# Author: Aleksandr Ovcharenko +# Core Neuron + + # Initial Setup CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12) @@ -32,15 +41,19 @@ PROJECT(coreneuron) set(CORENEURON_DESCRIPTION "BBP CoreNeuron simulator") -set(VERSION_MAJOR "0") -set(VERSION_MINOR "5") -set(VERSION_PATCH "0") +set(VERSION_MAJOR "1") +set(VERSION_MINOR "0") +set(VERSION_PATCH "1") set(VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") ## global options option(CORENEURON_MAIN "Build the CORE NEURON main" ON) option(CORENEURON_OPENMP "Build the CORE NEURON with OpenMP implementation" ON) option(DISABLE_NRN_TIMEOUT "Disable nrn_timeout implementation" OFF) +option(ENABLE_REPORTINGLIB "Enable use of ReportingLib for soma reports" OFF) +option(EXPORT_MECHS_FUNCTIONS "Enable exporting of mod init, jacob, state, cur and BeforeAfter functions of mod-based mechanisms" OFF) +option(ENABLE_MPI "Enable MPI-based execution" ON) +option(ENABLE_SOA "Enable SoA Memory Layout" ON) ## mech / mod options set(ADDITIONAL_MECHPATH "" CACHE PATH "Search path for optional additional mechanism MOD files") @@ -50,7 +63,6 @@ set(ADDITIONAL_MECHS "" CACHE FILEPATH "File containing list of additional mecha option(UNIT_TESTS "Enable unit tests compilation and execution" ON) option(FUNCTIONAL_TESTS "Enable functional tests compilation and execution" ON) - list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/CMake ${PROJECT_SOURCE_DIR}/CMake/portability ${PROJECT_SOURCE_DIR}/CMake/packages @@ -66,8 +78,22 @@ include(FindPkgConfig) include(CTest) #Find all dependencies -find_package(MPI REQUIRED) -find_package(mod2c 1.1 REQUIRED) +if (ENABLE_MPI) + find_package(MPI REQUIRED) + add_definitions("-DNRNMPI=1") +else() + message(STATUS "Use of MPI disabled by user-provided option") + add_definitions("-DNRNMPI=0") +endif() + +#Memory layout transformation: 0 for SoA and 1 for AoS +if (ENABLE_SOA) + add_definitions("-DLAYOUT=0") +else() + add_definitions("-DLAYOUT=1") +endif() + +find_package(mod2c 2.0.0 REQUIRED) find_package(Boost 1.41.0 QUIET COMPONENTS filesystem system atomic unit_test_framework) include(BlueGenePortability) @@ -75,26 +101,98 @@ include(BlueGenePortability) # Threading if(CORENEURON_OPENMP) find_package(OpenMP) - if(OpenMP_FOUND) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - endif(OpenMP_FOUND) + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} ${ADDITIONAL_THREADSAFE_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS} ${ADDITIONAL_THREADSAFE_FLAGS}") + endif(OPENMP_FOUND) endif(CORENEURON_OPENMP) +# ReportingLib +if(ENABLE_REPORTINGLIB) + find_package(reportinglib REQUIRED) + if(reportinglib_FOUND) + include_directories(${reportinglib_INCLUDE_DIR}) + add_definitions("-DENABLE_REPORTING") + else(reportinglib_FOUND) + message(FATAL_ERROR "Enabled use of ReportingLib but ReportingLib installion not found!") + endif(reportinglib_FOUND) +endif(ENABLE_REPORTINGLIB) + include(CheckIncludeFiles) CHECK_INCLUDE_FILES (malloc.h have_malloc_h) if(have_malloc_h) add_definitions("-DHAVE_MALLOC_H") endif() +# Some mechanisms use NEURON specific code which is skipped +# using this macro for CoreNeuron build +add_definitions(-DCORENEURON_BUILD) + CHECK_INCLUDE_FILES("spi/include/kernel/memory.h" have_memory_h) if(have_memory_h) add_definitions("-DHAVE_MEMORY_H") endif() +option (ENABLE_SELECTIVE_GPU_PROFILING "Enable GPU profiling only for Solver" ON) +option (ENABLE_OPENACC "Enable use of OpenACC" OFF) + 
+find_package(ClangFormat) + +if(CLANG_FORMAT_FOUND) + message("clang-format : ${CLANG_FORMAT_EXECUTABLE} : ${CLANG_FORMAT_VERSION}") +else() + message("clang-format executable not found") +endif() + +if(CLANG_FORMAT_FOUND) + file(COPY ${PROJECT_SOURCE_DIR}/.clang-format DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + + #to indent files using clang-format + file(GLOB_RECURSE SRC_FILES_FOR_CLANG_FORMAT ${PROJECT_SOURCE_DIR}/coreneuron/*.c* + ${PROJECT_SOURCE_DIR}/coreneuron/*.h* ${PROJECT_SOURCE_DIR}/coreneuron/*.ipp*) + + add_custom_target(formatsource COMMAND ${CMAKE_COMMAND} + -DSOURCE_FILES:STRING="${SRC_FILES_FOR_CLANG_FORMAT}" + -DCLANG_FORMAT_EXECUTABLE=${CLANG_FORMAT_EXECUTABLE} + -P "${PROJECT_SOURCE_DIR}/CMake/ClangFormatUtils.cmake" + ) +endif() + +if(ENABLE_OPENACC) + if(${CMAKE_C_COMPILER_ID} STREQUAL "PGI") + add_definitions( -DPG_ACC_BUGS) + endif() + + find_package(CUDA 5.0) + SET(CUDA_SEPARABLE_COMPILATION ON) + + if(CUDA_FOUND) + if(ENABLE_SELECTIVE_GPU_PROFILING) + add_definitions( -DCUDA_PROFILING) + add_definitions( -DENABLE_SELECTIVE_PROFILING) + endif(ENABLE_SELECTIVE_GPU_PROFILING) + else(CUDA_FOUND) + message( FATAL_ERROR "Error : Can't file CUDA, load module or set path!" ) + endif(CUDA_FOUND) + +ELSE(ENABLE_OPENACC) + SET (ENABLE_SELECTIVE_GPU_PROFILING OFF) + #OpenACC pragmas are not guarded, disable all unknown pragm warnings + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${IGNORE_UNKNOWN_PRAGMA_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} ${IGNORE_UNKNOWN_PRAGMA_FLAGS}") +endif(ENABLE_OPENACC) + +# Some mechanisms use NEURON specific code which is skipped +# using this macro for CoreNeuron build +add_definitions(-DCORENEURON_BUILD) + if(DISABLE_NRN_TIMEOUT) add_definitions("-DDISABLE_TIMEOUT") -endif(DISABLE_NRN_TIMEOUT) +endif() + +if(EXPORT_MECHS_FUNCTIONS) + add_definitions("-DEXPORT_MECHS_FUNCTIONS") +endif() # Core library add_subdirectory(coreneuron) @@ -110,3 +208,5 @@ if(Boost_FOUND) endif(Boost_FOUND) +# Documentation +#include(DoxygenRule) diff --git a/README.md b/README.md index 39a357628..61fca21cd 100644 --- a/README.md +++ b/README.md @@ -5,15 +5,13 @@ CoreNEURON is a simplified engine for the [NEURON](https://www.neuron.yale.edu/n # Features -CoreNEURON supports the following subset of features provided by [NEURON](https://www.neuron.yale.edu/neuron/): -* FixedStep method -* Todo +CoreNEURON supports limited features provided by [NEURON](https://www.neuron.yale.edu/neuron/). -# Requirements +# Dependencies * [CMake 2.8.12+](https://cmake.org) * [MOD2C](http://github.com/BlueBrain/mod2c) -* [MPI 2.0+](http://mpich.org) - +* [MPI 2.0+](http://mpich.org) [Optional] +* [PGI OpenACC Compiler >=16.3](https://www.pgroup.com/resources/accel.htm) [Optional, for GPU systems] # Installation @@ -26,6 +24,13 @@ export CC=mpicc export CXX=mpicxx ``` +If you don't have MPI, you can disable MPI dependency using CMake option *-DENABLE_MPI=OFF*: +```bash +export CC=gcc +export CXX=g++ +cmake .. -DENABLE_MPI=OFF +``` + The workflow for building CoreNEURON is slightly different from that of NEURON, especially considering the use of **nrnivmodl**. Currently we do not provide **nrnivmodl** for CoreNEURON and hence the user needs to provide paths of mod file directories (semicolon separated) at the time of the build process using the *ADDITIONAL_MECHPATH* variable: ```bash @@ -36,6 +41,55 @@ make make install ``` +# Building with GPU support + +CoreNEURON has support for GPUs using OpenACC programming model when enabled with *-DENABLE_OPENACC=ON*. 
+ +Here are the steps to compile with PGI compiler: + +```bash +module purge +module load pgi/pgi64/16.5 pgi/mpich/16.5 +module load cuda/6.0 + +export CC=mpicc +export CXX=mpicxx + +cmake .. -DCMAKE_C_FLAGS:STRING="-acc -Minfo=acc -Minline=size:200,levels:10 -O3 -DSWAP_ENDIAN_DISABLE_ASM -DDISABLE_HOC_EXP" -DCMAKE_CXX_FLAGS:STRING="-acc -Minfo=acc -Minline=size:200,levels:10 -O3 -DSWAP_ENDIAN_DISABLE_ASM -DDISABLE_HOC_EXP" -DCOMPILE_LIBRARY_TYPE=STATIC -DCMAKE_INSTALL_PREFIX=$EXPER_DIR/install/ -DCUDA_HOST_COMPILER=`which gcc` -DCUDA_PROPAGATE_HOST_FLAGS=OFF -DENABLE_SELECTIVE_GPU_PROFILING=ON -DENABLE_OPENACC=ON +``` + +Now you can run with the --gpu option as: + +```bash +export CUDA_VISIBLE_DEVICES=0 #if needed +mpirun -n 1 ./bin/coreneuron_exec -d ../tests/integration/ring -mpi -e 100 --gpu --celsius=6.3 +``` + +Additionally, you can enable the cell reordering mechanism to improve GPU performance using the cell_permute option: +```bash +mpirun -n 1 ./bin/coreneuron_exec -d ../tests/integration/ring -mpi -e 100 --gpu --celsius=6.3 --cell_permute=1 +``` + +Note that if your model uses the Random123 random number generator, you can't use the same executable for CPU and GPU runs. +This will be fixed in the next version. + +# Using ReportingLib +If you want to enable the use of ReportingLib for soma reports, install ReportingLib first and enable it using -DENABLE_REPORTINGLIB (use the same install path for ReportingLib as for CoreNeuron). + +# Using Neurodamus / Additional MOD files + +If you have MOD files from the NEURON model, then you have to explicitly build those MOD files with CoreNEURON using the *ADDITIONAL_MECHPATH* option: +```bash +cmake .. -DADDITIONAL_MECHPATH="/path/of/mod/files/directory/" +``` +This directory should have only mod files compatible with CoreNEURON. + +For BBP users: If you are building CoreNeuron with Neurodamus, you have to set *ADDITIONAL_MECHPATH* and *ADDITIONAL_MECHS* as: +```bash +cmake .. -DADDITIONAL_MECHPATH="/path/of/neurodamus/lib/modlib" -DADDITIONAL_MECHS="/path/of/neurodamus/lib/modlib/coreneuron_modlist.txt" +``` +Make sure to switch to the appropriate branch of Neurodamus (based on your dataset/experiment, e.g. coreneuronsetup). + On a Cray system the user has to provide the path to the MPI library as follows: ```bash export CC=`which cc` @@ -47,7 +101,7 @@ We have tested the build process on the following platforms: * Blue Gene/Q: XLC/GCC * x86: Intel, PGI, GCC, Cray -* OSX: Clang +* OS X: Clang, GCC # Optimization Flags @@ -59,8 +113,9 @@ cmake .. -DCMAKE_CXX_FLAGS="-O3 -qtune=qp -qarch=qp -q64 -qhot=simd -qsmp -qthre ``` * By default OpenMP threading is enabled. You can disable it with -DCORENEURON_OPENMP=OFF -* By default CoreNEURON (And NEURON) uses the AoS (Array of Structs) memory layout for all data structures. For efficient memory access and vectorization you can use the SoA (Struct of Array) layout by adding "-DLAYOUT=0" to the C/C++ compiler flags (0 = SoA and 1 = AoS). +* By default CoreNEURON uses the SoA (Structure of Array) memory layout for all data structures. You can switch to AoS using -DENABLE_SOA=OFF. * If the default compiler flags are not supported, try -DCMAKE_BUILD_TARGET=SOME_TARGET +* NEURON wraps the `exp` function with hoc_Exp; disable this using "-DDISABLE_HOC_EXP" # RUNNING SIMULATION: @@ -100,6 +155,16 @@ In order to see the command line options, you can use: Set the path for the output data to PATH (char*). The default value is '.'. -k TIME, --forwardskip=TIME Set forwardskip to TIME (double). The default value is '0.'.
+ -r, --report + Enable soma report. + -w TIME, --dt_report=TIME + Set the dt for soma reports (using ReportingLib) to TIME (double). The default value is '0.1'. + -z MULTIPLE, --multiple=MULTIPLE + Model duplication factor. Model size is normal size * MULTIPLE (int). The default value is '1'. + -x EXTRACON, --extracon=EXTRACON + Number of extra random connections in each thread to other duplicate models (int). The default value is '0'. + -R TYPE, --cell_permute=TYPE + Permutation of cells for efficient execution of solver on GPU (TYPE could be 1 or 2). -mpi Enable MPI. In order to initialize MPI environment this argument must be specified. ``` @@ -109,6 +174,27 @@ In order to see the command line options, you can use: Currently CoreNEURON only outputs spike data. When running the simulation, each MPI rank writes spike information into a file `out.#mpi_rank`. These files should be combined and sorted to compare with NEURON spike output. +# Running tests + +Once you compile CoreNEURON, unit tests and the ring test will be compiled if Boost is available. +If you pass the path for Neurodamus channels, the 10-cell tests will also be compiled. You can run the tests using + +```bash +make test +``` + +If you have a different MPI launcher, you can specify it during the CMake configuration as: +```bash +cmake .. -DTEST_MPI_EXEC_BIN="mpirun" -DTEST_EXEC_PREFIX="mpirun;-n;2" -DAUTO_TEST_WITH_SLURM=OFF -DAUTO_TEST_WITH_MPIEXEC=OFF +``` + +# Developer Notes +If you have installed `clang-format`, you can reformat/reindent the generated .c files from mod2c using: +``` +make formatbuild +``` +The `.clang-format` file in the source repository is compatible with version 3.9. + ## License * See LICENSE.txt * See [NEURON](https://www.neuron.yale.edu/neuron/) diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index 7844face5..af6cc79d9 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -24,12 +24,13 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + include_directories(${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/coreneuron ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_BINARY_DIR}/coreneuron) # The mechanism registration function will be defined depending on whether we # are using neurodamus list of mechanisms -if(ADDITIONAL_MECHS) - add_definitions(-DADDITIONAL_MECHS) +if(ADDITIONAL_MECHS OR ADDITIONAL_MECHPATH) + add_definitions(-DADDITIONAL_MECHS) endif() FILE(GLOB coreneuron_exec_src "*.c*") @@ -40,7 +41,7 @@ set_target_properties(coreneuron_exec PROPERTIES OUTPUT_NAME "coreneuron_exec" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") -target_link_libraries(coreneuron_exec coreneuron ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES}) +target_link_libraries(coreneuron_exec coreneuron ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES} ${reportinglib_LIBRARY}) install(TARGETS coreneuron_exec DESTINATION ${BIN_INSTALL_DIR}/) diff --git a/apps/main.cpp b/apps/main.cpp index dfedda623..73253ccf6 100644 --- a/apps/main.cpp +++ b/apps/main.cpp @@ -26,6 +26,7 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + extern int main1(int argc, char** argv, char** env); extern "C" {extern void modl_reg(void);} diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index fdb329d8d..e11b38f11 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE.
+ include_directories(utils/randoms) include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${MPI_INCLUDE_PATH}) @@ -31,6 +32,7 @@ include_directories(${MPI_INCLUDE_PATH}) FILE(GLOB_RECURSE coreneuron_all_headers "*.h") FILE(GLOB_RECURSE coreneuron_all_templates "*.ipp") FILE(GLOB_RECURSE coreneuron_all_c_files "*.c*") +file(GLOB_RECURSE coreneuron_cuda_files "*.cu") # Compile and include MOD files @@ -69,7 +71,7 @@ macro(mod2c_target name input) list(APPEND MOD2C_${name}_MODS "${mod2c_modname_}") add_custom_command(OUTPUT "${mod2c_output_}" - DEPENDS ${input} + DEPENDS ${input} "${mod2c_install_prefix}/mod2c_core" COMMAND ${CMAKE_COMMAND} -E copy "${mod2c_source_}" "${CMAKE_CURRENT_BINARY_DIR}" COMMAND ${MOD2C} "${mod2c_modname_}" WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") @@ -115,6 +117,7 @@ macro(mod2c_from_file name modlist searchpath) if(modpath_) mod2c_target(${name} "${modpath_}") + list(APPEND MOD_PATHS ${modpath_}) else() message(WARNING "Unable to locate MOD file: ${mod_}") endif() @@ -149,9 +152,10 @@ mod2c_from_file(OPTMECH "${ADDITIONAL_MECHS}" "${ADDITIONAL_MECHPATH}") # For 'non-standard' mod files, need to generate registering function in mod_func.c -set(MOD_FUNC_C "${CMAKE_CURRENT_BINARY_DIR}/mod_func.c") +set(MOD_FUNC_C "${CMAKE_CURRENT_BINARY_DIR}/mod_func.c") set(MOD_FUNC_C_PL "${CMAKE_CURRENT_SOURCE_DIR}/mech/mod_func.c.pl") +# ... pass as argument to the perl script the list of mod file names; add_custom_command(OUTPUT "${MOD_FUNC_C}" DEPENDS ${MECH_FILE_LISTS} "${MOD_FUNC_C_PL}" COMMAND perl "${MOD_FUNC_C_PL}" ${MOD2C_OPTMECH_MODS} > "${MOD_FUNC_C}" @@ -163,16 +167,76 @@ foreach(depfile ${MECH_FILE_LISTS}) configure_file("${depfile}" "${CMAKE_CURRENT_BINARY_DIR}/dummy_reconfigure_dep") endforeach() +# to work around no acc ability to pass function pointers as arguments, +# some translated c files depend on a _kinderiv.h file that is constructed +# by kinderiv.py + +set(KINDERIV_PY "${CMAKE_CURRENT_SOURCE_DIR}/kinderiv.py") +set(KINDERIV_H "${CMAKE_CURRENT_BINARY_DIR}/_kinderiv.h") + +add_custom_command(OUTPUT "${KINDERIV_H}" + DEPENDS ${MOD2C_OPTMECH_OUTPUTS} ${MOD2C_STDMECH_OUTPUTS} "${KINDERIV_PY}" + COMMAND python "${KINDERIV_PY}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") + +if (EXPORT_MECHS_FUNCTIONS) + # Create C file with all "get function pointers" methods + set(MOD_FUNC_PTRS_C "${CMAKE_CURRENT_BINARY_DIR}/mod_func_ptrs.c") + set(MOD_FUNC_PTRS_C_PL "${CMAKE_CURRENT_SOURCE_DIR}/mech/mod_func_ptrs.c.pl") + + # ... 
pass as argument to the perl script the list of mods full paths; + add_custom_command(OUTPUT "${MOD_FUNC_PTRS_C}" + DEPENDS ${MECH_FILE_LISTS} "${MOD_FUNC_PTRS_C_PL}" + COMMAND perl "${MOD_FUNC_PTRS_C_PL}" ${MOD_PATHS} > "${MOD_FUNC_PTRS_C}" + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") +endif() + # Add generated sources from MOD files -set(GENERATED_MECH_C_FILES ${MOD_FUNC_C} ${MOD2C_STDMECH_OUTPUTS} ${MOD2C_OPTMECH_OUTPUTS}) +set(GENERATED_MECH_C_FILES ${MOD_FUNC_C} ${MOD_FUNC_PTRS_C} ${MOD2C_STDMECH_OUTPUTS} ${MOD2C_OPTMECH_OUTPUTS}) + +# artificial cells must be on cpu, defaul nrnran123.c is for cpu, nrn_setup.cpp uses nrnran123 for only memory calculation purpose which should use cpu version of nrnran123 +set(NOACC_MECH_C_FILES ${CMAKE_CURRENT_BINARY_DIR}/netstim.c ${CMAKE_CURRENT_BINARY_DIR}/netstim_inhpoisson.c ${CMAKE_CURRENT_BINARY_DIR}/pattern.c ${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.c ${CMAKE_CURRENT_SOURCE_DIR}/nrniv/nrn_setup.cpp) + +if(ENABLE_OPENACC) + set_source_files_properties(${GENERATED_MECH_C_FILES} PROPERTIES COMPILE_FLAGS "") + set_source_files_properties(${NOACC_MECH_C_FILES} PROPERTIES COMPILE_FLAGS "-DDISABLE_OPENACC") + if(${CMAKE_C_COMPILER_ID} STREQUAL "PGI") + set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/scopmath_core/sparse_thread.c PROPERTIES COMPILE_FLAGS "-ta=tesla:nollvm") + endif() +endif() set(coreneuron_all_c_files ${coreneuron_all_c_files} ${GENERATED_MECH_C_FILES}) +#to indent generated c/cpp files +if(CLANG_FORMAT_FOUND) + add_custom_target(formatbuild + COMMAND ${CMAKE_COMMAND} + -DSOURCE_FILES:STRING="${GENERATED_MECH_C_FILES}" + -DCLANG_FORMAT_EXECUTABLE=${CLANG_FORMAT_EXECUTABLE} + -P "${PROJECT_SOURCE_DIR}/CMake/ClangFormatUtils.cmake" + DEPENDS ${GENERATED_MECH_C_FILES} + ) +endif() + -add_library(coreneuron ${COMPILE_LIBRARY_TYPE} ${coreneuron_all_headers} ${coreneuron_all_templates} ${coreneuron_all_c_files}) +add_library(coreneuron ${COMPILE_LIBRARY_TYPE} ${coreneuron_all_headers} ${coreneuron_all_templates} ${coreneuron_all_c_files} ${KINDERIV_H}) + +set(link_cudacoreneuron) +set(link_reportinglib) + +#@TODO: CMake should have option for arch +if(ENABLE_OPENACC) + cuda_add_library("cudacoreneuron" ${coreneuron_cuda_files} OPTIONS -arch=sm_20) + set(link_cudacoreneuron cudacoreneuron) +endif() + +if(ENABLE_REPORTINGLIB) + set(link_reportinglib ${REPORTINGLIB_LIBRARIES}) +endif() +target_link_libraries(coreneuron ${MPI_C_LIBRARIES} + ${link_reportinglib} ${link_cudacoreneuron} ${CUDA_LIBRARIES} ) -target_link_libraries(coreneuron ${MPI_C_LIBRARIES}) set_target_properties(coreneuron PROPERTIES VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} @@ -180,5 +244,5 @@ set_target_properties(coreneuron PROPERTIES CLEAN_DIRECT_OUTPUT 1) install(TARGETS coreneuron - LIBRARY DESTINATION ${LIB_INSTALL_DIR} + LIBRARY DESTINATION ${LIB_INSTALL_DIR} ARCHIVE DESTINATION ${LIB_INSTALL_DIR} ) diff --git a/coreneuron/coreneuron.h b/coreneuron/coreneuron.h new file mode 100644 index 000000000..27697cff9 --- /dev/null +++ b/coreneuron/coreneuron.h @@ -0,0 +1,78 @@ +/* +Copyright (c) 2016, Blue Brain Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/*** + * Includes all headers required to communicate and run all methods + * described in CoreNeuron, neurox, and mod2c C-generated mechanisms + * functions. +**/ + +#ifndef CORENEURON_H +#define CORENEURON_H + +#include +#include +#include + +#include "coreneuron/scopmath_core/newton_struct.h" //Newton Struct +#include "coreneuron/nrnoc/membdef.h" //static definitions +#include "coreneuron/nrnoc/nrnoc_ml.h" //Memb_list and mechs info + +#if defined(__cplusplus) +extern "C" { +#endif + +// TODO add external variables required by mechanisms +// extern double celsius; +// extern int nrn_ion_global_map_size; +// extern double** nrn_ion_global_map; + +#ifdef EXPORT_MECHS_FUNCTIONS +// from (auto-generated) mod_func_ptrs.c +extern mod_f_t get_init_function(const char* sym); +extern mod_f_t get_cur_function(const char* sym); +extern mod_f_t get_state_function(const char* sym); +extern mod_f_t get_BA_function(const char* sym, int BA_func_id); +#endif + +// from nrnoc/capac.c +extern void nrn_init_capacitance(struct NrnThread*, struct Memb_list*, int); +; +extern void nrn_cur_capacitance(struct NrnThread* _nt, struct Memb_list* ml, int type); +extern void nrn_alloc_capacitance(double* data, Datum* pdata, int type); + +// from nrnoc/eion.c +extern void nrn_init_ion(struct NrnThread*, struct Memb_list*, int); +extern void nrn_cur_ion(struct NrnThread* _nt, struct Memb_list* ml, int type); +extern void nrn_alloc_ion(double* data, Datum* pdata, int type); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py new file mode 100644 index 000000000..65c747e69 --- /dev/null +++ b/coreneuron/kinderiv.py @@ -0,0 +1,91 @@ +#!/usr/bin/python + +# read the translated mod files and construct _kinderiv.h + +# if _kinderiv.h already exists and is the same as what this script generates +# then do not update it (to avoid re-compiling all the mech c files if +# only one of them changes) + +kftmp = '_tmp_kinderiv.h' +kf = '_kinderiv.h' + +import os + +fnames = [f.replace('.mod', '.c') for f in os.listdir('.') if f.endswith('.mod')] +deriv = [] +kin = [] +for fname in fnames: + f = open(fname, "r") + for line in f: + word = line.split() + if len(word) > 3: + if word[0] == '/*' and word[1] == '_derivimplic_': + deriv.append([word[2], word[3], fname, word[1]]) + if word[0] == '/*' and word[1] == '_kinetic_': 
+ kin.append([word[2], word[3], fname, word[1]]) + f.close() + +fout = open(kftmp, "w") +fout.write(''' +#ifndef _kinderiv_h +#define _kinderiv_h +''') + +fout.write("\n/* data used to construct this file */\n") +for l in [deriv, kin]: + for item in l: + fout.write('/*') + for word in item: + fout.write(' %s' % word) + fout.write(' */\n') + +fout.write("\n/* declarations */\n") +for item in deriv: + fout.write('#pragma acc routine seq\n') + fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) + fout.write('#pragma acc routine seq\n') + fout.write('extern int _cb_%s%s(_threadargsproto_);\n' % (item[0], item[1])) + +for item in kin: + fout.write('#pragma acc routine seq\n') + fout.write('extern int %s%s(void*, double*, _threadargsproto_);\n' % (item[0], item[1])) + +fout.write("\n/* callback indices */\n") +derivoffset = 1 +kinoffset = 1 +for i, item in enumerate(deriv): + fout.write('#define _derivimplic_%s%s %d\n' % (item[0], item[1], i + derivoffset)) +for i, item in enumerate(kin): + fout.write('#define _kinetic_%s%s %d\n' % (item[0], item[1], i + kinoffset)) + +fout.write("\n/* switch cases */\n") +fout.write("\n#define _NRN_DERIVIMPLIC_CASES \\\n") +for item in deriv: + fout.write(" case _derivimplic_%s%s: %s%s(_threadargs_); break; \\\n" % (item[0], item[1], item[0], item[1])) +fout.write("\n") + +fout.write("\n#define _NRN_DERIVIMPLIC_CB_CASES \\\n") +for item in deriv: + fout.write(" case _derivimplic_%s%s: _cb_%s%s(_threadargs_); break; \\\n" % (item[0], item[1], item[0], item[1])) +fout.write("\n") + +fout.write("\n#define _NRN_KINETIC_CASES \\\n") +for item in kin: + fout.write(" case _kinetic_%s%s: %s%s(so, rhs, _threadargs_); break; \\\n" % (item[0], item[1], item[0], item[1])) +fout.write("\n") + +fout.write('\n#endif\n') +fout.close() + +# if kf exists and is same as kftmp, just remove kftmp. Otherwise +# rename kftmp to kf +import filecmp +b = False +try: + b = filecmp.cmp(kftmp, kf) +except: + pass +if b: + os.remove(kftmp) +else: + os.rename(kftmp, kf) diff --git a/coreneuron/mech/cfile/cabvars.h b/coreneuron/mech/cfile/cabvars.h index 846d1bb1f..53e21ee43 100644 --- a/coreneuron/mech/cfile/cabvars.h +++ b/coreneuron/mech/cfile/cabvars.h @@ -30,28 +30,19 @@ THE POSSIBILITY OF SUCH DAMAGE. 
extern "C" { #endif -extern void capac_reg_(void), _passive_reg(void), +extern void capacitance_reg(void), _passive_reg(void), #if EXTRACELLULAR - extracell_reg_(void), + extracell_reg_(void), #endif - _stim_reg(void), - _hh_reg(void), - _netstim_reg(void), - _expsyn_reg(void); + _stim_reg(void), _hh_reg(void), _netstim_reg(void), _expsyn_reg(void); -static void (*mechanism[])(void) = { /* type will start at 3 */ - capac_reg_, - _passive_reg, +static void (*mechanism[])(void) = {/* type will start at 3 */ + capacitance_reg, _passive_reg, #if EXTRACELLULAR - /* extracellular requires special handling and must be type 5 */ - extracell_reg_, + /* extracellular requires special handling and must be type 5 */ + extracell_reg_, #endif - _stim_reg, - _hh_reg, - _expsyn_reg, - _netstim_reg, - 0 -}; + _stim_reg, _hh_reg, _expsyn_reg, _netstim_reg, 0}; #ifdef __cplusplus } diff --git a/coreneuron/mech/mod2c_core_thread.h b/coreneuron/mech/mod2c_core_thread.h new file mode 100644 index 000000000..bd0ae3f8a --- /dev/null +++ b/coreneuron/mech/mod2c_core_thread.h @@ -0,0 +1,129 @@ +#ifndef mod2c_core_thread_h +#define mod2c_core_thread_h + +#include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrnoc/nrnoc_ml.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#if !defined(LAYOUT) +/* 1 means AoS, >1 means AoSoA, <= 0 means SOA */ +#define LAYOUT 1 +#endif +#if LAYOUT >= 1 +#define _STRIDE LAYOUT +#else +#define _STRIDE _cntml_padded + _iml +#endif + +#define _threadargscomma_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _v, +#define _threadargsprotocomma_ \ + int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \ + double _v, +#define _threadargs_ _iml, _cntml_padded, _p, _ppvar, _thread, _nt, _v +#define _threadargsproto_ \ + int _iml, int _cntml_padded, double *_p, Datum *_ppvar, ThreadDatum *_thread, NrnThread *_nt, \ + double _v + +#if 0 + +typedef int (*DIFUN)(_threadargsproto_); +typedef int (*NEWTFUN)(_threadargsproto_); +typedef int (*SPFUN)(struct SparseObj*, double*, _threadargsproto_); +#define difun(arg) (*arg)(_threadargs_); +#define newtfun(arg) (*arg)(_threadargs_); + +#else + +typedef int DIFUN; +typedef int NEWTFUN; +typedef int SPFUN; +#pragma acc routine seq +extern int nrn_derivimplic_steer(int, _threadargsproto_); +#define difun(arg) nrn_derivimplic_steer(arg, _threadargs_); +#pragma acc routine seq +extern int nrn_newton_steer(int, _threadargsproto_); +#define newtfun(arg) nrn_newton_steer(arg, _threadargs_); + +#endif + +typedef struct Elm { + unsigned row; /* Row location */ + unsigned col; /* Column location */ + double* value; /* The value SOA _cntml_padded of them*/ + struct Elm* r_up; /* Link to element in same column */ + struct Elm* r_down; /* in solution order */ + struct Elm* c_left; /* Link to left element in same row */ + struct Elm* c_right; /* in solution order (see getelm) */ +} Elm; +#define ELM0 (Elm*)0 + +typedef struct Item { + Elm* elm; + unsigned norder; /* order of a row */ + struct Item* next; + struct Item* prev; +} Item; +#define ITEM0 (Item*)0 + +typedef Item List; /* list of mixed items */ + +typedef struct SparseObj { /* all the state information */ + Elm** rowst; /* link to first element in row (solution order)*/ + Elm** diag; /* link to pivot element in row (solution order)*/ + void* elmpool; /* no interthread cache line sharing for elements */ + unsigned neqn; /* number of equations */ + unsigned _cntml_padded; /* number of instances */ + unsigned* varord; /* row and column order for pivots */ + 
double* rhs; /* initially- right hand side finally - answer */ + SPFUN oldfun; + unsigned* ngetcall; /* per instance counter for number of calls to _getelm */ + int phase; /* 0-solution phase; 1-count phase; 2-build list phase */ + int numop; + unsigned coef_list_size; + double** coef_list; /* pointer to (first instance) value in _getelm order */ + /* don't really need the rest */ + int nroworder; /* just for freeing */ + Item** roworder; /* roworder[i] is pointer to order item for row i. + Does not have to be in orderlist */ + List* orderlist; /* list of rows sorted by norder + that haven't been used */ + int do_flag; +} SparseObj; + +#pragma acc routine seq +extern int nrn_kinetic_steer(int, SparseObj*, double*, _threadargsproto_); +#define spfun(arg1, arg2, arg3) nrn_kinetic_steer(arg1, arg2, arg3, _threadargs_); + +#pragma acc routine seq +extern int derivimplicit_thread(int, int*, int*, DIFUN, _threadargsproto_); +#pragma acc routine seq +extern int _ss_derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_); +#pragma acc routine seq +extern int +sparse_thread(SparseObj*, int, int*, int*, double*, double, SPFUN, int, _threadargsproto_); +#pragma acc routine seq +int _ss_sparse_thread(SparseObj*, + int n, + int* s, + int* d, + double* t, + double dt, + SPFUN fun, + int linflag, + _threadargsproto_); + +#pragma acc routine seq +extern double _modl_get_dt_thread(NrnThread*); +#pragma acc routine seq +extern void _modl_set_dt_thread(double, NrnThread*); + +void nrn_sparseobj_copyto_device(SparseObj* so); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/coreneuron/mech/mod_func.c.pl b/coreneuron/mech/mod_func.c.pl index a61369a61..f53b74df1 100644 --- a/coreneuron/mech/mod_func.c.pl +++ b/coreneuron/mech/mod_func.c.pl @@ -1,57 +1,44 @@ -#!/usr/bin/perl - -# Copyright (c) 2016, Blue Brain Project -# All rights reserved. - -# Redistribution and use in source and binary forms, with or without modification, -# are permitted provided that the following conditions are met: -# 1. Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# 3. Neither the name of the copyright holder nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. - -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -# THE POSSIBILITY OF SUCH DAMAGE. - - -# Construct the modl_reg() function from a provided list -# of modules. - -# Usage: mod_func.c.pl [MECH1.mod MECH2.mod ...] 
- -@mods=@ARGV; +#!/ usr / bin / perl + +#Copyright(c) 2014 EPFL - BBP, All rights reserved. +# +#THIS SOFTWARE IS PROVIDED BY THE BLUE BRAIN PROJECT "AS IS" +#AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +#THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE BLUE BRAIN PROJECT +#BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +#CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +#SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +#BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +#WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT(INCLUDING NEGLIGENCE +#OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +#IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#Construct the modl_reg() function from a provided list +#of modules. + +#Usage : mod_func.c.pl[MECH1.mod MECH2.mod...] + +@mods = @ARGV; s/\.mod$// foreach @mods; @mods=sort @mods; -print <<"__eof"; +print << "__eof"; #include extern int nrnmpi_myid; extern int nrn_nobanner_; -extern int @{[join ",\n ", map {"_${_}_reg(void)"} @mods]}; +extern int @{[join ",\n ", map{"_${_}_reg(void)"} @mods]}; -void modl_reg(){ - if (!nrn_nobanner_) if (nrnmpi_myid < 1) { - fprintf(stderr, " Additional mechanisms from files\\n"); +void modl_reg() { + if (!nrn_nobanner_) + if (nrnmpi_myid < 1) { + fprintf(stderr, " Additional mechanisms from files\\n"); -@{[join "\n",map {" fprintf(stderr,\" $_.mod\");"} @mods]} - fprintf(stderr, "\\n\\n"); - } + @{[join "\n", map{" fprintf(stderr,\" $_.mod\");"} @mods] } fprintf(stderr, + "\\n\\n"); + } -@{[join "\n",map {" _${_}_reg();"} @mods]} + @{[join "\n", map{" _${_}_reg();"} @mods] } } __eof - diff --git a/coreneuron/mech/mod_func_ptrs.c.pl b/coreneuron/mech/mod_func_ptrs.c.pl new file mode 100755 index 000000000..7c0d8ed00 --- /dev/null +++ b/coreneuron/mech/mod_func_ptrs.c.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl + +# Copyright (c) 2014 EPFL-BBP, All rights reserved. +# +# THIS SOFTWARE IS PROVIDED BY THE BLUE BRAIN PROJECT "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE BLUE BRAIN PROJECT +# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +# OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +# IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +# Constructs the getters for mechanisms function pointers; + +# Usage: mod_func_ptrs.c.pl [PATH-MECH1.mod PATH-MECH2.mod ...] 
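For orientation, this is the shape of the C file the script below is meant to emit, sketched by hand for two hypothetical mechanisms (hh and ExpSyn) and only the init getter; the real suffix list, the matching cur/state getters, and the exact system includes are produced from the .mod files named on the command line, and the extern entries are supplied by the translated mechanisms at link time.

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    #include "coreneuron/coreneuron.h"

    extern void _nrn_init__hh(NrnThread*, Memb_list*, int),
                _nrn_init__ExpSyn(NrnThread*, Memb_list*, int);

    mod_f_t get_init_function(const char* sym) {
        if (strcmp(sym, "hh") == 0) return _nrn_init__hh;
        if (strcmp(sym, "ExpSyn") == 0) return _nrn_init__ExpSyn;
        return NULL;
    }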
+ +@mods=@ARGV; + +s/\.mod$// foreach @mods; + +@mods=sort @mods; + +@funcs=('init','cur','state'); +#@funcs=('init','cur','jacob','state'); + +print <<"__eof"; +#include +#include +#include +#include "coreneuron/coreneuron.h" +__eof + +#Get the correct SUFFIX from each mod file for each mechanism +@suffixes_all=(); +@suffixes_with_cur=(); #with cur function (BREAKPOINT block in mod) + +for $m(@mods) { + $filename = "${m}.mod"; + open $fh, '<', $filename or die "error: unable to open file '$filename' for reading : $!."; + my @content = <$fh>; + close $fh; + my @lines = grep /SUFFIX/, @content; + if (!@lines) { + @lines = grep /POINT_PROCESS/, @content; + } + if (!@lines) { + @lines = grep /ARTIFICIAL_CELL/, @content; + } + if (!@lines) { + die "error: unable to find mechanism name for ${filename}. Add the missing keywork to coreneuron/mech/mod_func.c.pl." + } + @lines[0] =~ s/^\s+|\s+$//g; #remove trailing whitespaces from beginning and end + @lines[0] =~ s/ +/ /; #replace multiple spaces by one + @lines[0] =~ s/[\r\n]+$//; #remove bad endings (breakline) + my @words = split / /, @lines[0]; #get words from first (and only) line containing 'SUFFIX' + my $suffix = @words[1]; #get SUFFIX name as second word" + push(@suffixes_all, $suffix); + + #now add only those with nrn_cur function definition + my @breakpointlines = grep /BREAKPOINT/, @content; + if (scalar @breakpointlines == 1) { + push(@suffixes_with_cur, $suffix); + } +} + +#Output the get of function pointers for init, jacob, current and state functions + +for $f(@funcs) { + +@suffixes_with_this_func=(); +if ($f eq "cur"){ + @suffixes_with_this_func = @suffixes_with_cur; +} +else { + @suffixes_with_this_func = @suffixes_all; +} + +print <<"__eof"; + +extern void \n @{[join ",\n ", map {"_nrn_${f}__${_}(NrnThread*, Memb_list*, int)"} @suffixes_with_this_func]}; + +mod_f_t get_${f}_function(const char * sym) +{ +@{[join "\n",map {" if (strcmp(sym, \"${_}\") == 0) return _nrn_${f}__${_};"} @suffixes_with_this_func]} + return NULL; +} +__eof + +} + +print <<"__eof"; + +mod_f_t get_BA_function(const char * sym, int BA_func_id) +{ + return NULL; +} +__eof + diff --git a/coreneuron/mech/modfile/hh.mod b/coreneuron/mech/modfile/hh.mod index 1d7c19d92..053a15f43 100644 --- a/coreneuron/mech/modfile/hh.mod +++ b/coreneuron/mech/modfile/hh.mod @@ -26,6 +26,7 @@ NEURON { USEION k READ ek WRITE ik NONSPECIFIC_CURRENT il RANGE gnabar, gkbar, gl, el, gna, gk + :GLOBAL minf, hinf, ninf, mtau, htau, ntau RANGE minf, hinf, ninf, mtau, htau, ntau THREADSAFE : assigned GLOBALs will be per thread } @@ -89,7 +90,7 @@ DERIVATIVE states { PROCEDURE rates(v(mV)) { :Computes rate and other constants at current v. :Call once from HOC to initialize inf at resting v. LOCAL alpha, beta, sum, q10 - TABLE minf, mtau, hinf, htau, ninf, ntau DEPEND celsius FROM -100 TO 100 WITH 200 +: TABLE minf, mtau, hinf, htau, ninf, ntau DEPEND celsius FROM -100 TO 100 WITH 200 UNITSOFF q10 = 3^((celsius - 6.3)/10) diff --git a/coreneuron/mech/modfile/netstim.mod b/coreneuron/mech/modfile/netstim.mod index b4179556c..1d5169e50 100644 --- a/coreneuron/mech/modfile/netstim.mod +++ b/coreneuron/mech/modfile/netstim.mod @@ -1,12 +1,19 @@ : $Id: netstim.mod 2212 2008-09-08 14:32:26Z hines $ : comments at end -: the Random idiom has been changed to be compatible with CORENEURON -: ie. no longer allow the low quality scop_exprand generator or connection -: with the hoc Random object. 
Instead make direct use of nrnran123.h -: and, through the BBCOREPOINTER idioms, provide bbcore_read and bbcore_write -: methods for coreneuron and bluron2coreneuron so that a NetStim instance -: will give the same random streams for simulation in either version. +: the Random idiom has been extended to support CoreNEURON. + +: For backward compatibility, noiseFromRandom(hocRandom) can still be used +: as well as the default low-quality scop_exprand generator. +: However, CoreNEURON will not accept usage of the low-quality generator, +: and, if noiseFromRandom is used to specify the random stream, that stream +: must be using the Random123 generator. + +: The recommended idiom for specfication of the random stream is to use +: noiseFromRandom123(id1, id2[, id3]) + +: If any instance uses noiseFromRandom123, then no instance can use noiseFromRandom +: and vice versa. NEURON { ARTIFICIAL_CELL NetStim @@ -30,14 +37,42 @@ ASSIGNED { donotuse } +VERBATIM +#if NRNBBCORE /* running in CoreNEURON */ + +#define IFNEWSTYLE(arg) arg + +#else /* running in NEURON */ + +/* + 1 means noiseFromRandom was called when _ran_compat was previously 0 . + 2 means noiseFromRandom123 was called when _ran_compart was previously 0. +*/ +static int _ran_compat; /* specifies the noise style for all instances */ +#define IFNEWSTYLE(arg) if(_ran_compat == 2) { arg } + +#endif /* running in NEURON */ +ENDVERBATIM + +:backward compatibility +PROCEDURE seed(x) { +VERBATIM +#if !NRNBBCORE +ENDVERBATIM + set_seed(x) +VERBATIM +#endif +ENDVERBATIM +} + INITIAL { - VERBATIM - if (_p_donotuse) - { - nrnran123_setseq((nrnran123_State*)_p_donotuse, 0, 0); - } - ENDVERBATIM + VERBATIM + if (_p_donotuse) { + /* only this style initializes the stream on finitialize */ + IFNEWSTYLE(nrnran123_setseq((nrnran123_State*)_p_donotuse, 0, 0);) + } + ENDVERBATIM on = 0 : off ispike = 0 @@ -80,6 +115,13 @@ FUNCTION invl(mean (ms)) (ms) { } VERBATIM #include "nrnran123.h" + +#if !NRNBBCORE +/* backward compatibility */ +double nrn_random_pick(void* r); +void* nrn_random_arg(int argpos); +int nrn_random_isran123(void* r, uint32_t* id1, uint32_t* id2, uint32_t* id3); +#endif ENDVERBATIM FUNCTION erand() { @@ -90,24 +132,74 @@ VERBATIM : each instance. However, the corresponding hoc Random : distribution MUST be set to Random.negexp(1) */ +#if !NRNBBCORE + if (_ran_compat == 2) { + _lerand = nrnran123_negexp((nrnran123_State*)_p_donotuse); + }else{ + _lerand = nrn_random_pick(_p_donotuse); + } +#else _lerand = nrnran123_negexp((nrnran123_State*)_p_donotuse); +#endif + return _lerand; + }else{ +#if NRNBBCORE + assert(0); +#else + /* + : the old standby. 
Cannot use if reproducible parallel sim + : independent of nhost or which host this instance is on + : is desired, since each instance on this cpu draws from + : the same stream + */ +#endif + } +#if !NRNBBCORE +ENDVERBATIM + erand = exprand(1) +VERBATIM +#endif +ENDVERBATIM +} + +PROCEDURE noiseFromRandom() { +VERBATIM +#if !NRNBBCORE + { + void** pv = (void**)(&_p_donotuse); + if (_ran_compat == 2) { + fprintf(stderr, "NetStim.noiseFromRandom123 was previously called\n"); + assert(0); + } + _ran_compat = 1; + if (ifarg(1)) { + *pv = nrn_random_arg(1); }else{ - _lerand = 0.0; -// assert(0); + *pv = (void*)0; } + } +#endif ENDVERBATIM } + PROCEDURE noiseFromRandom123() { VERBATIM #if !NRNBBCORE { nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse); + if (_ran_compat == 1) { + fprintf(stderr, "NetStim.noiseFromRandom was previously called\n"); + assert(0); + } + _ran_compat = 2; if (*pv) { nrnran123_deletestream(*pv); *pv = (nrnran123_State*)0; } - if (ifarg(2)) { + if (ifarg(3)) { + *pv = nrnran123_newstream3((uint32_t)*getarg(1), (uint32_t)*getarg(2), (uint32_t)*getarg(3)); + }else if (ifarg(2)) { *pv = nrnran123_newstream((uint32_t)*getarg(1), (uint32_t)*getarg(2)); } } @@ -116,26 +208,43 @@ ENDVERBATIM } VERBATIM +#if !NRNBBCORE static void bbcore_write(double* x, int* d, int* xx, int *offset, _threadargsproto_) { if (!noise) { return; } + /* error if using the legacy scop_exprand */ + if (!_p_donotuse) { + fprintf(stderr, "NetStim: cannot use the legacy scop_negexp generator for the random stream.\n"); + assert(0); + } if (d) { uint32_t* di = ((uint32_t*)d) + *offset; - nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse); - nrnran123_getids(*pv, di, di+1); -printf("Netstim bbcore_write %d %d\n", di[0], di[1]); + if (_ran_compat == 1) { + void** pv = (void**)(&_p_donotuse); + /* error if not using Random123 generator */ + if (!nrn_random_isran123(*pv, di, di+1, di+2)) { + fprintf(stderr, "NetStim: Random123 generator is required\n"); + assert(0); + } + }else{ + nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse); + nrnran123_getids3(*pv, di, di+1, di+2); + } + /*printf("Netstim bbcore_write %d %d %d\n", di[0], di[1], di[3]);*/ } - *offset += 2; + *offset += 3; } +#endif + static void bbcore_read(double* x, int* d, int* xx, int* offset, _threadargsproto_) { assert(!_p_donotuse); if (noise) { uint32_t* di = ((uint32_t*)d) + *offset; nrnran123_State** pv = (nrnran123_State**)(&_p_donotuse); - *pv = nrnran123_newstream(di[0], di[1]); + *pv = nrnran123_newstream3(di[0], di[1], di[2]); }else{ return; } - *offset += 2; + *offset += 3; } ENDVERBATIM diff --git a/coreneuron/mech/modfile/stim.mod b/coreneuron/mech/modfile/stim.mod index 687946d30..89a47f76d 100644 --- a/coreneuron/mech/modfile/stim.mod +++ b/coreneuron/mech/modfile/stim.mod @@ -26,8 +26,9 @@ INITIAL { } BREAKPOINT { - at_time(del) - at_time(del+dur) + : for fixed step methos, we can ignore at_time, was introduced for variable timestep, will be deprecated anyway. 
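Returning to the bbcore_write/bbcore_read pair in netstim.mod above: each NetStim instance now serializes three Random123 stream identifiers (rather than two) into the integer array at *offset, and the counting pass (d == NULL) only advances the offset. A self-contained sketch of that bookkeeping, with illustrative names, is:

    #include <cstdint>
    #include <cstdio>
    #include <cassert>

    static void write_ids(int* d, int* offset, const uint32_t ids[3]) {
        if (d) {  // counting pass passes d == NULL and only advances the offset
            uint32_t* di = reinterpret_cast<uint32_t*>(d) + *offset;
            di[0] = ids[0]; di[1] = ids[1]; di[2] = ids[2];
        }
        *offset += 3;
    }

    static void read_ids(const int* d, int* offset, uint32_t ids[3]) {
        const uint32_t* di = reinterpret_cast<const uint32_t*>(d) + *offset;
        ids[0] = di[0]; ids[1] = di[1]; ids[2] = di[2];
        *offset += 3;
    }

    int main() {
        int buf[8] = {0};
        int woff = 0, roff = 0;
        const uint32_t in[3] = {7, 42, 3};
        uint32_t out[3];
        write_ids(buf, &woff, in);
        read_ids(buf, &roff, out);
        assert(woff == 3 && roff == 3 && out[2] == 3);
        std::printf("%u %u %u\n", out[0], out[1], out[2]);
        return 0;
    }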
+ : at_time(del) + : at_time(del+dur) if (t < del + dur && t >= del) { i = amp diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h index 71a55f077..0a20997d3 100644 --- a/coreneuron/nrnconf.h +++ b/coreneuron/nrnconf.h @@ -45,7 +45,7 @@ typedef int (*Pfri)(); typedef char Symbol; #define CACHEVEC 2 -#define VEC_A(i) (_nt->_actual_a[(i)]) +#define VEC_A(i) (_nt->_actual_a[(i)]) #define VEC_B(i) (_nt->_actual_b[(i)]) #define VEC_D(i) (_nt->_actual_d[(i)]) #define VEC_RHS(i) (_nt->_actual_rhs[(i)]) @@ -59,6 +59,7 @@ extern "C" { extern double celsius; extern double t, dt; +extern int rev_dt; extern int secondorder; extern int stoprun; #define tstopbit (1 << 15) @@ -69,6 +70,9 @@ extern void hoc_execerror(const char*, const char*); /* print and abort */ extern void hoc_warning(const char*, const char*); extern void* nrn_cacheline_alloc(void** memptr, size_t size); extern double* makevector(size_t size); /* size in bytes */ +extern double** makematrix(size_t nrow, size_t ncol); +void freevector(double*); +void freematrix(double**); extern void* emalloc(size_t size); extern void* ecalloc(size_t n, size_t size); extern void* erealloc(void* ptr, size_t size); @@ -78,10 +82,9 @@ extern double hoc_Exp(double x); /* will go away at some point */ typedef struct Point_process { - void* _presyn; /* for artificial cell net_event */ - int _i_instance; - short _type; - short _tid; /* NrnThread id */ + int _i_instance; + short _type; + short _tid; /* NrnThread id */ } Point_process; extern char* pnt_name(Point_process* pnt); diff --git a/coreneuron/nrniv/balance.cpp b/coreneuron/nrniv/balance.cpp new file mode 100644 index 000000000..2879de541 --- /dev/null +++ b/coreneuron/nrniv/balance.cpp @@ -0,0 +1,105 @@ +// use LPT algorithm to balance cells so all warps have similar number +// of compartments. +// NB: Ideally we'd balance so that warps have similar ncycle. But we do not +// know how to predict warp quality without an apriori set of cells to +// fill the warp. For large numbers of cells in a warp, +// it is a justifiable speculation to presume that there will be very +// few holes in warp filling. I.e., ncycle = ncompart/warpsize + +// competing objectives are to keep identical cells together and also +// balance warps. + +#include "coreneuron/nrnconf.h" +#include "coreneuron/nrniv/tnode.h" +#include "coreneuron/nrniv/lpt.h" +#include + +int cellorder_nwarp = 0; // 0 means do not balance + +// ordering by warp, then old order +bool warpcmp(const TNode* a, const TNode* b) { + bool res = false; + if (a->groupindex < b->groupindex) { + res = true; + } else if (a->groupindex == b->groupindex) { + if (a->nodevec_index < b->nodevec_index) { + res = true; + } + } + return res; +} + +// order the ncell nodevec roots for balance and return a displacement +// vector specifying the contiguous roots for a warp. +// The return vector should be freed by the caller. +// On entry, nodevec is ordered so that each cell type is together and +// largest cells first. On exit, nodevec is ordered so that warp i +// should contain roots nodevec[displ[i]:displ[i+1]] + +size_t warp_balance(size_t ncell, VecTNode& nodevec) { + if (ncell == 0) { + return 0; + } + + if (cellorder_nwarp == 0) { + return 0; + } + size_t nwarp = size_t(cellorder_nwarp); + // cannot be more warps than cells + nwarp = (ncell < nwarp) ? ncell : nwarp; + + // cellsize vector and location of types. 
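The lpt() helper used further down in balance.cpp (declared in lpt.h) is named after the classic longest-processing-time greedy heuristic: pieces are handed out in decreasing size order, each to the currently lightest bag. The following standalone sketch illustrates that idea; the function name, return convention and balance formula are illustrative, not the actual lpt.cpp implementation.

    #include <vector>
    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    static std::vector<size_t> lpt_sketch(size_t nbag, std::vector<size_t> sizes, double* balance) {
        std::vector<size_t> load(nbag, 0);
        std::vector<size_t> idx(sizes.size());
        for (size_t i = 0; i < idx.size(); ++i) idx[i] = i;
        // visit pieces in decreasing size order
        std::sort(idx.begin(), idx.end(),
                  [&](size_t a, size_t b) { return sizes[a] > sizes[b]; });
        std::vector<size_t> bag(sizes.size());
        size_t total = 0;
        for (size_t k : idx) {
            size_t lightest = 0;
            for (size_t b = 1; b < nbag; ++b)
                if (load[b] < load[lightest]) lightest = b;
            bag[k] = lightest;           // give the piece to the lightest bag
            load[lightest] += sizes[k];
            total += sizes[k];
        }
        size_t maxload = *std::max_element(load.begin(), load.end());
        if (balance) *balance = double(total) / (nbag * maxload);  // 1.0 == perfectly even
        return bag;
    }

    int main() {
        double bal = 0.0;
        std::vector<size_t> bag = lpt_sketch(2, {7, 5, 4, 4, 2}, &bal);
        std::printf("balance=%g first piece in bag %zu\n", bal, bag[0]);  // balance=1
        return 0;
    }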
+ std::vector cellsize(ncell); + std::vector typedispl; + size_t total_compart = 0; + typedispl.push_back(0); // types are already in order + for (size_t i = 0; i < ncell; ++i) { + cellsize[i] = nodevec[i]->treesize; + total_compart += cellsize[i]; + if (i == 0 || nodevec[i]->hash != nodevec[i - 1]->hash) { + typedispl.push_back(typedispl.back() + 1); + } else { + typedispl.back() += 1; + } + } + +#if 0 + size_t ncore = nwarp * warpsize; + size_t cells_per_type = ncell/(typedispl.size() - 1); + size_t ideal_ncycle = total_compart/ncore; + size_t avg_cells_per_warp = total_compart/(ncell*nwarp); +#endif + + size_t ideal_compart_per_warp = total_compart / nwarp; + + size_t min_cells_per_warp = 0; + for (size_t i = 0, sz = 0; sz < ideal_compart_per_warp; ++i) { + ++min_cells_per_warp; + sz += cellsize[i]; + } + + // balance when order is unrestricted (identical cells not together) + // i.e. pieces are cellsize + double best_balance = 0.0; + std::vector* inwarp = lpt(nwarp, cellsize, &best_balance); + printf("best_balance=%g ncell=%ld ntype=%ld nwarp=%ld\n", best_balance, ncell, + typedispl.size() - 1, nwarp); + + // order the roots for balance + for (size_t i = 0; i < ncell; ++i) { + TNode* nd = nodevec[i]; + nd->groupindex = (*inwarp)[i]; + } + std::sort(nodevec.begin(), nodevec.begin() + ncell, warpcmp); + for (size_t i = 0; i < nodevec.size(); ++i) { + TNode* nd = nodevec[i]; + for (size_t j = 0; j < nd->children.size(); ++j) { + nd->children[j]->groupindex = nd->groupindex; + } + nd->nodevec_index = i; + } + + delete inwarp; + + return nwarp; +} diff --git a/coreneuron/nrniv/cellorder.cpp b/coreneuron/nrniv/cellorder.cpp new file mode 100644 index 000000000..195f6cd13 --- /dev/null +++ b/coreneuron/nrniv/cellorder.cpp @@ -0,0 +1,633 @@ +#include "coreneuron/nrnconf.h" +#include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrnoc/nrnoc_decl.h" +#include "coreneuron/nrniv/nrn_assert.h" +#include "coreneuron/nrniv/cellorder.h" +#include "coreneuron/nrniv/tnode.h" +#include "coreneuron/nrniv/lpt.h" + +#include "coreneuron/nrniv/node_permute.h" // for print_quality +#include + +#ifdef _OPENACC +#include +#endif + +int use_interleave_permute; +InterleaveInfo* interleave_info; // nrn_nthread array + +InterleaveInfo::InterleaveInfo() { + nwarp = 0; + nstride = 0; + stridedispl = NULL; + stride = NULL; + firstnode = NULL; + lastnode = NULL; + cellsize = NULL; + + // for print statistics + nnode = NULL; + ncycle = NULL; + idle = NULL; + cache_access = NULL; + child_race = NULL; +} + +InterleaveInfo::~InterleaveInfo() { + if (stride) { + delete[] stride; + delete[] firstnode; + delete[] lastnode; + delete[] cellsize; + } + if (stridedispl) { + delete[] stridedispl; + } + if (idle) { + delete[] nnode; + delete[] ncycle; + delete[] idle; + delete[] cache_access; + delete[] child_race; + } +} + +void create_interleave_info() { + destroy_interleave_info(); + interleave_info = new InterleaveInfo[nrn_nthread]; +} + +void destroy_interleave_info() { + if (interleave_info) { + delete[] interleave_info; + interleave_info = NULL; + } +} + +// more precise visualization of the warp quality +// can be called after admin2 +static void print_quality2(int iwarp, InterleaveInfo& ii, int* p) { + int pc = (iwarp == 0); // print warp 0 + pc = 0; // turn off printing + int nodebegin = ii.lastnode[iwarp]; + int* stride = ii.stride + ii.stridedispl[iwarp]; + int ncycle = ii.cellsize[iwarp]; + +#if 0 + int nodeend = ii.lastnode[iwarp+1]; + int nnode = ii.lastnode[ii.nwarp]; +#endif + + int inode = nodebegin; + + 
size_t nn = 0; // number of nodes in warp. '.' + size_t nx = 0; // number of idle cores on all cycles. 'X' + size_t ncacheline = 0; + ; // number of parent memory cacheline accesses. + // assmue warpsize is max number in a cachline so all o + size_t ncr = 0; // number of child race. nchild-1 of same parent in same cycle + + for (int icycle = 0; icycle < ncycle; ++icycle) { + int s = stride[icycle]; + int lastp = -2; + if (pc) + printf(" "); + std::set crace; // how many children have same parent in a cycle + for (int icore = 0; icore < warpsize; ++icore) { + char ch = '.'; + if (icore < s) { + int par = p[inode]; + if (crace.find(par) != crace.end()) { + ch = 'r'; + ++ncr; + } else { + crace.insert(par); + } + + if (par != lastp + 1) { + ch = (ch == 'r') ? 'R' : 'o'; + ++ncacheline; + } + lastp = p[inode++]; + ++nn; + } else { + ch = 'X'; + ++nx; + } + if (pc) + printf("%c", ch); + } + if (pc) + printf("\n"); + } + + ii.nnode[iwarp] = nn; + ii.ncycle[iwarp] = size_t(ncycle); + ii.idle[iwarp] = nx; + ii.cache_access[iwarp] = ncacheline; + ii.child_race[iwarp] = ncr; + if (pc) + printf("warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access, %ld child races\n", + iwarp, nn, ncycle, nx, ncacheline, ncr); +} + +static void print_quality1(int iwarp, InterleaveInfo& ii, int ncell, int* p) { + int pc = ((iwarp == 0) || iwarp == (ii.nwarp - 1)); // warp not to skip printing + pc = 0; // turn off printing. + int* stride = ii.stride; + int cellbegin = iwarp * warpsize; + int cellend = cellbegin + warpsize; + cellend = (cellend < stride[0]) ? cellend : stride[0]; + + int ncycle = 0; + for (int i = cellbegin; i < cellend; ++i) { + if (ncycle < ii.cellsize[i]) { + ncycle = ii.cellsize[i]; + } + } + nrn_assert(ncycle == ii.cellsize[cellend - 1]); + nrn_assert(ncycle <= ii.nstride); + + int ncell_in_warp = cellend - cellbegin; + + size_t n = 0; // number of nodes in warp (not including roots) + size_t nx = 0; // number of idle cores on all cycles. X + size_t ncacheline = 0; + ; // number of parent memory cacheline accesses. 
+ // assume warpsize is max number in a cachline so + // first core has all o + + int inode = ii.firstnode[cellbegin]; + for (int icycle = 0; icycle < ncycle; ++icycle) { + int sbegin = ncell - stride[icycle] - cellbegin; + int lastp = -2; + if (pc) + printf(" "); + for (int icore = 0; icore < warpsize; ++icore) { + char ch = '.'; + if (icore < ncell_in_warp && icore >= sbegin) { + int par = p[inode + icore]; + if (par != lastp + 1) { + ch = 'o'; + ++ncacheline; + } + lastp = par; + ++n; + } else { + ch = 'X'; + ++nx; + } + if (pc) + printf("%c", ch); + } + if (pc) + printf("\n"); + inode += ii.stride[icycle + 1]; + } + + ii.nnode[iwarp] = n; + ii.ncycle[iwarp] = (size_t)ncycle; + ii.idle[iwarp] = nx; + ii.cache_access[iwarp] = ncacheline; + ii.child_race[iwarp] = 0; + if (pc) + printf("warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access\n", iwarp, n, ncycle, nx, + ncacheline); +} + +static void warp_balance(int ith, InterleaveInfo& ii) { + size_t nwarp = size_t(ii.nwarp); + size_t smm[4][3]; // sum_min_max see cp below + for (size_t j = 0; j < 4; ++j) { + smm[j][0] = 0; + smm[j][1] = 1000000000; + smm[j][2] = 0; + } + double emax = 0.0, emin = 1.0; + for (size_t i = 0; i < nwarp; ++i) { + size_t n = ii.nnode[i]; + double e = double(n) / (n + ii.idle[i]); + if (emax < e) { + emax = e; + } + if (emin > e) { + emin = e; + } + size_t s[4] = {n, ii.idle[i], ii.cache_access[i], ii.child_race[i]}; + for (size_t j = 0; j < 4; ++j) { + smm[j][0] += s[j]; + if (smm[j][1] > s[j]) { + smm[j][1] = s[j]; + } + if (smm[j][2] < s[j]) { + smm[j][2] = s[j]; + } + } + } + std::vector v(nwarp); + for (size_t i = 0; i < nwarp; ++i) { + v[i] = ii.ncycle[i]; + } + double bal = load_balance(v); + printf("thread %d nwarp=%ld balance=%g warp_efficiency %g to %g\n", ith, nwarp, bal, emin, + emax); + const char* cp[4] = {"nodes", "idle", "ca", "cr"}; + for (size_t i = 0; i < 4; ++i) { + printf(" %s=%ld (%ld:%ld)", cp[i], smm[i][0], smm[i][1], smm[i][2]); + } + printf("\n"); +} + +int* interleave_order(int ith, int ncell, int nnode, int* parent) { + // ensure parent of root = -1 + for (int i = 0; i < ncell; ++i) { + if (parent[i] == 0) { + parent[i] = -1; + } + } + + int nwarp, nstride, *stride, *firstnode, *lastnode, *cellsize, *stridedispl; + + int* order = node_order(ncell, nnode, parent, nwarp, nstride, stride, firstnode, lastnode, + cellsize, stridedispl); + + if (interleave_info) { + InterleaveInfo& ii = interleave_info[ith]; + ii.nwarp = nwarp; + ii.nstride = nstride; + ii.stridedispl = stridedispl; + ii.stride = stride; + ii.firstnode = firstnode; + ii.lastnode = lastnode; + ii.cellsize = cellsize; + if (0 && ith == 0 && use_interleave_permute == 1) { + printf("ith=%d nstride=%d ncell=%d nnode=%d\n", ith, nstride, ncell, nnode); + for (int i = 0; i < ncell; ++i) { + printf("icell=%d cellsize=%d first=%d last=%d\n", i, cellsize[i], firstnode[i], + lastnode[i]); + } + for (int i = 0; i < nstride; ++i) { + printf("istride=%d stride=%d\n", i, stride[i]); + } + } + if (ith == 0) { + // needed for print_quality[12] and done once here to save time + int* p = new int[nnode]; + for (int i = 0; i < nnode; ++i) { + p[i] = parent[i]; + } + permute_ptr(p, nnode, order); + node_permute(p, nnode, order); + + ii.nnode = new size_t[nwarp]; + ii.ncycle = new size_t[nwarp]; + ii.idle = new size_t[nwarp]; + ii.cache_access = new size_t[nwarp]; + ii.child_race = new size_t[nwarp]; + for (int i = 0; i < nwarp; ++i) { + if (use_interleave_permute == 1) { + print_quality1(i, interleave_info[ith], ncell, p); + } + if 
(use_interleave_permute == 2) { + print_quality2(i, interleave_info[ith], p); + } + } + delete[] p; + warp_balance(ith, interleave_info[ith]); + } + } + + return order; +} + +#if INTERLEAVE_DEBUG // only the cell per core style +static int** cell_indices_debug(NrnThread& nt, InterleaveInfo& ii) { + int ncell = nt.ncell; + int nnode = nt.end; + int* parents = nt._v_parent_index; + + // we expect the nodes to be interleave ordered with smallest cell first + // establish consistency with ii. + // first ncell parents are -1 + for (int i = 0; i < ncell; ++i) { + nrn_assert(parents[i] == -1); + } + int* sz; + int* cell; + sz = new int[ncell]; + cell = new int[nnode]; + for (int i = 0; i < ncell; ++i) { + sz[i] = 0; + cell[i] = i; + } + for (int i = ncell; i < nnode; ++i) { + cell[i] = cell[parents[i]]; + sz[cell[i]] += 1; + } + + // cells are in inceasing sz order; + for (int i = 1; i < ncell; ++i) { + nrn_assert(sz[i - 1] <= sz[i]); + } + // same as ii.cellsize + for (int i = 0; i < ncell; ++i) { + nrn_assert(sz[i] == ii.cellsize[i]); + } + + int** cellindices = new int*[ncell]; + for (int i = 0; i < ncell; ++i) { + cellindices[i] = new int[sz[i]]; + sz[i] = 0; // restart sz counts + } + for (int i = ncell; i < nnode; ++i) { + cellindices[cell[i]][sz[cell[i]]] = i; + sz[cell[i]] += 1; + } + // cellindices first and last same as ii first and last + for (int i = 0; i < ncell; ++i) { + nrn_assert(cellindices[i][0] == ii.firstnode[i]); + nrn_assert(cellindices[i][sz[i] - 1] == ii.lastnode[i]); + } + + delete[] sz; + delete[] cell; + + return cellindices; +} + +static int*** cell_indices_threads; +void mk_cell_indices() { + cell_indices_threads = new int**[nrn_nthread]; + for (int i = 0; i < nrn_nthread; ++i) { + NrnThread& nt = nrn_threads[i]; + if (nt.ncell) { + cell_indices_threads[i] = cell_indices_debug(nt, interleave_info[i]); + } else { + cell_indices_threads[i] = NULL; + } + } +} +#endif // INTERLEAVE_DEBUG + +#if 1 +#define GPU_V(i) nt->_actual_v[i] +#define GPU_A(i) nt->_actual_a[i] +#define GPU_B(i) nt->_actual_b[i] +#define GPU_D(i) nt->_actual_d[i] +#define GPU_RHS(i) nt->_actual_rhs[i] +#define GPU_PARENT(i) nt->_v_parent_index[i] + +// How does the interleaved permutation with stride get used in +// triagularization? + +// each cell in parallel regardless of inhomogeneous topology +static void triang_interleaved(NrnThread* nt, + int icell, + int icellsize, + int nstride, + int* stride, + int* lastnode) { + int i = lastnode[icell]; + for (int istride = nstride - 1; istride >= 0; --istride) { + if (istride < icellsize) { // only first icellsize strides matter + // what is the index + int ip = GPU_PARENT(i); +#ifndef _OPENACC + nrn_assert(ip >= 0); // if (ip < 0) return; +#endif + double p = GPU_A(i) / GPU_D(i); + GPU_D(ip) -= p * GPU_B(i); + GPU_RHS(ip) -= p * GPU_RHS(i); + i -= stride[istride]; + } + } +} + +// back substitution? 
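Before the interleaved variants, it may help to see the serial tree solve they reorganize: the triangularization above eliminates each node into its parent, and the back-substitution that follows walks back down from the root. A minimal single-tree reference, on plain arrays named after the GPU_A/GPU_B/GPU_D/GPU_RHS macros and assuming the usual ordering where every parent index is smaller than its child's index:

    #include <cstdio>

    static void tree_solve(int nnode, const int* parent,
                           const double* a, const double* b, double* d, double* rhs) {
        // triangularization: walking backwards eliminates every node into its parent once
        for (int i = nnode - 1; i > 0; --i) {
            int ip = parent[i];
            double p = a[i] / d[i];
            d[ip] -= p * b[i];
            rhs[ip] -= p * rhs[i];
        }
        // back substitution: root first, then every child using its parent's value
        rhs[0] /= d[0];
        for (int i = 1; i < nnode; ++i) {
            int ip = parent[i];
            rhs[i] -= b[i] * rhs[ip];
            rhs[i] /= d[i];
        }
    }

    int main() {
        // 3-compartment unbranched cable: node 1 -> 0, node 2 -> 1
        const int parent[3] = {-1, 0, 1};
        const double a[3] = {0, -1, -1}, b[3] = {0, -1, -1};
        double d[3] = {3, 3, 3}, rhs[3] = {1, 1, 1};
        tree_solve(3, parent, a, b, d, rhs);
        std::printf("%g %g %g\n", rhs[0], rhs[1], rhs[2]);
        return 0;
    }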
+static void bksub_interleaved(NrnThread* nt, + int icell, + int icellsize, + int nstride, + int* stride, + int* firstnode) { + if (nstride) { + } // otherwise unused + int i = firstnode[icell]; + GPU_RHS(icell) /= GPU_D(icell); // the root + for (int istride = 0; istride < icellsize; ++istride) { + int ip = GPU_PARENT(i); +#ifndef _OPENACC + nrn_assert(ip >= 0); +#endif + GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); + GPU_RHS(i) /= GPU_D(i); + i += stride[istride + 1]; + } +} + +// icore ranges [0:warpsize) ; stride[ncycle] +static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) { + int icycle = ncycle - 1; + int istride = stride[icycle]; + int i = lastnode - istride + icore; +#if !defined(_OPENACC) + int ii = i; +#endif + + #pragma acc loop seq + for (;;) { // ncycle loop +#if !defined(_OPENACC) + // serial test, gpu does this in parallel + for (int icore = 0; icore < warpsize; ++icore) { + int i = ii + icore; +#endif + if (icore < istride) { // most efficient if istride equal warpsize + // what is the index + int ip = GPU_PARENT(i); + double p = GPU_A(i) / GPU_D(i); + #pragma acc atomic update + GPU_D(ip) -= p * GPU_B(i); + #pragma acc atomic update + GPU_RHS(ip) -= p * GPU_RHS(i); + } +#if !defined(_OPENACC) + } +#endif + if (icycle == 0) { + break; + } + --icycle; + istride = stride[icycle]; + i -= istride; +#if !defined(_OPENACC) + ii -= istride; +#endif + } +} + +// icore ranges [0:warpsize) ; stride[ncycle] +static void bksub_interleaved2(NrnThread* nt, + int root, + int lastroot, + int icore, + int ncycle, + int* stride, + int firstnode) { +#if !defined(_OPENACC) + for (int i = root; i < lastroot; i += 1) { +#else + #pragma acc loop seq + for (int i = root; i < lastroot; i += warpsize) { +#endif + GPU_RHS(i) /= GPU_D(i); // the root + } + + int i = firstnode + icore; +#if !defined(_OPENACC) + int ii = i; +#endif + for (int icycle = 0; icycle < ncycle; ++icycle) { + int istride = stride[icycle]; +#if !defined(_OPENACC) + // serial test, gpu does this in parallel + for (int icore = 0; icore < warpsize; ++icore) { + int i = ii + icore; +#endif + if (icore < istride) { + int ip = GPU_PARENT(i); + GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); + GPU_RHS(i) /= GPU_D(i); + } + i += istride; +#if !defined(_OPENACC) + } + ii += istride; +#endif + } +} + +#ifdef ENABLE_CUDA_INTERFACE +void solve_interleaved_launcher(NrnThread* nt, InterleaveInfo* info, int ncell); +#endif + +int temp1[1024] = {0}; +int temp2[1024] = {0}; +int temp3[1024] = {0}; + +void solve_interleaved2(int ith) { + static int foo = 1; + NrnThread* nt = nrn_threads + ith; + InterleaveInfo& ii = interleave_info[ith]; + int nwarp = ii.nwarp; + if (nwarp == 0) { + return; + } + int* ncycles = ii.cellsize; // nwarp of these + int* stridedispl = ii.stridedispl; // nwarp+1 of these + int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize) + int* rootbegin = ii.firstnode; // nwarp+1 of these + int* nodebegin = ii.lastnode; // nwarp+1 of these +#ifdef _OPENACC + int nstride = stridedispl[nwarp]; + int stream_id = nt->stream_id; +#endif + + int ncore = nwarp * warpsize; +#if 0 && defined(ENABLE_CUDA_INTERFACE) // not implemented + NrnThread* d_nt = (NrnThread*) acc_deviceptr(nt); + InterleaveInfo* d_info = (InterleaveInfo*) acc_deviceptr(interleave_info+ith); + solve_interleaved2_launcher(d_nt, d_info); +#else +#ifdef _OPENACC +// #pragma acc kernels loop gang(1), vector(32) present(nt[0:1], strides[0:nstride],... 
+ #pragma acc parallel loop present( \ + nt[0 : 1], \ + strides[0 : nstride], \ + ncycles[0 : nwarp], \ + stridedispl[0 : nwarp + 1], \ + rootbegin[0 : nwarp + 1], \ + nodebegin[0 : nwarp + 1]) if (nt->compute_gpu) \ + async(stream_id) +#endif + for (int icore = 0; icore < ncore; ++icore) { + int iwarp = icore / warpsize; // figure out the >> value + int ic = icore & (warpsize - 1); // figure out the & mask + int ncycle = ncycles[iwarp]; + int* stride = strides + stridedispl[iwarp]; + int root = rootbegin[iwarp]; + int lastroot = rootbegin[iwarp + 1]; + int firstnode = nodebegin[iwarp]; + int lastnode = nodebegin[iwarp + 1]; +// temp1[icore] = ic; +// temp2[icore] = ncycle; +// temp3[icore] = stride - strides; +#if !defined(_OPENACC) + if (ic == 0) { // serial test mode. triang and bksub do all cores in warp +#endif + triang_interleaved2(nt, ic, ncycle, stride, lastnode); + bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); +#if !defined(_OPENACC) + } // serial test mode +#endif + } +#ifdef _OPENACC +#pragma acc wait(nt->stream_id) +#endif +#endif + if (foo == 1) { + return; + } + foo = 0; + for (int i = 0; i < ncore; ++i) { + printf("%d => %d %d %d\n", i, temp1[i], temp2[i], temp3[i]); + } +} + +void solve_interleaved1(int ith) { + NrnThread* nt = nrn_threads + ith; + int ncell = nt->ncell; + if (ncell == 0) { + return; + } + InterleaveInfo& ii = interleave_info[ith]; + int nstride = ii.nstride; + int* stride = ii.stride; + int* firstnode = ii.firstnode; + int* lastnode = ii.lastnode; + int* cellsize = ii.cellsize; +#if _OPENACC + int stream_id = nt->stream_id; +#endif + +#ifdef ENABLE_CUDA_INTERFACE + NrnThread* d_nt = (NrnThread*)acc_deviceptr(nt); + InterleaveInfo* d_info = (InterleaveInfo*)acc_deviceptr(interleave_info + ith); + solve_interleaved_launcher(d_nt, d_info, ncell); +#else +#ifdef _OPENACC + #pragma acc parallel loop present( \ + nt[0 : 1], stride[0 : nstride], \ + firstnode[0 : ncell], lastnode[0 : ncell], \ + cellsize[0 : ncell]) if (nt->compute_gpu) \ + async(stream_id) +#endif + for (int icell = 0; icell < ncell; ++icell) { + int icellsize = cellsize[icell]; + triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); + bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); + } +#ifdef _OPENACC +#pragma acc wait(nt->stream_id) +#endif +#endif +} + +void solve_interleaved(int ith) { + if (use_interleave_permute != 1) { + solve_interleaved2(ith); + } else { + solve_interleaved1(ith); + } +} + +#endif diff --git a/coreneuron/nrniv/cellorder.h b/coreneuron/nrniv/cellorder.h new file mode 100644 index 000000000..dd5f451db --- /dev/null +++ b/coreneuron/nrniv/cellorder.h @@ -0,0 +1,47 @@ +#ifndef cellorder_h +#define cellorder_h + +int* interleave_order(int ith, int ncell, int nnode, int* parent); + +void create_interleave_info(); +void destroy_interleave_info(); + +class InterleaveInfo { + public: + InterleaveInfo(); + virtual ~InterleaveInfo(); + int nwarp; // used only by interleave2 + int nstride; + int* stridedispl; // interleave2: nwarp+1 + int* stride; // interleave2: stride length is stridedispl[nwarp] + int* firstnode; // interleave2: rootbegin nwarp+1 displacements + int* lastnode; // interleave2: nodebegin nwarp+1 displacements + int* cellsize; // interleave2: ncycles nwarp + + // statistics (nwarp of each) + size_t* nnode; + size_t* ncycle; + size_t* idle; + size_t* cache_access; + size_t* child_race; +}; + +// interleaved from cellorder2.cpp +int* node_order(int ncell, + int nnode, + int* parents, + int& nwarp, + 
int& nstride, + int*& stride, + int*& firstnode, + int*& lastnode, + int*& cellsize, + int*& stridedispl); + +#define INTERLEAVE_DEBUG 0 + +#if INTERLEAVE_DEBUG +void mk_cell_indices(); +#endif + +#endif diff --git a/coreneuron/nrniv/cellorder1.cpp b/coreneuron/nrniv/cellorder1.cpp new file mode 100644 index 000000000..64346373f --- /dev/null +++ b/coreneuron/nrniv/cellorder1.cpp @@ -0,0 +1,671 @@ +#include +#include "coreneuron/nrniv/nrn_assert.h" +#include "coreneuron/nrniv/cellorder.h" +#include "coreneuron/nrniv/tnode.h" + +// just for use_interleave_permute +#include "coreneuron/nrniv/nrniv_decl.h" + +#include +#include +#include +#include + +using namespace std; + +static size_t groupsize = 32; + +static bool tnode_earlier(TNode* a, TNode* b) { + bool result = false; + if (a->treesize < b->treesize) { // treesize dominates + result = true; + } else if (a->treesize == b->treesize) { + if (a->hash < b->hash) { // if treesize same, keep identical trees together + result = true; + } else if (a->hash == b->hash) { + result = a->nodeindex < b->nodeindex; // identical trees ordered by nodeindex + } + } + return result; +} + +static bool ptr_tnode_earlier(TNode* a, TNode* b) { + return tnode_earlier(a, b); +} + +TNode::TNode(int ix) { + nodeindex = ix; + cellindex = 0; + groupindex = 0; + level = 0; + hash = 0; + treesize = 1; + nodevec_index = 0; + treenode_order = 0; + parent = NULL; + children.reserve(2); +} + +TNode::~TNode() { +} + +size_t TNode::mkhash() { // call on all nodes in leaf to root order + // concept from http://stackoverflow.com/questions/20511347/a-good-hash-function-for-a-vector + std::sort(children.begin(), children.end(), ptr_tnode_earlier); + hash = children.size(); + treesize = 1; + for (size_t i = 0; i < children.size(); ++i) { // need sorted by child hash + hash ^= children[i]->hash + 0x9e3779b9 + (hash << 6) + (hash >> 2); + treesize += children[i]->treesize; + } + return hash; // hash of leaf nodes is 0 +} + +static void tree_analysis(int* parent, int nnode, int ncell, VecTNode&); +static void node_interleave_order(int ncell, VecTNode&); +static void admin1(int ncell, + VecTNode& nodevec, + int& nwarp, + int& nstride, + int*& stride, + int*& firstnode, + int*& lastnode, + int*& cellsize); +static void admin2(int ncell, + VecTNode& nodevec, + int& nwarp, + int& nstride, + int*& stridedispl, + int*& strides, + int*& rootbegin, + int*& nodebegin, + int*& ncycles); +static void check(VecTNode&); +static void prtree(VecTNode&); + +typedef std::pair TNI; +typedef std::map > HashCnt; +typedef vector TNIVec; + +static char* stree(TNode* nd) { + char s[1000]; + + if (nd->treesize > 100) { + return strdup(""); + } + s[0] = '('; + s[1] = '\0'; + for (size_t i = 0; i < nd->children.size(); ++i) { // need sorted by child hash + char* sr = stree(nd->children[i]); + strcat(s, sr); + free(sr); + } + strcat(s, ")"); + return strdup(s); +} + +/* +assess the quality of the ordering. The measure is the size of a contiguous +list of nodes whose parents have the same order. How many contiguous lists +have that same size. How many nodes participate in that size list. +Modify the quality measure from experience with performance. 
Start with +list of (nnode, size_participation) +*/ +static void quality(VecTNode& nodevec, size_t max = 32) { + size_t qcnt = 0; // how many contiguous nodes have contiguous parents + + // first ncell nodes are by definition in contiguous order + for (size_t i = 0; i < nodevec.size(); ++i) { + if (nodevec[i]->parent != NULL) { + break; + } + qcnt += 1; + } + size_t ncell = qcnt; + + // key is how many parents in contiguous order + // value is number of nodes that participate in that + map qual; + size_t ip_last = 10000000000; + for (size_t i = ncell; i < nodevec.size(); ++i) { + size_t ip = nodevec[i]->parent->nodevec_index; + // i%max == 0 means that if we start a warp with 8 and then have 32 + // the 32 is broken into 24 and 8. (modify if the arrangement during + // gaussian elimination becomes more sophisticated.( + if (ip == ip_last + 1 && i % max != 0) { // contiguous + qcnt += 1; + } else { + if (qcnt == 1) { + // printf("unique %ld p=%ld ix=%d\n", i, ip, nodevec[i]->nodeindex); + } + qual[max] += (qcnt / max) * max; + size_t x = qcnt % max; + if (x) { + qual[x] += x; + } + qcnt = 1; + } + ip_last = ip; + } + qual[max] += (qcnt / max) * max; + size_t x = qcnt % max; + if (x) { + qual[x] += x; + } + + // print result + qcnt = 0; +#if 0 + for (map::iterator it = qual.begin(); it != qual.end(); ++it) { + qcnt += it->second; + printf("%6ld %6ld\n", it->first, it->second); + } +#endif +#if 0 + printf("qual.size=%ld qual total nodes=%ld nodevec.size=%ld\n", + qual.size(), qcnt, nodevec.size()); +#endif + + // how many race conditions. ie refer to same parent on different core + // of warp (max cores) or parent in same group of max. + size_t maxip = ncell; + size_t nrace1 = 0; + size_t nrace2 = 0; + set ipused; + for (size_t i = ncell; i < nodevec.size(); ++i) { + TNode* nd = nodevec[i]; + size_t ip = nd->parent->nodevec_index; + if (i % max == 0) { + maxip = i; + ipused.clear(); + } + if (ip >= maxip) { + nrace1 += 1; + } /*else*/ + { + if (ipused.find(ip) != ipused.end()) { + nrace2 += 1; + if (ip >= maxip) { + // printf("race for parent %ld (parent in same group as multiple users))\n", + // ip); + } + } else { + ipused.insert(ip); + } + } + } +#if 0 + printf("nrace = %ld (parent in same group of %ld nodes)\n", nrace1, max); + printf("nrace = %ld (parent used more than once by same group of %ld nodes)\n", nrace2, max); +#endif +} + +size_t level_from_root(VecTNode& nodevec) { + size_t maxlevel = 0; + for (size_t i = 0; i < nodevec.size(); ++i) { + TNode* nd = nodevec[i]; + if (nd->parent) { + nd->level = nd->parent->level + 1; + if (maxlevel < nd->level) { + maxlevel = nd->level; + } + } else { + nd->level = 0; + } + } + return maxlevel; +} + +size_t level_from_leaf(VecTNode& nodevec) { + size_t maxlevel = 0; + for (size_t i = nodevec.size() - 1; true; --i) { + TNode* nd = nodevec[i]; + size_t lmax = 0; + for (size_t ichild = 0; ichild < nd->children.size(); ++ichild) { + if (lmax <= nd->children[ichild]->level) { + lmax = nd->children[ichild]->level + 1; + } + } + nd->level = lmax; + if (maxlevel < lmax) { + maxlevel = lmax; + } + if (i == 0) { + break; + } + } + return maxlevel; +} + +static void set_cellindex(int ncell, VecTNode& nodevec) { + for (int i = 0; i < ncell; ++i) { + nodevec[i]->cellindex = i; + } + for (size_t i = 0; i < nodevec.size(); ++i) { + TNode& nd = *nodevec[i]; + for (size_t j = 0; j < nd.children.size(); ++j) { + TNode* cnode = nd.children[j]; + cnode->cellindex = nd.cellindex; + } + } +} + +static void set_groupindex(VecTNode& nodevec) { + for (size_t i = 0; i < 
nodevec.size(); ++i) { + TNode* nd = nodevec[i]; + if (nd->parent) { + nd->groupindex = nd->parent->groupindex; + } else { + nd->groupindex = i / groupsize; + } + } +} + +#if 0 +#define MSS MSS_ident_stat +typedef map MSS; +static bool vsmss_comp(const pair& a, const pair& b) { + bool result = false; + const MSS::iterator& aa = a.second->begin(); + const MSS::iterator& bb = b.second->begin(); + if (aa->first < bb->first) { + result = true; + }else if (aa->first == bb->first) { + if (aa->second < bb->second) { + result = true; + } + } + return result; +} +#endif + +// how many identical trees and their levels +// print when more than one instance of a type +// reverse the sense of levels (all leaves are level 0) to get a good +// idea of the depth of identical subtrees. +static void ident_statistic(VecTNode& nodevec, size_t ncell) { + // reverse sense of levels + // size_t maxlevel = level_from_leaf(nodevec); + size_t maxlevel = level_from_root(nodevec); + + // # in each level + vector > n_in_level(maxlevel + 1); + for (size_t i = 0; i <= maxlevel; ++i) { + n_in_level[i].resize(ncell / groupsize); + } + for (size_t i = 0; i < nodevec.size(); ++i) { + n_in_level[nodevec[i]->level][nodevec[i]->groupindex]++; + } + printf("n_in_level.size = %ld\n", n_in_level.size()); + for (size_t i = 0; i < n_in_level.size(); ++i) { + printf("%5ld\n", i); + for (size_t j = 0; j < n_in_level[i].size(); ++j) { + printf(" %5ld", n_in_level[i][j]); + } + printf("\n"); + } + +#if 0 + typedef map MSMSS; + typedef vector > VSMSS; + MSMSS info; + for (size_t i=0; i < nodevec.size(); ++i) { + TNode* nd = nodevec[i]; + info[nd->hash][nd->level]++; + } + + VSMSS vinfo; + for (MSMSS::iterator i = info.begin(); i != info.end(); ++i) { + vinfo.push_back(pair(i->first, &(i->second))); + } + std::sort(vinfo.begin(), vinfo.end(), vsmss_comp); + + for (VSMSS::iterator i = vinfo.begin(); i < vinfo.end(); ++i) { + MSS* ival = i->second; + if (ival->size() > 1 || ival->begin()->second > 8) { + printf("hash %ld", i->first); + for (MSS::iterator j = ival->begin(); j != ival->end(); ++j) { + printf(" (%ld, %ld)", j->first, j->second); + } + printf("\n"); + } + } + printf("max level = %ld\n", maxlevel); +#endif +} +#undef MSS + +// for cells with same size, keep identical trees together + +// parent is (unpermuted) nnode length vector of parent node indices. +// return a permutation (of length nnode) which orders cells of same +// size so that identical trees are grouped together. +// Note: cellorder[ncell:nnode] are the identify permutation. 
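The permutation returned by node_order below follows the convention order[old_index] = new_index (see the later assignment nodeorder[nd.nodeindex] = i). A small self-contained illustration of what applying such a permutation to a parent vector means, using an illustrative helper rather than the actual permute_ptr/node_permute from node_permute.cpp: each entry moves to its new slot and its stored parent index is rewritten into the new numbering.

    #include <vector>
    #include <cstdio>

    static std::vector<int> apply_order(const std::vector<int>& parent,
                                        const std::vector<int>& order) {
        std::vector<int> newparent(parent.size());
        for (size_t old = 0; old < parent.size(); ++old) {
            int p = parent[old];
            newparent[order[old]] = (p < 0) ? -1 : order[p];  // roots keep parent -1
        }
        return newparent;
    }

    int main() {
        // one 4-node cell: 0 is the root, 1 and 2 are children of 0, 3 is a child of 2
        std::vector<int> parent = {-1, 0, 0, 2};
        std::vector<int> order = {0, 2, 1, 3};  // swap nodes 1 and 2 in the new numbering
        std::vector<int> np = apply_order(parent, order);
        for (int p : np) std::printf("%d ", p);  // prints: -1 0 0 1
        std::printf("\n");
        return 0;
    }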
+ +int* node_order(int ncell, + int nnode, + int* parent, + int& nwarp, + int& nstride, + int*& stride, + int*& firstnode, + int*& lastnode, + int*& cellsize, + int*& stridedispl) { + VecTNode nodevec; + if (0) + prtree(nodevec); // avoid unused warning + + // nodevec[0:ncell] in increasing size, with identical trees together, + // and otherwise nodeindex order + tree_analysis(parent, nnode, ncell, nodevec); + check(nodevec); + + set_cellindex(ncell, nodevec); + set_groupindex(nodevec); + level_from_root(nodevec); + + // nodevec[ncell:nnode] cells are interleaved in nodevec[0:ncell] cell order + if (use_interleave_permute == 1) { + node_interleave_order(ncell, nodevec); + } else { + group_order2(nodevec, groupsize, ncell); + } + check(nodevec); + +#if 0 + for (int i=0; i < ncell; ++i) { + TNode& nd = *nodevec[i]; + printf("%d size=%ld hash=%ld ix=%d\n", i, nd.treesize, nd.hash, nd.nodeindex); + } +#endif + + if (0) + ident_statistic(nodevec, ncell); + quality(nodevec); + + // the permutation + int* nodeorder = new int[nnode]; + for (int i = 0; i < nnode; ++i) { + TNode& nd = *nodevec[i]; + nodeorder[nd.nodeindex] = i; + } + + // administrative statistics for gauss elimination + if (use_interleave_permute == 1) { + admin1(ncell, nodevec, nwarp, nstride, stride, firstnode, lastnode, cellsize); + } else { + // admin2(ncell, nodevec, nwarp, nstride, stridedispl, stride, rootbegin, nodebegin, + // ncycles); + admin2(ncell, nodevec, nwarp, nstride, stridedispl, stride, firstnode, lastnode, cellsize); + } + +#if 1 + int ntopol = 1; + for (int i = 1; i < ncell; ++i) { + if (nodevec[i - 1]->hash != nodevec[i]->hash) { + ntopol += 1; + } + } + printf("%d distinct tree topologies\n", ntopol); +#endif + + for (size_t i = 0; i < nodevec.size(); ++i) { + delete nodevec[i]; + } + + return nodeorder; +} + +void check(VecTNode& nodevec) { + // printf("check\n"); + size_t nnode = nodevec.size(); + size_t ncell = 0; + for (size_t i = 0; i < nnode; ++i) { + nodevec[i]->nodevec_index = i; + if (nodevec[i]->parent == NULL) { + ncell++; + } + } + for (size_t i = 0; i < ncell; ++i) { + nrn_assert(nodevec[i]->parent == NULL); + } + for (size_t i = ncell; i < nnode; ++i) { + TNode& nd = *nodevec[i]; + if (nd.parent->nodevec_index >= nd.nodevec_index) { + printf("error i=%ld nodevec_index=%ld parent=%ld\n", i, nd.nodevec_index, + nd.parent->nodevec_index); + } + nrn_assert(nd.nodevec_index > nd.parent->nodevec_index); + } +} + +void prtree(VecTNode& nodevec) { + size_t nnode = nodevec.size(); + for (size_t i = 0; i < nnode; ++i) { + nodevec[i]->nodevec_index = i; + } + for (size_t i = 0; i < nnode; ++i) { + TNode& nd = *nodevec[i]; + printf("%ld p=%d c=%ld l=%ld o=%ld ix=%d pix=%d\n", i, + nd.parent ? int(nd.parent->nodevec_index) : -1, nd.cellindex, nd.level, + nd.treenode_order, nd.nodeindex, nd.parent ? 
int(nd.parent->nodeindex) : -1); + } +} + +void tree_analysis(int* parent, int nnode, int ncell, VecTNode& nodevec) { + // VecTNode nodevec; + + // create empty TNodes (knowing only their index) + nodevec.reserve(nnode); + for (int i = 0; i < nnode; ++i) { + nodevec.push_back(new TNode(i)); + } + + // determine the (sorted by hash) children of each node + for (int i = nnode - 1; i >= ncell; --i) { + nodevec[i]->parent = nodevec[parent[i]]; + nodevec[i]->mkhash(); + nodevec[parent[i]]->children.push_back(nodevec[i]); + } + + // determine hash of the cells + for (int i = 0; i < ncell; ++i) { + nodevec[i]->mkhash(); + } + + std::sort(nodevec.begin(), nodevec.begin() + ncell, tnode_earlier); +} + +static bool interleave_comp(TNode* a, TNode* b) { + bool result = false; + if (a->treenode_order < b->treenode_order) { + result = true; + } else if (a->treenode_order == b->treenode_order) { + if (a->cellindex < b->cellindex) { + result = true; + } + } + return result; +} + +// sort so nodevec[ncell:nnode] cell instances are interleaved. Keep the +// secondary ordering with respect to treenode_order so each cell is still a tree. + +void node_interleave_order(int ncell, VecTNode& nodevec) { + int* order = new int[ncell]; + for (int i = 0; i < ncell; ++i) { + order[i] = 0; + nodevec[i]->treenode_order = order[i]++; + } + for (size_t i = 0; i < nodevec.size(); ++i) { + TNode& nd = *nodevec[i]; + for (size_t j = 0; j < nd.children.size(); ++j) { + TNode* cnode = nd.children[j]; + cnode->treenode_order = order[nd.cellindex]++; + } + } + delete[] order; + + // std::sort(nodevec.begin() + ncell, nodevec.end(), contig_comp); + std::sort(nodevec.begin() + ncell, nodevec.end(), interleave_comp); + +#if 0 + for (size_t i=0; i < nodevec.size(); ++i) { + TNode& nd = *nodevec[i]; + printf("%ld cell=%ld ix=%d\n", i, nd.cellindex, nd.nodeindex); + } +#endif +} + +static void admin1(int ncell, + VecTNode& nodevec, + int& nwarp, + int& nstride, + int*& stride, + int*& firstnode, + int*& lastnode, + int*& cellsize) { + // firstnode[i] is the index of the first nonroot node of the cell + // lastnode[i] is the index of the last node of the cell + // cellsize is the number of nodes in the cell not counting root. + // nstride is the maximum cell size (not counting root) + // stride[i] is the number of cells with an ith node. + firstnode = new int[ncell]; + lastnode = new int[ncell]; + cellsize = new int[ncell]; + + nwarp = (ncell % warpsize == 0) ? (ncell / warpsize) : (ncell / warpsize + 1); + + for (int i = 0; i < ncell; ++i) { + firstnode[i] = -1; + lastnode[i] = -1; + cellsize[i] = 0; + } + + nstride = 0; + for (size_t i = ncell; i < nodevec.size(); ++i) { + TNode& nd = *nodevec[i]; + size_t ci = nd.cellindex; + if (firstnode[ci] == -1) { + firstnode[ci] = i; + } + lastnode[ci] = i; + cellsize[ci] += 1; + if (nstride < cellsize[ci]) { + nstride = cellsize[ci]; + } + } + + stride = new int[nstride + 1]; // in case back substitution accesses this + for (int i = 0; i <= nstride; ++i) { + stride[i] = 0; + } + for (size_t i = ncell; i < nodevec.size(); ++i) { + TNode& nd = *nodevec[i]; + stride[nd.treenode_order - 1] += 1; // -1 because treenode order includes root + } +} + +// for admin2 we allow the node organisation in warps of (say 4 cores per warp) +// ............... ideal warp but unbalanced relative to warp with max cycles +// ............... ncycle = 15, icore [0:4), all strides are 4. +// ............... +// ............... +// +// .......... unbalanced relative to warp with max cycles +// .......... 
ncycle = 10, not all strides the same because +// .......... of need to avoid occasional race conditions. +// . . .. icore [4:8) only 4 strides of 4 +// +// .................... ncycle = 20, uses only one core in the warp (cable) +// icore 8, all ncycle strides are 1 + +// One thing to be unhappy about is the large stride vector of size about +// number of compartments/warpsize. There are a lot of models where the +// stride for a warp is constant except for one cycle in the warp and that +// is easy to obtain when there are more than warpsize cells per warp. + +static size_t stride_length(size_t begin, size_t end, VecTNode& nodevec) { + // return stride length starting at i. Do not go past j. + // max stride is warpsize. + // At this time, only assume vicious parent race conditions matter. + if (end - begin > warpsize) { + end = begin + warpsize; + } + for (size_t i = begin; i < end; ++i) { + TNode* nd = nodevec[i]; + nrn_assert(nd->nodevec_index == i); + size_t diff = dist2child(nd); + if (i + diff < end) { + end = i + diff; + } + } + return end - begin; +} + +static void admin2(int ncell, + VecTNode& nodevec, + int& nwarp, + int& nstride, + int*& stridedispl, + int*& strides, + int*& rootbegin, + int*& nodebegin, + int*& ncycles) { + // the number of groups is the number of warps needed + // ncore is the number of warps * warpsize + nwarp = nodevec[ncell - 1]->groupindex + 1; + + ncycles = new int[nwarp]; + stridedispl = new int[nwarp + 1]; // running sum of ncycles (start at 0) + rootbegin = new int[nwarp + 1]; // index (+1) of first root in warp. + nodebegin = new int[nwarp + 1]; // index (+1) of first node in warp. + + // rootbegin and nodebegin are the root index values + 1 of the last of + // the sequence of constant groupindex + rootbegin[0] = 0; + for (size_t i = 0; i < size_t(ncell); ++i) { + rootbegin[nodevec[i]->groupindex + 1] = i + 1; + } + nodebegin[0] = ncell; + for (size_t i = size_t(ncell); i < nodevec.size(); ++i) { + nodebegin[nodevec[i]->groupindex + 1] = i + 1; + } + + // ncycles, stridedispl, and nstride + nstride = 0; + stridedispl[0] = 0; + for (size_t iwarp = 0; iwarp < (size_t)nwarp; ++iwarp) { + size_t j = size_t(nodebegin[iwarp + 1]); + int nc = 0; + size_t i = nodebegin[iwarp]; + while (i < j) { + i += stride_length(i, j, nodevec); + ++nc; + } + ncycles[iwarp] = nc; + stridedispl[iwarp + 1] = stridedispl[iwarp] + nc; + nstride += nc; + } + + // strides + strides = new int[nstride]; + nstride = 0; + for (size_t iwarp = 0; iwarp < (size_t)nwarp; ++iwarp) { + size_t j = size_t(nodebegin[iwarp + 1]); + size_t i = nodebegin[iwarp]; + while (i < j) { + int k = stride_length(i, j, nodevec); + i += k; + strides[nstride++] = k; + } + } + +#if 0 +printf("warp rootbegin nodebegin stridedispl\n"); +for (int i = 0; i <= nwarp; ++i){ + printf("%4d %4d %4d %4d\n", i, rootbegin[i], nodebegin[i], stridedispl[i]); +} +#endif +} diff --git a/coreneuron/nrniv/cellorder2.cpp b/coreneuron/nrniv/cellorder2.cpp new file mode 100644 index 000000000..e0cd57eb8 --- /dev/null +++ b/coreneuron/nrniv/cellorder2.cpp @@ -0,0 +1,532 @@ +#include +#include "coreneuron/nrniv/nrn_assert.h" +#include "coreneuron/nrniv/cellorder.h" +#include "coreneuron/nrniv/tnode.h" +#include "coreneuron/nrniv/nrniv_decl.h" + +#include +#include +#include +#include + +using namespace std; + +// experiment starting with identical cell ordering +// groupindex aleady defined that keeps identical cells together +// begin with leaf to root ordering + +typedef VecTNode VTN; // level of nodes +typedef vector VVTN; // 
group of levels
+typedef vector<VVTN> VVVTN;  // groups
+
+// verify level in groups of nident identical nodes
+void chklevel(VTN& level, size_t nident = 8) {
+#if 0
+    nrn_assert(level.size() % nident == 0);
+    for (size_t i = 0; i < level.size(); ++i) {
+        size_t j = nident * int(i / nident);
+        nrn_assert(level[i]->hash == level[j]->hash);
+    }
+#endif
+}
+
+// first child before second child, etc.
+// if same parent level, then parent order
+// if not same parent, then earlier parent (no parent earlier than parent)
+// if same parents, then children order
+// if no parents then nodevec_index order.
+static bool sortlevel_cmp(TNode* a, TNode* b) {
+    // when starting with leaf to root order
+    // note that leaves are at max level and all roots at level 0
+    bool result = false;
+    // since cannot have an index < 0, just add 1 to level
+    size_t palevel = a->parent ? 1 + a->parent->level : 0;
+    size_t pblevel = b->parent ? 1 + b->parent->level : 0;
+    if (palevel < pblevel) {  // only used when starting leaf to root order
+        result = true;        // earlier level first
+    } else if (palevel == pblevel) {  // always true when starting root to leaf
+        if (palevel == 0) {  // a and b are roots
+            if (a->nodevec_index < b->nodevec_index) {
+                result = true;
+            }
+        } else {  // parent order (already sorted with proper treenode_order)
+            if (a->treenode_order < b->treenode_order) {  // children order
+                result = true;
+            } else if (a->treenode_order == b->treenode_order) {
+                if (a->parent->treenode_order < b->parent->treenode_order) {
+                    result = true;
+                }
+            }
+        }
+    }
+    return result;
+}
+
+static void sortlevel(VTN& level) {
+    std::sort(level.begin(), level.end(), sortlevel_cmp);
+
+#if 0
+printf("after sortlevel\n");
+for (size_t i = 0; i < level.size(); ++i) {
+TNode* nd = level[i];
+printf("ilev=%ld i=%ld plev=%ld pi=%ld phash=%ld ord=%ld hash=%ld\n",
+nd->level, i, nd->parent?nd->parent->level:0,
+nd->parent?nd->parent->treenode_order:0, nd->parent?nd->parent->hash:0,
+nd->treenode_order, nd->hash);
+}
+chklevel(level);
+#endif
+
+    for (size_t i = 0; i < level.size(); ++i) {
+        level[i]->treenode_order = i;
+    }
+}
+
+static void set_treenode_order(VVTN& levels) {
+    size_t order = 0;
+    for (size_t i = 0; i < levels.size(); ++i) {
+        for (size_t j = 0; j < levels[i].size(); ++j) {
+            TNode* nd = levels[i][j];
+            nd->treenode_order = order++;
+        }
+    }
+}
+
+// every level starts out with no race conditions involving both
+// parent and child in the same level. Can we arrange things so that
+// every level has at least 32 nodes?
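// Illustrative aside, a minimal sketch rather than code from this patch: the
// warp-group test used by g32() and is_parent_race() below is just integer
// division of node indices by the warp size (assumed to be 32 here, as in the
// rest of this file).
static inline bool same_warp_group_sketch(size_t a, size_t b) {
    const size_t kWarp = 32;  // stands in for the global warpsize
    return a / kWarp == b / kWarp;
}
// For example, indices 30 and 33 land in groups 0 and 1 (no race possible),
// while 33 and 62 both land in group 1.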
+static size_t g32(TNode* nd) { + return nd->nodevec_index / warpsize; +} + +static bool is_parent_race(TNode* nd) { // vitiating + size_t pg = g32(nd); + for (size_t i = 0; i < nd->children.size(); ++i) { + if (pg == g32(nd->children[i])) { + return true; + } + } + return false; +} + +// less than 32 apart +static bool is_parent_race2(TNode* nd) { // vitiating + size_t pi = nd->nodevec_index; + for (size_t i = 0; i < nd->children.size(); ++i) { + if (nd->children[i]->nodevec_index - pi < warpsize) { + return true; + } + } + return false; +} + +static bool is_child_race(TNode* nd) { // potentially handleable by atomic + if (nd->children.size() < 2) { + return false; + } + if (nd->children.size() == 2) { + return g32(nd->children[0]) == g32(nd->children[1]); + } + std::set s; + for (size_t i = 0; i < nd->children.size(); ++i) { + size_t gc = g32(nd->children[i]); + if (s.find(gc) != s.end()) { + return true; + } + s.insert(gc); + } + return false; +} + +static bool is_child_race2(TNode* nd) { // potentially handleable by atomic + if (nd->children.size() < 2) { + return false; + } + if (nd->children.size() == 2) { + size_t c0 = nd->children[0]->nodevec_index; + size_t c1 = nd->children[1]->nodevec_index; + c0 = (c0 < c1) ? (c1 - c0) : (c0 - c1); + return c0 < warpsize; + } + size_t ic0 = nd->children[0]->nodevec_index; + for (size_t i = 1; i < nd->children.size(); ++i) { + size_t ic = nd->children[i]->nodevec_index; + if (ic - ic0 < warpsize) { + return true; + } + ic0 = ic; + } + return false; +} + +size_t dist2child(TNode* nd) { + size_t d = 1000; + size_t pi = nd->nodevec_index; + for (size_t i = 0; i < nd->children.size(); ++i) { + size_t d1 = nd->children[i]->nodevec_index - pi; + if (d1 < d) { + d = d1; + } + } + return d; +} + +// from stackoverflow.com +template +static void move_range(size_t start, size_t length, size_t dst, std::vector& v) { + typename std::vector::iterator first, middle, last; + if (start < dst) { + first = v.begin() + start; + middle = first + length; + last = v.begin() + dst; + } else { + first = v.begin() + dst; + middle = v.begin() + start; + last = middle + length; + } + std::rotate(first, middle, last); +} + +static void move_nodes(size_t start, size_t length, size_t dst, VTN& nodes) { + nrn_assert(dst <= nodes.size()); + nrn_assert(start + length <= dst); + move_range(start, length, dst, nodes); + + // check correctness of move + for (size_t i = start; i < dst - length; ++i) { + nrn_assert(nodes[i]->nodevec_index == i + length); + } + for (size_t i = dst - length; i < dst; ++i) { + nrn_assert(nodes[i]->nodevec_index == start + (i - (dst - length))); + } + + // update nodevec_index + for (size_t i = start; i < dst; ++i) { + nodes[i]->nodevec_index = i; + } +} + +// least number of nodes to move after nd to eliminate prace +static size_t need2move(TNode* nd) { + size_t d = dist2child(nd); + return warpsize - ((nd->nodevec_index % warpsize) + d); +} + +#if 0 +static void how_many_warpsize_groups_have_only_leaves(VTN& nodes) { + size_t n = 0; + for (size_t i = 0; i < nodes.size(); i += warpsize) { + bool r = true; + for (size_t j=0; j < warpsize; ++j) { + if (nodes[i+j]->children.size() != 0) { + r = false; + break; + } + } + if (r) { + printf("warpsize group %ld starting at level %ld\n", i/warpsize, nodes[i]->level); + ++n; + } + } + printf("number of warpsize groups with only leaves = %ld\n", n); +} +#endif + +static void pr_race_situation(VTN& nodes) { + size_t prace2 = 0; + size_t prace = 0; + size_t crace = 0; + for (size_t i = nodes.size() - 1; 
nodes[i]->level != 0; --i) { + TNode* nd = nodes[i]; + if (is_parent_race2(nd)) { + ++prace2; + } + if (is_parent_race(nd)) { + printf("level=%ld i=%ld d=%ld n=%ld", nd->level, nd->nodevec_index, dist2child(nd), + need2move(nd)); + for (size_t j = 0; j < nd->children.size(); ++j) { + TNode* cnd = nd->children[j]; + printf(" %ld %ld", cnd->level, cnd->nodevec_index); + } + printf("\n"); + ++prace; + } + if (is_child_race(nd)) { + ++crace; + } + } + printf("prace=%ld crace=%ld prace2=%ld\n", prace, crace, prace2); +} + +static size_t next_leaf(TNode* nd, VTN& nodes) { + size_t i = 0; + for (i = nd->nodevec_index - 1; i > 0; --i) { + if (nodes[i]->children.size() == 0) { + return i; + } + } + // nrn_assert(i > 0); + return 0; +} + +static void checkrace(TNode* nd, VTN& nodes) { + bool res = true; + for (size_t i = nd->nodevec_index; i < nodes.size(); ++i) { + if (is_parent_race2(nodes[i])) { + // printf("checkrace %ld\n", i); + res = false; + } + } + if (0 && res) { + printf("checkrace no race from nd onward\n"); + } +} + +static bool eliminate_race(TNode* nd, size_t d, VTN& nodes, TNode* look) { + // printf("eliminate_race %ld %ld\n", nd->nodevec_index, d); + // opportunistically move that number of leaves + // error if no leaves left to move. + size_t i = look->nodevec_index; + while (d > 0) { + i = next_leaf(nodes[i], nodes); + if (i == 0) { + return false; + } + size_t n = 1; + while (nodes[i - 1]->children.size() == 0 && n < d) { + --i; + ++n; + } + // printf(" move_nodes src=%ld len=%ld dest=%ld\n", i, n, nd->nodevec_index); + move_nodes(i, n, nd->nodevec_index + 1, nodes); + d -= n; + } + checkrace(nd, nodes); + return true; +} + +static void eliminate_prace(TNode* nd, VTN& nodes) { + size_t d = warpsize - dist2child(nd); + bool b = eliminate_race(nd, d, nodes, nd); + if (0 && !b) { + printf("could not eliminate prace for g=%ld c=%ld l=%ld o=%ld %ld\n", nd->groupindex, + nd->cellindex, nd->level, nd->treenode_order, nd->hash); + } +} + +static void eliminate_crace(TNode* nd, VTN& nodes) { + size_t c0 = nd->children[0]->nodevec_index; + size_t c1 = nd->children[1]->nodevec_index; + size_t d = warpsize - ((c0 > c1) ? (c0 - c1) : (c1 - c0)); + TNode* cnd = nd->children[0]; + bool b = eliminate_race(cnd, d, nodes, nd); + if (0 && !b) { + printf("could not eliminate crace for g=%ld c=%ld l=%ld o=%ld %ld\n", nd->groupindex, + nd->cellindex, nd->level, nd->treenode_order, nd->hash); + } +} + +static void question2(VVTN& levels) { + size_t nnode = 0; + for (size_t i = 0; i < levels.size(); ++i) { + nnode += levels[i].size(); + } + VTN nodes(nnode); + nnode = 0; + for (size_t i = 0; i < levels.size(); ++i) { + for (size_t j = 0; j < levels[i].size(); ++j) { + nodes[nnode++] = levels[i][j]; + } + } + for (size_t i = 0; i < nodes.size(); ++i) { + nodes[i]->nodevec_index = i; + } + + // how_many_warpsize_groups_have_only_leaves(nodes); + + // work backward and check the distance from parent to children. + // if parent in different group then there is no vitiating race. + // if children in different group then ther is no race (satisfied by + // atomic). + // If there is a vitiating race, then figure out how many nodes + // need to be inserted just before the parent to avoid the race. + // It is not clear if we should prioritize safe nodes (when moved they + // do not introduce a race) and/or contiguous nodes (probably, to keep + // the low hanging fruit together). + // At least, moved nodes should have proper tree order and not themselves + // introduce a race at their new location. 
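// (Concretely, move_nodes() above performs such a move with a single std::rotate:
// the n nodes taken from [s, s+n) end up immediately before the destination index,
// and the nodes in between shift n positions toward lower indices.)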
Leaves are nice in that there + // are no restrictions in movement toward higher indices. + // Note that unless groups of 32 are inserted, it may be the case that + // races are generated at greater indices since otherwise a portion of + // each group is placed into the next group. This would not be an issue + // if, in fact, the stronger requirement of every parent having + // pi + 32 <= ci is demanded instead of merely being in different warpsize. + // One nice thing about adding warpsize nodes is that it does not disturb + // any existing contiguous groups except the moved group which gets divided + // between parent warpsize and child, where the nodes past the parent + // get same relative indices in the next warpsize + + // let's see how well we can do by opportunistically moving leaves to + // separate parents from children by warpsize (ie is_parent_prace2 is false) + // Hopefully, we won't run out of leaves before eliminating all + // is_parent_prace2 + + if (0 && nodes.size() % warpsize != 0) { + size_t nnode = nodes.size() - levels[0].size(); + printf("warp of %ld cells has %ld nodes in last cycle %ld\n", levels[0].size(), + nnode % warpsize, nnode / warpsize + 1); + } + + // pr_race_situation(nodes); + + // eliminate parent and children races using leaves + for (size_t i = nodes.size() - 1; i >= levels[0].size(); --i) { + TNode* nd = nodes[i]; + if (is_child_race2(nd)) { + eliminate_crace(nd, nodes); + i = nd->nodevec_index; + } + if (is_parent_race2(nd)) { + eliminate_prace(nd, nodes); + i = nd->nodevec_index; + } + } + if (0) { + pr_race_situation(nodes); + } + // copy nodes indices to treenode_order + for (size_t i = 0; i < nodes.size(); ++i) { + nodes[i]->treenode_order = i; + } +} + +// size of groups with contiguous parents for each level +static void question(VVTN& levels) { +#if 0 + for (size_t i = 0; i < levels.size(); ++i) { + printf("%3ld %5ld", i, levels[i].size()); + size_t iplast = 100000000; + size_t nsame = 0; + for (size_t j=0; j < levels[i].size(); ++j) { + TNode* nd = levels[i][j]; + if (nd->parent == NULL) { + nsame += 1; + }else if (nd->parent->treenode_order == iplast + 1) { + nsame += 1; + iplast = nd->parent->treenode_order; + }else{ + if (nsame) { printf(" %3ld", nsame); } + nsame = 1; + iplast = nd->parent->treenode_order; + } + } + if (nsame) { printf(" %3ld", nsame); } + printf("\n"); + } +#endif +} + +static void analyze(VVTN& levels) { + // sort each level with respect to parent level order + // earliest parent level first. + + // treenode order can be anything as long as first children < second + // children etc.. 
After sorting a level, the order will be correct for + // that level, ranging from [0:level.size] + for (size_t i = 0; i < levels.size(); ++i) { + chklevel(levels[i]); + for (size_t j = 0; j < levels[i].size(); ++j) { + TNode* nd = levels[i][j]; + for (size_t k = 0; k < nd->children.size(); ++k) { + nd->children[k]->treenode_order = k; + } + } + } + + for (size_t i = 0; i < levels.size(); ++i) { + sortlevel(levels[i]); + chklevel(levels[i]); + } + + set_treenode_order(levels); +} + +void prgroupsize(VVVTN& groups) { +#if 0 + for (size_t i=0; i < groups[0].size(); ++i) { + printf("%5ld\n", i); + for (size_t j=0; j < groups.size(); ++j) { + printf(" %5ld", groups[j][i].size()); + } + printf("\n"); + } +#endif +} + +// group index primary, treenode_order secondary +static bool final_nodevec_cmp(TNode* a, TNode* b) { + bool result = false; + if (a->groupindex < b->groupindex) { + result = true; + } else if (a->groupindex == b->groupindex) { + if (a->treenode_order < b->treenode_order) { + result = true; + } + } + return result; +} + +static void set_nodeindex(VecTNode& nodevec) { + for (size_t i = 0; i < nodevec.size(); ++i) { + nodevec[i]->nodevec_index = i; + } +} + +void group_order2(VecTNode& nodevec, size_t groupsize, size_t ncell) { +#if 1 + size_t maxlevel = level_from_root(nodevec); +#else + size_t maxlevel = level_from_leaf(nodevec); + // reverse the level numbering so leaves are at maxlevel. + // also make all roots have level 0 + for (size_t i = 0; i < nodevec.size(); ++i) { + TNode* nd = nodevec[i]; + nd->level = maxlevel - nd->level; + if (nd->parent == NULL) { + nd->level = 0; + } + } +#endif + + // if not NULL use this to define groups (and reset TNode.groupindex) + size_t nwarp = warp_balance(ncell, nodevec); + + // work on a cellgroup as a vector of levels. ie only possible race is + // two children in same warpsize + + VVVTN groups(nwarp ? nwarp : (ncell / groupsize + ((ncell % groupsize) ? 1 : 0))); + + for (size_t i = 0; i < groups.size(); ++i) { + groups[i].resize(maxlevel + 1); + } + + for (size_t i = 0; i < nodevec.size(); ++i) { + TNode* nd = nodevec[i]; + groups[nd->groupindex][nd->level].push_back(nd); + } + + prgroupsize(groups); + + // deal with each group + for (size_t i = 0; i < groups.size(); ++i) { + analyze(groups[i]); + question2(groups[i]); + } + + question(groups[0]); + // question2(groups[0]); + + // final nodevec order according to group_index and treenode_order + std::sort(nodevec.begin() + ncell, nodevec.end(), final_nodevec_cmp); + set_nodeindex(nodevec); +} diff --git a/coreneuron/nrniv/coreneuron_main.cpp b/coreneuron/nrniv/coreneuron_main.cpp index 1cc0d397b..4b3d24d60 100644 --- a/coreneuron/nrniv/coreneuron_main.cpp +++ b/coreneuron/nrniv/coreneuron_main.cpp @@ -29,5 +29,5 @@ THE POSSIBILITY OF SUCH DAMAGE. 
extern int main1(int argc, char** argv, char** env); int main(int argc, char** argv, char** env) { - return main1(argc, argv, env); + return main1(argc, argv, env); } diff --git a/coreneuron/nrniv/cuda_profile.cu b/coreneuron/nrniv/cuda_profile.cu new file mode 100644 index 000000000..16bac0d1f --- /dev/null +++ b/coreneuron/nrniv/cuda_profile.cu @@ -0,0 +1,31 @@ +#include "cuda_profiler_api.h" +#include + +void print_gpu_memory_usage() { + size_t free_byte; + size_t total_byte; + + cudaError_t cuda_status = cudaMemGetInfo(&free_byte, &total_byte); + + if (cudaSuccess != cuda_status) { + printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status)); + exit(1); + } + + double free_db = (double)free_byte; + double total_db = (double)total_byte; + double used_db = total_db - free_db; + printf("\n => GPU MEMORY USAGE (MB) : Used = %f, Free = %f MB, Total = %f", + used_db / 1024.0 / 1024.0, free_db / 1024.0 / 1024.0, total_db / 1024.0 / 1024.0); + fflush(stdout); +} + +void start_cuda_profile() { + cudaProfilerStart(); + print_gpu_memory_usage(); +} + +void stop_cuda_profile() { + cudaProfilerStop(); + print_gpu_memory_usage(); +} diff --git a/coreneuron/nrniv/cuda_profile.h b/coreneuron/nrniv/cuda_profile.h new file mode 100644 index 000000000..28c43829a --- /dev/null +++ b/coreneuron/nrniv/cuda_profile.h @@ -0,0 +1,8 @@ +#ifndef _cuda_profile_h_ +#define _cuda_profile_h_ + +void start_cuda_profile(); +void stop_cuda_profile(); +void print_gpu_memory_usage(); + +#endif diff --git a/coreneuron/nrniv/cvodestb.cpp b/coreneuron/nrniv/cvodestb.cpp index c49b6fee4..0fe27fa12 100644 --- a/coreneuron/nrniv/cvodestb.cpp +++ b/coreneuron/nrniv/cvodestb.cpp @@ -33,61 +33,82 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrniv/netcvode.h" #include "coreneuron/nrniv/vrecitem.h" +#include "coreneuron/nrniv/nrn_acc_manager.h" + extern "C" { extern NetCvode* net_cvode_instance; // for fixed step thread +// check thresholds and deliver all (including binqueue) events +// up to t+dt/2 void deliver_net_events(NrnThread* nt) { - (void)nt; - if (net_cvode_instance) { - net_cvode_instance->check_thresh(nt); - net_cvode_instance->deliver_net_events(nt); - } + (void)nt; + if (net_cvode_instance) { + net_cvode_instance->check_thresh(nt); + net_cvode_instance->deliver_net_events(nt); + } } -// handle events during finitialize() +// deliver events (but not binqueue) up to nt->_t void nrn_deliver_events(NrnThread* nt) { - double tsav = nt->_t; - if (net_cvode_instance) { - net_cvode_instance->deliver_events(tsav, nt); - } - nt->_t = tsav; + double tsav = nt->_t; + if (net_cvode_instance) { + net_cvode_instance->deliver_events(tsav, nt); + } + nt->_t = tsav; + + /*before executing on gpu, we have to update the NetReceiveBuffer_t on GPU */ + update_net_receive_buffer(nt); + + for (int i = 0; i < net_buf_receive_cnt_; ++i) { + (*net_buf_receive_[i])(nt); + } } void clear_event_queue() { - if (net_cvode_instance) { - net_cvode_instance->clear_events(); - } + if (net_cvode_instance) { + net_cvode_instance->clear_events(); + } } void init_net_events() { - if (net_cvode_instance) { - net_cvode_instance->init_events(); - } -} + if (net_cvode_instance) { + net_cvode_instance->init_events(); + } +#if defined(_OPENACC) + /* weight vectors could be updated (from INITIAL block of NET_RECEIVE, update those on GPU's */ + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread* nt = nrn_threads + ith; + double* weights = nt->weights; + int n_weight = nt->n_weight; + if (n_weight) { + #pragma acc update 
device(weights[0 : n_weight]) if (nt->compute_gpu) + } + } +#endif +} void nrn_play_init() { for (int ith = 0; ith < nrn_nthread; ++ith) { - NrnThread* nt = nrn_threads + ith; - for (int i=0; i < nt->n_vecplay; ++i) { - ((PlayRecord*)nt->_vecplay[i])->play_init(); - } + NrnThread* nt = nrn_threads + ith; + for (int i = 0; i < nt->n_vecplay; ++i) { + ((PlayRecord*)nt->_vecplay[i])->play_init(); + } } } void fixed_play_continuous(NrnThread* nt) { - for (int i=0; i < nt->n_vecplay; ++i) { - ((PlayRecord*)nt->_vecplay[i])->continuous(nt->_t); - } + for (int i = 0; i < nt->n_vecplay; ++i) { + ((PlayRecord*)nt->_vecplay[i])->continuous(nt->_t); + } } int at_time(NrnThread* nt, double te) { - double x = te - 1e-11; - if (x <= nt->_t && x > (nt->_t - nt->_dt)) { - return 1; - } - return 0; + double x = te - 1e-11; + if (x <= nt->_t && x > (nt->_t - nt->_dt)) { + return 1; + } + return 0; } - } diff --git a/coreneuron/nrniv/global_vars.cpp b/coreneuron/nrniv/global_vars.cpp new file mode 100644 index 000000000..ef020ccaf --- /dev/null +++ b/coreneuron/nrniv/global_vars.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include + +#include "coreneuron/nrnconf.h" +#include "coreneuron/nrniv/nrniv_decl.h" +#include "coreneuron/nrnoc/membfunc.h" +#include "coreneuron/nrniv/nrn_assert.h" + +using namespace std; + +typedef pair PSD; +typedef map N2V; + +static N2V* n2v; + +void hoc_register_var(DoubScal* ds, DoubVec* dv, VoidFunc*) { + if (!n2v) { + n2v = new N2V(); + } + for (size_t i = 0; ds[i].name; ++i) { + (*n2v)[ds[i].name] = PSD(0, ds[i].pdoub); + } + for (size_t i = 0; dv[i].name; ++i) { + (*n2v)[dv[i].name] = PSD(dv[i].index1, ds[i].pdoub); + } +} + +void set_globals(const char* path) { + if (!n2v) { + n2v = new N2V(); + } + (*n2v)["celsius"] = PSD(0, &celsius); + (*n2v)["dt"] = PSD(0, &dt); + + string fname = string(path) + string("/globals.dat"); + FILE* f = fopen(fname.c_str(), "r"); + if (!f) { + printf("ignore: could not open %s\n", fname.c_str()); + delete n2v; + return; + } + + char line[256]; + char name[256]; + double val; + int n; + + for (;;) { + nrn_assert(fgets(line, 256, f) != NULL); + N2V::iterator it; + if (sscanf(line, "%s %lf", name, &val) == 2) { + if (strcmp(name, "0") == 0) { + break; + } + it = n2v->find(name); + if (it != n2v->end()) { + nrn_assert(it->second.first == 0); + *(it->second.second) = val; + } + } else if (sscanf(line, "%[^[][%d]\n", name, &n) == 2) { + if (strcmp(name, "0") == 0) { + break; + } + it = n2v->find(name); + if (it != n2v->end()) { + nrn_assert(it->second.first == (size_t)n); + double* pval = it->second.second; + for (int i = 0; i < n; ++i) { + nrn_assert(fgets(line, 256, f) != NULL); + nrn_assert(sscanf(line, "%lf\n", &val) == 1); + pval[i] = val; + } + } + } else { + nrn_assert(0); + } + } + + if (fgets(line, 256, f) != NULL) { + if (sscanf(line, "%s %d", name, &n) == 2) { + if (strcmp(name, "secondorder") == 0) { + secondorder = n; + } + } + } + + fclose(f); + +#if 0 + for (N2V::iterator i = n2v->begin(); i != n2v->end(); ++i) { + printf("%s %ld %p\n", i->first.c_str(), i->second.first, i->second.second); + } +#endif + + delete n2v; +} diff --git a/coreneuron/nrniv/have2want.h b/coreneuron/nrniv/have2want.h new file mode 100644 index 000000000..f0ed362c7 --- /dev/null +++ b/coreneuron/nrniv/have2want.h @@ -0,0 +1,262 @@ +/* +To be included by a file that desires rendezvous rank exchange functionality. 
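An illustrative example (an assumption for exposition, not taken from this patch):
a file that exchanges cell gids might define, before including this header,
    #define HAVEWANT_t int
    #define HAVEWANT_alltoallv nrnmpi_int_alltoallv
    #define HAVEWANT2Int std::map<int, int>
so that keys are gids and the exchanges use the MPI all-to-all wrappers referenced below.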
+Need to define HAVEWANT_t, HAVEWANT_alltoallv, and HAVEWANT2Int +*/ + +#ifdef have2want_h +#error "This implementation can only be included once" +/* The static function names could involve a macro name. */ +#endif + +#define have2want_h + +/* + +A rank owns a set of HAVEWANT_t keys and wants information associated with +a set of HAVEWANT_t keys owned by unknown ranks. Owners do not know which +ranks want their information. Ranks that want info do not know which ranks +own that info. + +The have_to_want function returns two new vectors of keys along with +associated count and displacement vectors of length nhost and nhost+1 +respectively. Note that a send_to_want_displ[i+1] = + send_to_want_cnt[i] + send_to_want_displ[i] . + +send_to_want[send_to_want_displ[i] to send_to_want_displ[i+1]] contains +the keys from this rank for which rank i wants information. + +recv_from_have[recv_from_have_displ[i] to recv_from_have_displ[i+1] contains +the keys from which rank i is sending information to this rank. + +Note that on rank i, the order of keys in the rank j area of send_to_want +is the same order of keys on rank j in the ith area in recv_from_have. + +The rendezvous_rank function is used to parallelize this computation +and minimize memory usage so that no single rank ever needs to know all keys. +*/ + +#ifndef HAVEWANT_t +#define HAVEWANT_t int +#endif + +// round robin default rendezvous rank function +static int default_rendezvous(HAVEWANT_t key) { + return key % nrnmpi_numprocs; +} + +static int* cnt2displ(int* cnt) { + int* displ = new int[nrnmpi_numprocs + 1]; + displ[0] = 0; + for (int i = 0; i < nrnmpi_numprocs; ++i) { + displ[i + 1] = displ[i] + cnt[i]; + } + return displ; +} + +static int* srccnt2destcnt(int* srccnt) { + int* destcnt = new int[nrnmpi_numprocs]; +#if NRNMPI + if (nrnmpi_numprocs > 1) { + nrnmpi_int_alltoall(srccnt, destcnt, 1); + } else +#endif + { + for (int i = 0; i < nrnmpi_numprocs; ++i) { + destcnt[i] = srccnt[i]; + } + } + return destcnt; +} + +static void rendezvous_rank_get(HAVEWANT_t* data, + int size, + HAVEWANT_t*& sdata, + int*& scnt, + int*& sdispl, + HAVEWANT_t*& rdata, + int*& rcnt, + int*& rdispl, + int (*rendezvous_rank)(HAVEWANT_t)) { + int nhost = nrnmpi_numprocs; + + // count what gets sent + scnt = new int[nhost]; + for (int i = 0; i < nhost; ++i) { + scnt[i] = 0; + } + for (int i = 0; i < size; ++i) { + int r = (*rendezvous_rank)(data[i]); + ++scnt[r]; + } + + sdispl = cnt2displ(scnt); + rcnt = srccnt2destcnt(scnt); + rdispl = cnt2displ(rcnt); + sdata = new HAVEWANT_t[sdispl[nhost]]; + rdata = new HAVEWANT_t[rdispl[nhost]]; + // scatter data into sdata by recalculating scnt. + for (int i = 0; i < nhost; ++i) { + scnt[i] = 0; + } + for (int i = 0; i < size; ++i) { + int r = (*rendezvous_rank)(data[i]); + sdata[sdispl[r] + scnt[r]] = data[i]; + ++scnt[r]; + } +#if NRNMPI + if (nhost > 1) { + HAVEWANT_alltoallv(sdata, scnt, sdispl, rdata, rcnt, rdispl); + } else +#endif + { + for (int i = 0; i < sdispl[nhost]; ++i) { + rdata[i] = sdata[i]; + } + } +} + +static void have_to_want(HAVEWANT_t* have, + int have_size, + HAVEWANT_t* want, + int want_size, + HAVEWANT_t*& send_to_want, + int*& send_to_want_cnt, + int*& send_to_want_displ, + HAVEWANT_t*& recv_from_have, + int*& recv_from_have_cnt, + int*& recv_from_have_displ, + int (*rendezvous_rank)(HAVEWANT_t)) { + // 1) Send have and want to the rendezvous ranks. + // 2) Rendezvous rank matches have and want. 
+ // 3) Rendezvous ranks tell the want ranks which ranks own the keys + // 4) Ranks that want tell owner ranks where to send. + + int nhost = nrnmpi_numprocs; + + // 1) Send have and want to the rendezvous ranks. + HAVEWANT_t *have_s_data, *have_r_data; + int *have_s_cnt, *have_s_displ, *have_r_cnt, *have_r_displ; + rendezvous_rank_get(have, have_size, have_s_data, have_s_cnt, have_s_displ, have_r_data, + have_r_cnt, have_r_displ, rendezvous_rank); + // assume it is an error if two ranks have the same key so create + // hash table of key2rank. Will also need it for matching have and want + HAVEWANT2Int havekey2rank = HAVEWANT2Int(); + for (int r = 0; r < nhost; ++r) { + for (int i = 0; i < have_r_cnt[r]; ++i) { + HAVEWANT_t key = have_r_data[have_r_displ[r] + i]; + if (havekey2rank.find(key) != havekey2rank.end()) { + char buf[200]; + sprintf(buf, "key %lld owned by multiple ranks\n", (long long)key); + hoc_execerror(buf, 0); + } + havekey2rank[key] = r; + } + } + delete[] have_s_data; + delete[] have_s_cnt; + delete[] have_s_displ; + delete[] have_r_data; + delete[] have_r_cnt; + delete[] have_r_displ; + + HAVEWANT_t *want_s_data, *want_r_data; + int *want_s_cnt, *want_s_displ, *want_r_cnt, *want_r_displ; + rendezvous_rank_get(want, want_size, want_s_data, want_s_cnt, want_s_displ, want_r_data, + want_r_cnt, want_r_displ, rendezvous_rank); + + // 2) Rendezvous rank matches have and want. + // we already have made the havekey2rank map. + // Create an array parallel to want_r_data which contains the ranks that + // have that data. + int n = want_r_displ[nhost]; + int* want_r_ownerranks = new int[n]; + for (int r = 0; r < nhost; ++r) { + for (int i = 0; i < want_r_cnt[r]; ++i) { + int ix = want_r_displ[r] + i; + HAVEWANT_t key = want_r_data[ix]; + if (havekey2rank.find(key) == havekey2rank.end()) { + char buf[200]; + sprintf(buf, "key = %lld is wanted but does not exist\n", (long long)key); + hoc_execerror(buf, 0); + } + want_r_ownerranks[ix] = havekey2rank[key]; + } + } + delete[] want_r_data; + + // 3) Rendezvous ranks tell the want ranks which ranks own the keys + // The ranks that want keys need to know the ranks that own those keys. + // The want_s_ownerranks will be parallel to the want_s_data. + // That is, each item defines the rank from which information associated + // with that key is coming from + int* want_s_ownerranks = new int[want_s_displ[nhost]]; +#if NRNMPI + if (nhost > 1) { + nrnmpi_int_alltoallv(want_r_ownerranks, want_r_cnt, want_r_displ, want_s_ownerranks, + want_s_cnt, want_s_displ); + } else +#endif + { + for (int i = 0; i < want_r_displ[nhost]; ++i) { + want_s_ownerranks[i] = want_r_ownerranks[i]; + } + } + delete[] want_r_ownerranks; + delete[] want_r_cnt; + delete[] want_r_displ; + + // 4) Ranks that want tell owner ranks where to send. + // Finished with the rendezvous ranks. The ranks that want keys know the + // owner ranks for those keys. The next step is for the want ranks to + // tell the owner ranks where to send. + // The parallel want_s_ownerranks and want_s_data are now uselessly ordered + // by rendezvous rank. Reorganize so that want ranks can tell owner ranks + // what they want. 
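// (The passes below amount to a counting sort keyed by owner rank: the first loop
// recounts how many keys go to each rank so that want_s_displ can be rebuilt, and
// the second scatters each key into its owner rank's slice of the new want_s_data.)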
+ n = want_s_displ[nhost]; + delete[] want_s_displ; + for (int i = 0; i < nhost; ++i) { + want_s_cnt[i] = 0; + } + HAVEWANT_t* old_want_s_data = want_s_data; + want_s_data = new HAVEWANT_t[n]; + // compute the counts + for (int i = 0; i < n; ++i) { + int r = want_s_ownerranks[i]; + ++want_s_cnt[r]; + } + want_s_displ = cnt2displ(want_s_cnt); + for (int i = 0; i < nhost; ++i) { + want_s_cnt[i] = 0; + } // recount while filling + for (int i = 0; i < n; ++i) { + int r = want_s_ownerranks[i]; + HAVEWANT_t key = old_want_s_data[i]; + want_s_data[want_s_displ[r] + want_s_cnt[r]] = key; + ++want_s_cnt[r]; + } + delete[] want_s_ownerranks; + delete[] old_want_s_data; + want_r_cnt = srccnt2destcnt(want_s_cnt); + want_r_displ = cnt2displ(want_r_cnt); + want_r_data = new HAVEWANT_t[want_r_displ[nhost]]; +#if NRNMPI + if (nhost > 1) { + HAVEWANT_alltoallv(want_s_data, want_s_cnt, want_s_displ, want_r_data, want_r_cnt, + want_r_displ); + } else +#endif + { + for (int i = 0; i < want_s_displ[nhost]; ++i) { + want_r_data[i] = want_s_data[i]; + } + } + // now the want_r_data on the have_ranks are grouped according to the ranks + // that want those keys. + + send_to_want = want_r_data; + send_to_want_cnt = want_r_cnt; + send_to_want_displ = want_r_displ; + recv_from_have = want_s_data; + recv_from_have_cnt = want_s_cnt; + recv_from_have_displ = want_s_displ; +} diff --git a/coreneuron/nrniv/ivocvect.cpp b/coreneuron/nrniv/ivocvect.cpp index 191ea3613..dab8371ce 100644 --- a/coreneuron/nrniv/ivocvect.cpp +++ b/coreneuron/nrniv/ivocvect.cpp @@ -29,8 +29,13 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrniv/ivocvect.h" extern "C" { - IvocVect* vector_new(int n) { return new IvocVect(n); } - int vector_capacity(IvocVect* v) { return v->size(); } - double* vector_vec(IvocVect* v) { return v->data(); } +IvocVect* vector_new1(int n) { + return new IvocVect(n); +} +int vector_capacity(IvocVect* v) { + return v->size(); +} +double* vector_vec(IvocVect* v) { + return v->data(); +} } - diff --git a/coreneuron/nrniv/ivocvect.h b/coreneuron/nrniv/ivocvect.h index 3879a18a5..bb10fd826 100644 --- a/coreneuron/nrniv/ivocvect.h +++ b/coreneuron/nrniv/ivocvect.h @@ -29,43 +29,76 @@ THE POSSIBILITY OF SUCH DAMAGE. 
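A hedged usage sketch of the C-visible vector API defined in ivocvect.cpp above and
declared in this header; the helper name and values are hypothetical:

#include "coreneuron/nrniv/ivocvect.h"
static void ivocvect_usage_sketch() {
    IvocVect* v = vector_new1(3);   // renamed from vector_new; wraps a fixed_vector of doubles
    double* data = vector_vec(v);   // raw pointer; data_ is public for OpenACC copying
    for (int i = 0; i < vector_capacity(v); ++i) {
        data[i] = 0.1 * i;
    }
    delete v;                       // vector_new1 allocates with new
}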
#ifndef ivoc_vector_h #define ivoc_vector_h +#if defined(__cplusplus) + #include #include "coreneuron/nrniv/nrnmutdec.h" template -class fixed_vector{ +class fixed_vector { size_t n_; - T* data_; MUTDEC public: - fixed_vector(size_t n):n_(n) { data_ = new T[n_]; } - ~fixed_vector() { delete [] data_; } + T* data_; /*making public for openacc copying */ + fixed_vector(size_t n) : n_(n) { + data_ = new T[n_]; + } + ~fixed_vector() { + delete[] data_; + } - const T& operator[] (int i) const { return data_[i]; } - T& operator[] (int i) { return data_[i]; } + const T& operator[](int i) const { + return data_[i]; + } + T& operator[](int i) { + return data_[i]; + } - const T* data(void) const { return data_; } - T* data(void) { return data_; } + const T* data(void) const { + return data_; + } + T* data(void) { + return data_; + } - size_t size() const { return n_; } + size_t size() const { + return n_; + } #if (USE_PTHREAD || defined(_OPENMP)) - void mutconstruct(int mkmut) {if (!mut_) MUTCONSTRUCT(mkmut)} + void mutconstruct(int mkmut) { + if (!mut_) + MUTCONSTRUCT(mkmut) + } #else - void mutconstruct(int) {} + void mutconstruct(int) { + } #endif - void lock() {MUTLOCK} - void unlock() {MUTUNLOCK} + void lock() { + MUTLOCK + } + void unlock() { + MUTUNLOCK + } }; typedef fixed_vector IvocVect; extern "C" { - extern IvocVect* vector_new(int n); - extern int vector_capacity(IvocVect* v); - extern double* vector_vec(IvocVect* v); -} +#else + +typedef void IvocVect; + +#endif /* !defined(__cplusplus) */ + +extern IvocVect* vector_new1(int n); +extern int vector_capacity(IvocVect* v); +extern double* vector_vec(IvocVect* v); + +#if defined(__cplusplus) +} #endif +#endif diff --git a/coreneuron/nrniv/lpt.cpp b/coreneuron/nrniv/lpt.cpp new file mode 100644 index 000000000..7ebfd4e35 --- /dev/null +++ b/coreneuron/nrniv/lpt.cpp @@ -0,0 +1,80 @@ +#include "coreneuron/nrnconf.h" // for size_t +#include "coreneuron/nrniv/lpt.h" +#include "coreneuron/nrniv/nrn_assert.h" +#include +#include +#include + +typedef std::pair P; + +// always want the largest remaining piece +bool piece_comp(const P& a, const P& b) { + return a.second > b.second; +} + +// always want the smallest bag +struct bag_comp { + bool operator()(const P& a, const P& b) { + return a.second > b.second; + } +}; + +// lpt Least Processing Time algorithm. +// Largest piece goes into least size bag. +// in: number of bags, vector of sizes +// return: a new vector of bag indices parallel to the vector of sizes. + +std::vector* lpt(size_t nbag, std::vector& pieces, double* bal) { + nrn_assert(nbag > 0); + nrn_assert(pieces.size() > 0); + + std::vector

pvec; + for (size_t i = 0; i < pieces.size(); ++i) { + pvec.push_back(P(i, pieces[i])); + } + std::sort(pvec.begin(), pvec.end(), piece_comp); + + std::vector* bagindices = new std::vector(pieces.size()); + + std::priority_queue, bag_comp> bagq; + for (size_t i = 0; i < nbag; ++i) { + bagq.push(P(i, 0)); + } + + for (size_t i = 0; i < pvec.size(); ++i) { + P& p = pvec[i]; + P bagqitem = bagq.top(); + bagq.pop(); + (*bagindices)[p.first] = bagqitem.first; + bagqitem.second += p.second; + bagq.push(bagqitem); + } + + // load balance average/max (1.0 is perfect) + std::vector v(bagq.size()); + for (size_t i = 1; i < nbag; ++i) { + v[i] = bagq.top().second; + bagq.pop(); + } + double b = load_balance(v); + if (bal) { + *bal = b; + } else { + printf("load balance = %g for %ld pieces in %ld bags\n", b, pieces.size(), nbag); + } + + return bagindices; +} + +double load_balance(std::vector& v) { + size_t sum = 0; + size_t max = 1; + for (size_t i = 0; i < v.size(); ++i) { + size_t val = v[i]; + sum += val; + if (max < val) { + max = val; + } + } + return (double(sum) / v.size()) / max; +} diff --git a/coreneuron/nrniv/lpt.h b/coreneuron/nrniv/lpt.h new file mode 100644 index 000000000..f36aafd32 --- /dev/null +++ b/coreneuron/nrniv/lpt.h @@ -0,0 +1,9 @@ +#ifndef lpt_h +#define lpt_h + +#include + +std::vector* lpt(size_t nbag, std::vector& pieces, double* bal = NULL); + +double load_balance(std::vector&); +#endif diff --git a/coreneuron/nrniv/main1.cpp b/coreneuron/nrniv/main1.cpp index 65f401428..1cfa011b4 100644 --- a/coreneuron/nrniv/main1.cpp +++ b/coreneuron/nrniv/main1.cpp @@ -32,6 +32,7 @@ THE POSSIBILITY OF SUCH DAMAGE. * @brief File containing main driver routine for CoreNeuron */ +#include "coreneuron/utils/randoms/nrnran123.h" #include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/multicore.h" #include "coreneuron/nrnoc/nrnoc_decl.h" @@ -41,13 +42,17 @@ THE POSSIBILITY OF SUCH DAMAGE. 
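A sketch of how the lpt() and load_balance() helpers added in lpt.cpp/lpt.h above
might be driven; the size_t element type is as implied by the surrounding code, and
the piece sizes and helper name are made up for illustration:

#include <cstdio>
#include <vector>
#include "coreneuron/nrniv/lpt.h"

static void lpt_usage_sketch() {
    std::vector<size_t> pieces;  // e.g. per-cell complexity estimates
    size_t sizes[] = {7, 5, 4, 3, 2, 1};
    for (size_t i = 0; i < 6; ++i) {
        pieces.push_back(sizes[i]);
    }
    double bal = 0.;
    std::vector<size_t>* bag = lpt(2, pieces, &bal);  // distribute into 2 bags
    for (size_t i = 0; i < pieces.size(); ++i) {
        printf("piece %ld (size %ld) -> bag %ld\n", (long)i, (long)pieces[i], (long)(*bag)[i]);
    }
    printf("balance (average/max, 1.0 is perfect) = %g\n", bal);
    delete bag;
}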
#include "coreneuron/utils/endianness.h" #include "coreneuron/utils/memory_utils.h" #include "coreneuron/nrniv/nrnoptarg.h" -#include "coreneuron/utils/randoms/nrnran123.h" #include "coreneuron/utils/sdprintf.h" #include "coreneuron/nrniv/nrn_stats.h" +#include "coreneuron/utils/reports/nrnreport.h" +#include "coreneuron/nrniv/nrn_acc_manager.h" +#include "coreneuron/nrniv/profiler_interface.h" +#include "coreneuron/nrniv/partrans.h" +#include #if 0 #include -#define NRN_FEEXCEPT (FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ) +#define NRN_FEEXCEPT (FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW) int nrn_feenableexcept() { int result = -1; result = feenableexcept(NRN_FEEXCEPT); @@ -55,18 +60,26 @@ int nrn_feenableexcept() { } #endif -int main1( int argc, char **argv, char **env ) -{ - char prcellname[1024], filesdat_buf[1024]; - - ( void )env; /* unused */ +int main1(int argc, char** argv, char** env); +void nrn_init_and_load_data(int argc, char** argv, cn_input_params& input_params); +void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init); +void nrn_init_and_load_data(int argc, char** argv, cn_input_params& input_params) { #if defined(NRN_FEEXCEPT) nrn_feenableexcept(); #endif +#ifdef ENABLE_SELECTIVE_PROFILING + stop_profile(); +#endif + // mpi initialisation - nrnmpi_init( 1, &argc, &argv ); +#if NRNMPI + nrnmpi_init(1, &argc, &argv); +#endif + + // memory footprint after mpi initialisation + report_mem_usage("After MPI_Init"); // initialise default coreneuron parameters initnrn(); @@ -74,124 +87,238 @@ int main1( int argc, char **argv, char **env ) // create mutex for nrn123, protect instance_count_ nrnran123_mutconstruct(); - // handles coreneuron configuration parameters - cn_input_params input_params; - // read command line parameters - input_params.read_cb_opts( argc, argv ); + input_params.read_cb_opts(argc, argv); + + // set global variables + celsius = input_params.celsius; + t = input_params.celsius; + +#if _OPENACC + if (!input_params.compute_gpu && input_params.cell_interleave_permute == 2) { + fprintf( + stderr, + "compiled with _OPENACC does not allow the combination of --cell_permute=2 and missing --gpu\n"); + exit(1); + } +#endif // if multi-threading enabled, make sure mpi library supports it - if ( input_params.threading ) { +#if NRNMPI + if (input_params.threading) { nrnmpi_check_threading_support(); } +#endif + + // full path of files.dat file + char filesdat_buf[1024]; + sd_ptr filesdat = input_params.get_filesdat_path(filesdat_buf, sizeof(filesdat_buf)); + + // reads mechanism information from bbcore_mech.dat + mk_mech(input_params.datpath); + + // read the global variable names and set their values from globals.dat + set_globals(input_params.datpath); + + report_mem_usage("After mk_mech"); // set global variables for start time, timestep and temperature t = input_params.tstart; - dt = input_params.dt; - celsius = input_params.celsius; - // full path of files.dat file - sd_ptr filesdat=input_params.get_filesdat_path(filesdat_buf,sizeof(filesdat_buf)); + if (input_params.dt != -1000.) { // command line arg highest precedence + dt = input_params.dt; + } else if (dt == -1000.) { // not on command line and no celsius in globals.dat + dt = 0.025; // lowest precedence + } - // memory footprint after mpi initialisation - report_mem_usage( "After MPI_Init" ); + input_params.dt = dt; // for printing - // reads mechanism information from bbcore_mech.dat - mk_mech( input_params.datpath ); + rev_dt = (int)(1. / dt); + + if (input_params.celsius != -1000.) 
{ // command line arg highest precedence + celsius = input_params.celsius; + } else if (celsius == -1000.) { // not on command line and no celsius in globals.dat + celsius = 34.0; // lowest precedence + } - report_mem_usage( "After mk_mech" ); + input_params.celsius = celsius; // for printing // create net_cvode instance mk_netcvode(); // One part done before call to nrn_setup. Other part after. - if ( input_params.patternstim ) { + if (input_params.patternstim) { nrn_set_extra_thread0_vdata(); } - report_mem_usage( "Before nrn_setup" ); + report_mem_usage("Before nrn_setup"); - // reading *.dat files and setting up the data structures - nrn_setup( input_params.datpath, filesdat, nrn_need_byteswap, input_params.threading ); + // set if need to interleave cells + use_interleave_permute = input_params.cell_interleave_permute; + cellorder_nwarp = input_params.nwarp; + use_solve_interleave = input_params.cell_interleave_permute; - report_mem_usage( "After nrn_setup " ); + // pass by flag so existing tests do not need a changed nrn_setup prototype. + nrn_setup_multiple = input_params.multiple; + nrn_setup_extracon = input_params.extracon; + + // reading *.dat files and setting up the data structures, setting mindelay + nrn_setup(input_params, filesdat, nrn_need_byteswap); + + report_mem_usage("After nrn_setup "); // Invoke PatternStim - if ( input_params.patternstim ) { - nrn_mkPatternStim( input_params.patternstim ); + if (input_params.patternstim) { + nrn_mkPatternStim(input_params.patternstim); } /// Setting the timeout nrn_set_timeout(200.); - // find mindelay and set configuration parameter - double mindelay = BBS_netpar_mindelay( input_params.maxdelay ); - - input_params.set_mindelay( mindelay ); - // show all configuration parameters for current run input_params.show_cb_opts(); - // alloctae buffer for mpi communication - mk_spikevec_buffer( input_params.spikebuf ); - - report_mem_usage( "After mk_spikevec_buffer" ); + // allocate buffer for mpi communication + mk_spikevec_buffer(input_params.spikebuf); - nrn_finitialize( 1, input_params.voltage ); + report_mem_usage("After mk_spikevec_buffer"); - report_mem_usage( "After nrn_finitialize" ); + if (input_params.compute_gpu) { + setup_nrnthreads_on_device(nrn_threads, nrn_nthread); + } - // call prcellstae for prcellgid - if ( input_params.prcellgid >= 0 ) { - sprintf( prcellname, "t%g", t ); - prcellstate( input_params.prcellgid, prcellname ); + if (nrn_have_gaps) { + nrn_partrans::gap_update_indices(); } - // handle forwardskip - if ( input_params.forwardskip > 0.0 ) { - handle_forward_skip( input_params.forwardskip, input_params.prcellgid ); + // call prcellstate for prcellgid + call_prcellstate_for_prcellgid(input_params.prcellgid, input_params.compute_gpu, 1); +} + +void call_prcellstate_for_prcellgid(int prcellgid, int compute_gpu, int is_init) { + char prcellname[1024]; +#ifdef ENABLE_CUDA + const char* prprefix = "cu"; +#else + const char* prprefix = "acc"; +#endif + + if (prcellgid >= 0) { + if (compute_gpu) { + if (is_init) + sprintf(prcellname, "%s_gpu_init", prprefix); + else + sprintf(prcellname, "%s_gpu_t%g", prprefix, t); + } else { + if (is_init) + strcpy(prcellname, "cpu_init"); + else + sprintf(prcellname, "cpu_t%g", t); + } + update_nrnthreads_on_host(nrn_threads, nrn_nthread); + prcellstate(prcellgid, prcellname); } +} + +int main1(int argc, char** argv, char** env) { + (void)env; /* unused */ + + // Initial data loading + cn_input_params input_params; + + // initializationa and loading functions moved to separate + 
nrn_init_and_load_data(argc, argv, input_params); + + #pragma acc data copyin(celsius, secondorder) if (input_params.compute_gpu) + { + nrn_finitialize(input_params.voltage != 1000., input_params.voltage); + + report_mem_usage("After nrn_finitialize"); + +#ifdef ENABLE_REPORTING + ReportGenerator* r = NULL; +#endif + + // if reports are enabled using ReportingLib + if (input_params.report) { +#ifdef ENABLE_REPORTING + if (input_params.multiple > 1) { + if (nrnmpi_myid == 0) + printf( + "\n WARNING! : Can't enable reports with model duplications feature! \n"); + } else { + r = new ReportGenerator(input_params.report, input_params.tstart, + input_params.tstop, input_params.dt, input_params.mindelay, + input_params.dt_report, input_params.outpath); + r->register_report(); + } +#else + if (nrnmpi_myid == 0) + printf("\n WARNING! : Can't enable reports, recompile with ReportingLib! \n"); +#endif + } + + // call prcellstate for prcellgid + call_prcellstate_for_prcellgid(input_params.prcellgid, input_params.compute_gpu, 0); - /// Solver execution - BBS_netpar_solve( input_params.tstop ); + // handle forwardskip + if (input_params.forwardskip > 0.0) { + handle_forward_skip(input_params.forwardskip, input_params.prcellgid); + } - // Report global cell statistics - report_cell_stats(); +#ifdef ENABLE_SELECTIVE_PROFILING + start_profile(); +#endif + + /// Solver execution + BBS_netpar_solve(input_params.tstop); - // prcellstate after end of solver - if ( input_params.prcellgid >= 0 ) { - sprintf( prcellname, "t%g", t ); - prcellstate( input_params.prcellgid, prcellname ); + // Report global cell statistics + report_cell_stats(); + +#ifdef ENABLE_SELECTIVE_PROFILING + stop_profile(); +#endif + + // prcellstate after end of solver + call_prcellstate_for_prcellgid(input_params.prcellgid, input_params.compute_gpu, 0); + +#ifdef ENABLE_REPORTING + if (input_params.report && r) + delete r; +#endif } // write spike information to input_params.outpath - output_spikes( input_params.outpath ); + output_spikes(input_params.outpath); // Cleaning the memory nrn_cleanup(); // mpi finalize +#if NRNMPI nrnmpi_finalize(); +#endif + + finalize_data_on_device(); return 0; } - /* perform forwardskip and call prcellstate for prcellgid */ -void handle_forward_skip( double forwardskip, int prcellgid ) -{ +void handle_forward_skip(double forwardskip, int prcellgid) { double savedt = dt; double savet = t; dt = forwardskip * 0.1; t = -1e9; - for ( int step = 0; step < 10; ++step ) { + for (int step = 0; step < 10; ++step) { nrn_fixed_step_minimal(); } - if ( prcellgid >= 0 ) { - prcellstate( prcellgid, "fs" ); + if (prcellgid >= 0) { + prcellstate(prcellgid, "fs"); } dt = savedt; @@ -199,8 +326,6 @@ void handle_forward_skip( double forwardskip, int prcellgid ) dt2thread(-1.); } - -const char *nrn_version( int ) -{ +const char* nrn_version(int) { return "version id unimplemented"; } diff --git a/coreneuron/nrniv/memory.h b/coreneuron/nrniv/memory.h index 0288f06fe..cf58465b3 100644 --- a/coreneuron/nrniv/memory.h +++ b/coreneuron/nrniv/memory.h @@ -31,48 +31,51 @@ THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include "coreneuron/nrniv/nrn_assert.h" namespace coreneuron { /** Independent function to compute the needed chunkding, the chunk argument is the number of doubles the chunk is chunkded upon. 
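    For example (an illustrative count, not from the original comment): with a
    chunk of 4 doubles, a count of 10 is padded up to 12 in the SoA case, while
    layout == 1 (AoS) always returns the count unchanged.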
*/ - template + template inline int soa_padded_size(int cnt, int layout) { int imod = cnt % chunk; - if (layout == 1) return cnt; + if (layout == 1) + return cnt; if (imod) { - int idiv = cnt / chunk; - return (idiv + 1) * chunk; + int idiv = cnt / chunk; + return (idiv + 1) * chunk; } return cnt; } - - + /** Check for the pointer alignment. */ inline bool is_aligned(void* pointer, size_t alignment) { - return (((uintptr_t)(const void *)(pointer)) % (alignment) == 0); + return (((uintptr_t)(const void*)(pointer)) % (alignment) == 0); } - + /** Allocate the aligned memory. */ inline void* emalloc_align(size_t size, size_t alignment) { void* memptr; - assert(posix_memalign(&memptr, alignment, size) == 0); - assert(is_aligned(memptr, alignment)); + nrn_assert(posix_memalign(&memptr, alignment, size) == 0); + nrn_assert(is_aligned(memptr, alignment)); return memptr; } - + /** Allocate the aligned memory and set it to 1. */ inline void* ecalloc_align(size_t n, size_t alignment, size_t size) { void* p; - if (n == 0) { return (void*)0; } - assert(posix_memalign(&p, alignment, n*size) == 0); - assert(is_aligned(p, alignment)); - memset(p, 1, n*size); // Avoid native division by zero (cyme...) + if (n == 0) { + return (void*)0; + } + nrn_assert(posix_memalign(&p, alignment, n * size) == 0); + nrn_assert(is_aligned(p, alignment)); + memset(p, 1, n * size); // Avoid native division by zero (cyme...) return p; } -} //end name space +} // end name space #endif diff --git a/coreneuron/nrniv/mk_mech.cpp b/coreneuron/nrniv/mk_mech.cpp index ef0068a3a..d91a5f6b1 100644 --- a/coreneuron/nrniv/mk_mech.cpp +++ b/coreneuron/nrniv/mk_mech.cpp @@ -38,107 +38,131 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/utils/sdprintf.h" #include "coreneuron/mech/cfile/cabvars.h" -static char banner[] = -"Duke, Yale, and the BlueBrain Project -- Copyright 1984-2015"; +static char banner[] = "Duke, Yale, and the BlueBrain Project -- Copyright 1984-2015"; int nrn_nobanner_; int nrn_need_byteswap; // following copied (except for nrn_need_byteswap line) from NEURON ivocvect.cpp -#define BYTEHEADER uint32_t _II__; char *_IN__; char _OUT__[16]; int BYTESWAP_FLAG=0; -#define BYTESWAP(_X__,_TYPE__) \ - BYTESWAP_FLAG = nrn_need_byteswap; \ - if (BYTESWAP_FLAG == 1) { \ - _IN__ = (char *) &(_X__); \ - for (_II__=0;_II__< sizeof(_TYPE__);_II__++) { \ - _OUT__[_II__] = _IN__[sizeof(_TYPE__)-_II__-1]; } \ - (_X__) = *((_TYPE__ *) &_OUT__); \ +#define BYTEHEADER \ + uint32_t _II__; \ + char* _IN__; \ + char _OUT__[16]; \ + int BYTESWAP_FLAG = 0; +#define BYTESWAP(_X__, _TYPE__) \ + BYTESWAP_FLAG = nrn_need_byteswap; \ + if (BYTESWAP_FLAG == 1) { \ + _IN__ = (char*)&(_X__); \ + for (_II__ = 0; _II__ < sizeof(_TYPE__); _II__++) { \ + _OUT__[_II__] = _IN__[sizeof(_TYPE__) - _II__ - 1]; \ + } \ + (_X__) = *((_TYPE__*)&_OUT__); \ } std::map mech2type; -/// Read meta data about the mechanisms and allocate corresponding mechanism management data structures +/// Read meta data about the mechanisms and allocate corresponding mechanism management data +/// structures void mk_mech(const char* datpath) { - char fnamebuf[1024]; - sd_ptr fname=sdprintf(fnamebuf, sizeof(fnamebuf), "%s/%s", datpath, "bbcore_mech.dat"); - FILE* f; - f = fopen(fname, "r"); - assert(f); -// printf("reading %s\n", fname); - int n=0; - nrn_assert(fscanf(f, "%d\n", &n) == 1); - - /// Allocate space for mechanism related data structures - alloc_mech(n); - - /// Read all the mechanisms and their meta data - for (int i=2; i < n; ++i) { - char mname[100]; - int type=0, 
pnttype=0, is_art=0, is_ion=0, dsize=0, pdsize=0; - nrn_assert(fscanf(f, "%s %d %d %d %d %d %d\n", mname, &type, &pnttype, &is_art, &is_ion, &dsize, &pdsize) == 7); - nrn_assert(i == type); + char fnamebuf[1024]; + sd_ptr fname = sdprintf(fnamebuf, sizeof(fnamebuf), "%s/%s", datpath, "bbcore_mech.dat"); + FILE* f; + f = fopen(fname, "r"); + + if (f == NULL) { + fprintf(stderr, "Error: couldn't find bbcore_mech.dat file in the dataset directory \n"); + fprintf( + stderr, + " Make sure to pass full directory path of dataset using -d DIR or --datpath=DIR \n"); + } + + nrn_assert(f); + // printf("reading %s\n", fname); + int n = 0; + nrn_assert(fscanf(f, "%d\n", &n) == 1); + + /// Allocate space for mechanism related data structures + alloc_mech(n); + + /// Read all the mechanisms and their meta data + for (int i = 2; i < n; ++i) { + char mname[100]; + int type = 0, pnttype = 0, is_art = 0, is_ion = 0, dsize = 0, pdsize = 0; + nrn_assert(fscanf(f, "%s %d %d %d %d %d %d\n", mname, &type, &pnttype, &is_art, &is_ion, + &dsize, &pdsize) == 7); + nrn_assert(i == type); #ifdef DEBUG - printf("%s %d %d %d %d %d %d\n", mname, type, pnttype, is_art, is_ion, dsize, pdsize); + printf("%s %d %d %d %d %d %d\n", mname, type, pnttype, is_art, is_ion, dsize, pdsize); #endif - std::string str(mname); - mech2type[str] = type; - pnt_map[type] = (char)pnttype; - nrn_prop_param_size_[type] = dsize; - nrn_prop_dparam_size_[type] = pdsize; - nrn_is_artificial_[type] = is_art; - if (is_ion) { - double charge = 0.; - nrn_assert(fscanf(f, "%lf\n", &charge) == 1); - // strip the _ion - char iname[100]; - strcpy(iname, mname); - iname[strlen(iname) - 4] = '\0'; - //printf("%s %s\n", mname, iname); - ion_reg(iname, charge); + std::string str(mname); + memb_func[type].sym = (Symbol*)strdup(mname); + mech2type[str] = type; + pnt_map[type] = (char)pnttype; + nrn_prop_param_size_[type] = dsize; + nrn_prop_dparam_size_[type] = pdsize; + nrn_is_artificial_[type] = is_art; + if (is_ion) { + double charge = 0.; + nrn_assert(fscanf(f, "%lf\n", &charge) == 1); + // strip the _ion + char iname[100]; + strcpy(iname, mname); + iname[strlen(iname) - 4] = '\0'; + // printf("%s %s\n", mname, iname); + ion_reg(iname, charge); + } + // printf("%s %d %d\n", mname, nrn_get_mechtype(mname), type); + } + + // an int32_t binary 1 is at this position. After reading can decide if + // binary info in files needs to be byteswapped. + int32_t x; + nrn_assert(fread(&x, sizeof(int32_t), 1, f) == 1); + nrn_need_byteswap = 0; + if (x != 1) { + BYTEHEADER; + nrn_need_byteswap = 1; + BYTESWAP(x, int32_t); + nrn_assert(x == 1); + } + + fclose(f); + + if (nrnmpi_myid < 1 && nrn_nobanner_ == 0) { + fprintf(stderr, " \n"); + fprintf(stderr, " %s\n", banner); + fprintf(stderr, " %s\n", nrn_version(1)); + fprintf(stderr, " \n"); + fflush(stderr); } - //printf("%s %d %d\n", mname, nrn_get_mechtype(mname), type); - } - - // an int32_t binary 1 is at this position. After reading can decide if - // binary info in files needs to be byteswapped. 
- int32_t x; - nrn_assert(fread(&x, sizeof(int32_t), 1, f) == 1); - nrn_need_byteswap = 0; - if (x != 1) { - BYTEHEADER; - nrn_need_byteswap = 1; - BYTESWAP(x, int32_t); - assert(x == 1); - } - - fclose(f); - - if (nrnmpi_myid < 1 && nrn_nobanner_ == 0) { - fprintf(stderr, " \n"); - fprintf(stderr, " %s\n", banner); - fprintf(stderr, " %s\n", nrn_version(1)); - fprintf(stderr, " \n"); - fflush(stderr); - } -/* will have to put this back if any mod file refers to diam */ -// register_mech(morph_mech, morph_alloc, (Pfri)0, (Pfri)0, (Pfri)0, (Pfri)0, -1, 0); - - /// Calling _reg functions for the default mechanisms from the file mech/cfile/cabvars.h - for (int i=0; mechanism[i]; i++) { - (*mechanism[i])(); - } - - /// Calling the rest of additional Neurodamus or other _reg functions from the file mod_func.c - /// (built during the config time from mech/mod_func.c.pl) - modl_reg(); + /* will have to put this back if any mod file refers to diam */ + // register_mech(morph_mech, morph_alloc, (Pfri)0, (Pfri)0, (Pfri)0, (Pfri)0, -1, 0); + + /// Calling _reg functions for the default mechanisms from the file mech/cfile/cabvars.h + for (int i = 0; mechanism[i]; i++) { + (*mechanism[i])(); + } + + /// Calling the rest of additional Neurodamus or other _reg functions from the file mod_func.c + /// (built during the config time from mech/mod_func.c.pl) + modl_reg(); } /// Get mechanism type by the mechanism name int nrn_get_mechtype(const char* name) { - std::string str(name); - std::map::const_iterator mapit; - mapit = mech2type.find(str); - if (mapit == mech2type.end()) - return -1; // Could not find the mechanism - return mapit->second; + std::string str(name); + std::map::const_iterator mapit; + mapit = mech2type.find(str); + if (mapit == mech2type.end()) + return -1; // Could not find the mechanism + return mapit->second; +} + +const char* nrn_get_mechname(int type) { + for (std::map::iterator i = mech2type.begin(); i != mech2type.end(); ++i) { + if (type == i->second) { + return i->first.c_str(); + } + } + return NULL; } diff --git a/coreneuron/nrniv/netcon.h b/coreneuron/nrniv/netcon.h index e54d08020..438da9460 100644 --- a/coreneuron/nrniv/netcon.h +++ b/coreneuron/nrniv/netcon.h @@ -52,120 +52,128 @@ class NetCvode; #define InputPreSynType 20 class DiscreteEvent { -public: - DiscreteEvent(); - virtual ~DiscreteEvent(); - virtual void send(double deliverytime, NetCvode*, NrnThread*); - virtual void deliver(double t, NetCvode*, NrnThread*); - virtual int type() { return DiscreteEventType; } + public: + DiscreteEvent(); + virtual ~DiscreteEvent(); + virtual void send(double deliverytime, NetCvode*, NrnThread*); + virtual void deliver(double t, NetCvode*, NrnThread*); + virtual int type() { + return DiscreteEventType; + } virtual void pr(const char*, double t, NetCvode*); - // actions performed over each item in the event queue. - virtual void frecord_init(TQItem*) {}; }; class NetCon : public DiscreteEvent { -public: + public: bool active_; double delay_; - DiscreteEvent* src_; // either a PreSyn or an InputPreSyn or NULL Point_process* target_; union { - double* weight_; - int srcgid_; // only to help InputPreSyn during setup + int weight_index_; + int srcgid_; // only to help InputPreSyn during setup // before weights are read and stored. Saves on transient // memory requirements by avoiding storage of all group file // netcon_srcgid lists. ie. that info is copied into here. 
} u; - NetCon(); - virtual ~NetCon(); - virtual void send(double sendtime, NetCvode*, NrnThread*); - virtual void deliver(double, NetCvode* ns, NrnThread*); - virtual int type() { return NetConType; } + NetCon(); + virtual ~NetCon(); + virtual void send(double sendtime, NetCvode*, NrnThread*); + virtual void deliver(double, NetCvode* ns, NrnThread*); + virtual int type() { + return NetConType; + } + virtual void pr(const char*, double t, NetCvode*); }; class SelfEvent : public DiscreteEvent { -public: + public: double flag_; Point_process* target_; - double* weight_; - void** movable_; // actually a TQItem** + void** movable_; // actually a TQItem** + int weight_index_; + + SelfEvent(); + virtual ~SelfEvent(); + virtual void deliver(double, NetCvode*, NrnThread*); + virtual int type() { + return SelfEventType; + } - SelfEvent(); - virtual ~SelfEvent(); - virtual void deliver(double, NetCvode*, NrnThread*); - virtual int type() { return SelfEventType; } virtual void pr(const char*, double t, NetCvode*); -private: - void call_net_receive(NetCvode*); + private: + void call_net_receive(NetCvode*); }; - class ConditionEvent : public DiscreteEvent { -public: - // condition detection factored out of PreSyn for re-use - ConditionEvent(); - virtual ~ConditionEvent(); - virtual void check(NrnThread*, double sendtime, double teps = 0.0); - virtual double value() { return -1.; } - - bool flag_; // true when below, false when above. + public: + // condition detection factored out of PreSyn for re-use + ConditionEvent(); + virtual ~ConditionEvent(); + virtual bool check(NrnThread*); + virtual double value(NrnThread*) { + return -1.; + } + + int flag_; // true when below, false when above. (changed from bool to int to avoid cray acc + // bug(?)) }; - class PreSyn : public ConditionEvent { -public: + public: #if NRNMPI - unsigned char localgid_; // compressed gid for spike transfer + unsigned char localgid_; // compressed gid for spike transfer #endif - int nc_index_; //replaces dil_, index into global NetCon** netcon_in_presyn_order_ - int nc_cnt_; // how many netcon starting at nc_index_ + int nc_index_; // replaces dil_, index into global NetCon** netcon_in_presyn_order_ + int nc_cnt_; // how many netcon starting at nc_index_ int output_index_; int gid_; double threshold_; - double* thvar_; + int thvar_index_; // >=0 points into NrnThread._actual_v Point_process* pntsrc_; - NrnThread* nt_; - PreSyn(); - virtual ~PreSyn(); - virtual void send(double sendtime, NetCvode*, NrnThread*); - virtual void deliver(double, NetCvode*, NrnThread*); - virtual int type() { return PreSynType; } + PreSyn(); + virtual ~PreSyn(); + virtual void send(double sendtime, NetCvode*, NrnThread*); + virtual void deliver(double, NetCvode*, NrnThread*); + virtual int type() { + return PreSynType; + } - virtual double value() { return *thvar_ - threshold_; } - void record(double t); + virtual double value(NrnThread*); + void record(double t); }; class InputPreSyn : public DiscreteEvent { -public: - int nc_index_; //replaces dil_, index into global NetCon** netcon_in_presyn_order_ - int nc_cnt_; // how many netcon starting at nc_index_ - int gid_; - - InputPreSyn(); - virtual ~InputPreSyn(); - virtual void send(double sendtime, NetCvode*, NrnThread*); - virtual void deliver(double, NetCvode*, NrnThread*); - virtual int type() { return InputPreSynType; } - - + public: + int nc_index_; // replaces dil_, index into global NetCon** netcon_in_presyn_order_ + int nc_cnt_; // how many netcon starting at nc_index_ + + InputPreSyn(); + 
virtual ~InputPreSyn(); + virtual void send(double sendtime, NetCvode*, NrnThread*); + virtual void deliver(double, NetCvode*, NrnThread*); + virtual int type() { + return InputPreSynType; + } }; class NetParEvent : public DiscreteEvent { -public: - int ithread_; // for pr() - double wx_, ws_; // exchange time and "spikes to Presyn" time + public: + int ithread_; // for pr() + double wx_, ws_; // exchange time and "spikes to Presyn" time + + NetParEvent(); + virtual ~NetParEvent(); + virtual void send(double, NetCvode*, NrnThread*); + virtual void deliver(double, NetCvode*, NrnThread*); + virtual int type() { + return NetParEventType; + } - NetParEvent(); - virtual ~NetParEvent(); - virtual void send(double, NetCvode*, NrnThread*); - virtual void deliver(double, NetCvode*, NrnThread*); - virtual int type() { return NetParEventType; } - - virtual void pr(const char*, double t, NetCvode*); + virtual void pr(const char*, double t, NetCvode*); }; #endif diff --git a/coreneuron/nrniv/netcvode.cpp b/coreneuron/nrniv/netcvode.cpp index 5627753af..f380f2989 100644 --- a/coreneuron/nrniv/netcvode.cpp +++ b/coreneuron/nrniv/netcvode.cpp @@ -27,6 +27,8 @@ THE POSSIBILITY OF SUCH DAMAGE. */ #include +#include +#include #include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/multicore.h" #include "coreneuron/nrnoc/nrnoc_decl.h" @@ -36,6 +38,10 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrniv/nrniv_decl.h" #include "coreneuron/nrniv/output_spikes.h" #include "coreneuron/nrniv/nrn_assert.h" +#include "coreneuron/nrniv/nrn_acc_manager.h" +#ifdef _OPENACC +#include +#endif #define PP2NT(pp) (nrn_threads + (pp)->_tid) #define PP2t(pp) (PP2NT(pp)->_t) @@ -47,548 +53,705 @@ double NetCvode::eps_; NetCvode* net_cvode_instance; int cvode_active_; +/// Flag to use the bin queue +bool nrn_use_bin_queue_ = 0; + void mk_netcvode() { - if (!net_cvode_instance) { - net_cvode_instance = new NetCvode(); - } + if (!net_cvode_instance) { + net_cvode_instance = new NetCvode(); + } } +extern void nrn_outputevent(unsigned char, double); extern "C" { extern pnt_receive_t* pnt_receive; extern pnt_receive_t* pnt_receive_init; extern short* nrn_artcell_qindex_; extern bool nrn_use_localgid_; -extern void nrn_outputevent(unsigned char, double); extern void nrn2ncs_outputevent(int netcon_output_index, double firetime); -void net_send(void**, double*, Point_process*, double, double); +void net_send(void**, int, Point_process*, double, double); void net_event(Point_process* pnt, double time); void net_move(void**, Point_process*, double); -void artcell_net_send(void**, double*, Point_process*, double, double); +void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double); +void artcell_net_send(void**, int, Point_process*, double, double); void artcell_net_move(void**, Point_process*, double); extern void nrn_fixed_step_minimal(); extern void nrn_fixed_step_group_minimal(int); #ifdef DEBUG -//temporary -static int nrn_errno_check(int type) -{ - printf("nrn_errno_check() was called on pid %d: errno=%d type=%d\n", nrnmpi_myid, errno, type); -// assert(0); - type = 0; - return 1; +// temporary +static int nrn_errno_check(int type) { + printf("nrn_errno_check() was called on pid %d: errno=%d type=%d\n", nrnmpi_myid, errno, type); + // assert(0); + type = 0; + return 1; } #endif +} +// for _OPENACC and/or NET_RECEIVE_BUFFERING +// sem 0:3 send event move +void net_sem_from_gpu(int sendtype, + int i_vdata, + int weight_index_, + int ith, + int ipnt, + double td, + double flag) { + NrnThread& nt 
= nrn_threads[ith]; + Point_process* pnt = (Point_process*)nt._vdata[ipnt]; + if (sendtype == 0) { + net_send(nt._vdata + i_vdata, weight_index_, pnt, td, flag); + } else if (sendtype == 2) { + net_move(nt._vdata + i_vdata, pnt, td); + } else { + net_event(pnt, td); + } } -void net_send(void** v, double* weight, Point_process* pnt, double td, double flag) { - NrnThread* nt = PP2NT(pnt); - NetCvodeThreadData& p = net_cvode_instance->p[nt->id]; +void net_send(void** v, int weight_index_, Point_process* pnt, double td, double flag) { + NrnThread* nt = PP2NT(pnt); + NetCvodeThreadData& p = net_cvode_instance->p[nt->id]; SelfEvent* se = new SelfEvent; - se->flag_ = flag; - se->target_ = pnt; - se->weight_ = weight; - se->movable_ = v; // needed for SaveState - assert(net_cvode_instance); - ++p.unreffed_event_cnt_; - if (td < nt->_t) { - char buf[100]; - sprintf(buf, "net_send td-t = %g", td - nt->_t); - se->pr(buf, td, net_cvode_instance); - abort(); - hoc_execerror("net_send delay < 0", 0); - } - TQItem* q; - q = net_cvode_instance->event(td, se, nt); - if (flag == 1.0) { - *v = (void*)q; - } -//printf("net_send %g %s %g %p\n", td, pnt_name(pnt), flag, *v); -} - -void artcell_net_send(void** v, double* weight, Point_process* pnt, double td, double flag) { - net_send(v, weight, pnt, td, flag); + se->flag_ = flag; + se->target_ = pnt; + se->weight_index_ = weight_index_; + se->movable_ = v; // needed for SaveState + assert(net_cvode_instance); + ++p.unreffed_event_cnt_; + if (td < nt->_t) { + char buf[100]; + sprintf(buf, "net_send td-t = %g", td - nt->_t); + se->pr(buf, td, net_cvode_instance); + abort(); + hoc_execerror("net_send delay < 0", 0); + } + TQItem* q; + q = net_cvode_instance->event(td, se, nt); + if (flag == 1.0) { + *v = (void*)q; + } + // printf("net_send %g %s %g %p\n", td, pnt_name(pnt), flag, *v); +} + +void artcell_net_send(void** v, int weight_index_, Point_process* pnt, double td, double flag) { + net_send(v, weight_index_, pnt, td, flag); } void net_event(Point_process* pnt, double time) { - PreSyn* ps = (PreSyn*)pnt->_presyn; - if (ps) { - if (time < PP2t(pnt)) { - char buf[100]; - sprintf(buf, "net_event time-t = %g", time-PP2t(pnt)); - ps->pr(buf, time, net_cvode_instance); - hoc_execerror("net_event time < t", 0); - } - ps->send(time, net_cvode_instance, ps->nt_); - } -} - -struct InterThreadEvent { - DiscreteEvent* de_; - double t_; -}; - -#define ITE_SIZE 10 + NrnThread* nt = PP2NT(pnt); + PreSyn* ps = nt->presyns + nt->pnt2presyn_ix[pnttype2presyn[pnt->_type]][pnt->_i_instance]; + if (ps) { + if (time < nt->_t) { + char buf[100]; + sprintf(buf, "net_event time-t = %g", time - nt->_t); + ps->pr(buf, time, net_cvode_instance); + hoc_execerror("net_event time < t", 0); + } + ps->send(time, net_cvode_instance, nt); + } +} + NetCvodeThreadData::NetCvodeThreadData() { - tqe_ = new TQueue(); - ite_size_ = ITE_SIZE; - ite_cnt_ = 0; - unreffed_event_cnt_ = 0; - immediate_deliver_ = -1e100; - inter_thread_events_ = new InterThreadEvent[ite_size_]; - MUTCONSTRUCT(1) + tqe_ = new TQueue(); + unreffed_event_cnt_ = 0; + inter_thread_events_.reserve(1000); + MUTCONSTRUCT(1) } NetCvodeThreadData::~NetCvodeThreadData() { - delete [] inter_thread_events_; - delete tqe_; - MUTDESTRUCT + inter_thread_events_.clear(); + delete tqe_; + MUTDESTRUCT } -/// If the PreSyn is on a different thread than the target -/// We have to lock the buffer +/// If the PreSyn is on a different thread than the target, +/// we have to lock the buffer void NetCvodeThreadData::interthread_send(double td, 
DiscreteEvent* db, NrnThread* nt) { - //bin_event(td, db, nt); - (void)nt; // avoid unused warning - - MUTLOCK - if(ite_cnt_ >= ite_size_) { - ite_size_ *= 2; - InterThreadEvent* in = new InterThreadEvent[ite_size_]; - for (int i=0; i < ite_cnt_; ++i) { - in[i].de_ = inter_thread_events_[i].de_; - in[i].t_ = inter_thread_events_[i].t_; - } - delete [] inter_thread_events_; - inter_thread_events_ = in; - } - InterThreadEvent& ite = inter_thread_events_[ite_cnt_++]; - ite.de_ = db; - ite.t_ = td; - - /* this is race condition for pthread implementation. - * we are not using cvode in coreneuron and hence - * it's safe to comment out following lines. Remember - * some locks are per thread and hence not safe to - * lock global variables - */ - //int& b = net_cvode_instance->enqueueing_; - //if (!b) { b = 1; } - MUTUNLOCK + (void)nt; // avoid unused warning + MUTLOCK + + InterThreadEvent ite; + ite.de_ = db; + ite.t_ = td; + inter_thread_events_.push_back(ite); + + MUTUNLOCK } void NetCvodeThreadData::enqueue(NetCvode* nc, NrnThread* nt) { - int i; - MUTLOCK - for (i = 0; i < ite_cnt_; ++i) { - InterThreadEvent& ite = inter_thread_events_[i]; - nc->bin_event(ite.t_, ite.de_, nt); - } - ite_cnt_ = 0; - MUTUNLOCK + MUTLOCK + for (size_t i = 0; i < inter_thread_events_.size(); ++i) { + InterThreadEvent ite = inter_thread_events_[i]; + nc->bin_event(ite.t_, ite.de_, nt); +#if COLLECT_TQueue_STATISTICS + /// TQueue::qtype::ite = 2 + tqe_->record_stat_event(2, ite.t_); +#endif + } + inter_thread_events_.clear(); + MUTUNLOCK } NetCvode::NetCvode(void) { - eps_ = 100.*DBL_EPSILON; - print_event_ = 0; - pcnt_ = 0; - p = nil; - p_construct(1); - // eventually these should not have to be thread safe - // for parallel network simulations hardly any presyns have - // a threshold and it can be very inefficient to check the entire - // presyn list for thresholds during the fixed step method. - // So keep a threshold list. + eps_ = 100. * DBL_EPSILON; + print_event_ = 0; + pcnt_ = 0; + p = nil; + p_construct(1); + // eventually these should not have to be thread safe + // for parallel network simulations hardly any presyns have + // a threshold and it can be very inefficient to check the entire + // presyn list for thresholds during the fixed step method. + // So keep a threshold list. 
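interthread_send/enqueue here replace the hand-grown resizable InterThreadEvent array with a mutex-guarded std::vector that the owning thread drains into its own queue. A compact sketch of that producer/consumer pattern, using C++11 std::mutex purely for self-containment (the patch itself relies on the MUTLOCK/MUTUNLOCK macros):

    #include <mutex>
    #include <vector>

    struct PendingEvent {
        double t;
        // payload (DiscreteEvent*) omitted in this sketch
    };

    class InterThreadBuffer {
        std::vector<PendingEvent> events_;
        std::mutex mut_;

      public:
        // Called by any thread that wants to deliver an event to this one.
        void send(double td) {
            std::lock_guard<std::mutex> lock(mut_);
            events_.push_back(PendingEvent{td});
        }

        // Called by the owning thread at a safe point; moves everything onto
        // the thread-local time queue (bin_event in the real code).
        template <class EnqueueFn>
        void drain(EnqueueFn enqueue) {
            std::lock_guard<std::mutex> lock(mut_);
            for (size_t i = 0; i < events_.size(); ++i)
                enqueue(events_[i].t);
            events_.clear();
        }
    };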
} NetCvode::~NetCvode() { - if (net_cvode_instance == (NetCvode*)this) { - net_cvode_instance = nil; - } - p_construct(0); -} + if (net_cvode_instance == (NetCvode*)this) + net_cvode_instance = nil; + p_construct(0); +} void nrn_p_construct() { - net_cvode_instance->p_construct(nrn_nthread); + net_cvode_instance->p_construct(nrn_nthread); } - void NetCvode::p_construct(int n) { - int i; - if (pcnt_ != n) { - if (p) { - delete [] p; - p = nil; - } - if (n > 0) { - p = new NetCvodeThreadData[n]; - }else{ - p = nil; - } - pcnt_ = n; - } - for (i=0; i < n; ++i) { - p[i].unreffed_event_cnt_ = 0; - } -} + int i; + if (pcnt_ != n) { + if (p) { + delete[] p; + p = nil; + } + + if (n > 0) + p = new NetCvodeThreadData[n]; + else + p = nil; + + pcnt_ = n; + } + + for (i = 0; i < n; ++i) + p[i].unreffed_event_cnt_ = 0; +} TQItem* NetCvode::bin_event(double td, DiscreteEvent* db, NrnThread* nt) { + if (nrn_use_bin_queue_) { +#if PRINT_EVENT + if (print_event_) { + db->pr("binq send", td, this); + } +#endif + return p[nt->id].tqe_->enqueue_bin(td, db); + } else { #if PRINT_EVENT - if (print_event_) {db->pr("send", td, this);} + if (print_event_) { + db->pr("send", td, this); + } #endif - return p[nt->id].tqe_->insert(td, db); + return p[nt->id].tqe_->insert(td, db); + } } TQItem* NetCvode::event(double td, DiscreteEvent* db, NrnThread* nt) { #if PRINT_EVENT - if (print_event_) { db->pr("send", td, this); } + if (print_event_) { + db->pr("send", td, this); + } #endif - return p[nt->id].tqe_->insert(td, db); + return p[nt->id].tqe_->insert(td, db); } - void NetCvode::clear_events() { - // SelfEvents need to be "freed". Other kinds of DiscreteEvents may - // already have gone out of existence so the tqe_ may contain many - // invalid item data pointers - enqueueing_ = 0; - for (int i=0; i < nrn_nthread; ++i) { - NetCvodeThreadData& d = p[i]; - delete d.tqe_; - d.tqe_ = new TQueue(); - d.unreffed_event_cnt_ = 0; - d.immediate_deliver_ = -1e100; - d.ite_cnt_ = 0; - d.tqe_->nshift_ = -1; + // DiscreteEvents may already have gone out of existence so the tqe_ + // may contain many invalid item data pointers + enqueueing_ = 0; + for (int i = 0; i < nrn_nthread; ++i) { + NetCvodeThreadData& d = p[i]; + delete d.tqe_; + d.tqe_ = new TQueue(); + d.unreffed_event_cnt_ = 0; + d.inter_thread_events_.clear(); + d.tqe_->nshift_ = -1; d.tqe_->shift_bin(nrn_threads->_t); - } + } } void NetCvode::init_events() { - for (int i=0; i < nrn_nthread; ++i) { - p[i].tqe_->nshift_ = -1; + for (int i = 0; i < nrn_nthread; ++i) { + p[i].tqe_->nshift_ = -1; p[i].tqe_->shift_bin(nrn_threads->_t); - } - for (int tid=0; tid < nrn_nthread; ++tid) {// can be done in parallel - NrnThread* nt = nrn_threads + tid; - - for (int ipre = 0; ipre < nt->n_presyn; ++ ipre) { - PreSyn* ps = nt->presyns + ipre; - ps->flag_ = false; - } - - for (int inetc = 0; inetc < nt->n_netcon; ++inetc) { - NetCon* d = nt->netcons + inetc; - if (d->target_) { - int type = d->target_->_type; - if (pnt_receive_init[type]) { -(*pnt_receive_init[type])(d->target_, d->u.weight_, 0); - }else{ - int cnt = pnt_receive_size[type]; - double* wt = d->u.weight_; - //not the first - for (int j = 1; j < cnt; ++j) { - wt[j] = 0.; - } - } - } - } - } -} + } + for (int tid = 0; tid < nrn_nthread; ++tid) { // can be done in parallel + NrnThread* nt = nrn_threads + tid; + + for (int ipre = 0; ipre < nt->n_presyn; ++ipre) { + PreSyn* ps = nt->presyns + ipre; + ps->flag_ = false; + } + + for (int inetc = 0; inetc < nt->n_netcon; ++inetc) { + NetCon* d = nt->netcons + inetc; + if 
(d->target_) { + int type = d->target_->_type; + if (pnt_receive_init[type]) { + (*pnt_receive_init[type])(d->target_, d->u.weight_index_, 0); + } else { + int cnt = pnt_receive_size[type]; + double* wt = nt->weights + d->u.weight_index_; + // not the first + for (int j = 1; j < cnt; ++j) { + wt[j] = 0.; + } + } + } + } + } +} bool NetCvode::deliver_event(double til, NrnThread* nt) { - TQItem* q; - if ((q = p[nt->id].tqe_->atomic_dq(til)) != 0) { - DiscreteEvent* de = (DiscreteEvent*)q->data_; - double tt = q->t_; - delete q; + TQItem* q; + if ((q = p[nt->id].tqe_->atomic_dq(til)) != 0) { + DiscreteEvent* de = (DiscreteEvent*)q->data_; + double tt = q->t_; + delete q; #if PRINT_EVENT - if (print_event_) { de->pr("deliver", tt, this); } + if (print_event_) { + de->pr("deliver", tt, this); + } #endif - de->deliver(tt, this, nt); - return true; - }else{ - return false; - } + de->deliver(tt, this, nt); + + /// In case of a self event we need to delete the self event + if (de->type() == SelfEventType) + delete (SelfEvent*)de; + + return true; + } else + return false; } void net_move(void** v, Point_process* pnt, double tt) { - if (!(*v)) { - hoc_execerror( "No event with flag=1 for net_move in ", memb_func[pnt->_type].sym); - } - TQItem* q = (TQItem*)(*v); -//printf("net_move tt=%g %s *v=%p\n", tt, memb_func[pnt->_type].sym, *v); - if (tt < PP2t(pnt)) { - assert(0); - } - net_cvode_instance->move_event(q, tt, PP2NT(pnt)); + if (!(*v)) + hoc_execerror("No event with flag=1 for net_move in ", memb_func[pnt->_type].sym); + + TQItem* q = (TQItem*)(*v); + // printf("net_move tt=%g %s *v=%p\n", tt, memb_func[pnt->_type].sym, *v); + if (tt < PP2t(pnt)) + nrn_assert(0); + + net_cvode_instance->move_event(q, tt, PP2NT(pnt)); } void artcell_net_move(void** v, Point_process* pnt, double tt) { - net_move(v, pnt, tt); + net_move(v, pnt, tt); } void NetCvode::move_event(TQItem* q, double tnew, NrnThread* nt) { - int tid = nt->id; + int tid = nt->id; + #if PRINT_EVENT -if (print_event_) { - SelfEvent* se = (SelfEvent*)q->data_; - printf("NetCvode::move_event self event target %s t=%g, old=%g new=%g\n", memb_func[se->target_->_type].sym, nt->_t, q->t_, tnew); -} + if (print_event_) { + SelfEvent* se = (SelfEvent*)q->data_; + printf("NetCvode::move_event self event target %s t=%g, old=%g new=%g\n", + memb_func[se->target_->_type].sym, nt->_t, q->t_, tnew); + } #endif - p[tid].tqe_->move(q, tnew); + + p[tid].tqe_->move(q, tnew); } void NetCvode::deliver_events(double til, NrnThread* nt) { -//printf("deliver_events til %20.15g\n", til); - /// Enqueue any outstanding events in the interthread event buffer - p[nt->id].enqueue(this, nt); - while(deliver_event(til, nt)) { - ; - } + // printf("deliver_events til %20.15g\n", til); + /// Enqueue any outstanding events in the interthread event buffer + p[nt->id].enqueue(this, nt); + + /// Deliver events. 
When the map is used, the loop is explicit + while (deliver_event(til, nt)) + ; } -DiscreteEvent::DiscreteEvent() {} -DiscreteEvent::~DiscreteEvent() {} +DiscreteEvent::DiscreteEvent() { +} +DiscreteEvent::~DiscreteEvent() { +} NetCon::NetCon() { - active_ = false; u.weight_ = NULL; - src_ = NULL; target_ = NULL; - delay_ = 1.0; + active_ = false; + u.weight_index_ = 0; + target_ = NULL; + delay_ = 1.0; } NetCon::~NetCon() { } - PreSyn::PreSyn() { nc_index_ = 0; - nc_cnt_ = 0; - flag_ = false; - thvar_ = NULL; - pntsrc_ = NULL; - threshold_ = 10.; - gid_ = -1; - nt_ = NULL; - localgid_ = 0; - output_index_ = 0; + nc_cnt_ = 0; + flag_ = false; + thvar_index_ = -1; + pntsrc_ = NULL; + threshold_ = 10.; + gid_ = -1; +#if NRNMPI + localgid_ = 0; +#endif + output_index_ = 0; } InputPreSyn::InputPreSyn() { - nc_index_ = -1; - nc_cnt_ = 0; - gid_ = -1; + nc_index_ = -1; + nc_cnt_ = 0; } PreSyn::~PreSyn() { -// printf("~PreSyn %p\n", this); - nrn_cleanup_presyn(this); - if (thvar_ || pntsrc_) { - if (!thvar_) { - if (pntsrc_) { - pntsrc_ = nil; - } - } - } + // printf("~PreSyn %p\n", this); + if (pntsrc_) { + pntsrc_ = nil; + } } InputPreSyn::~InputPreSyn() { -// printf("~InputPreSyn %p\n", this); - nrn_cleanup_presyn(this); } - void PreSyn::record(double tt) { - spikevec_lock(); - assert(spikevec_size < spikevec_buffer_size); - spikevec_gid[spikevec_size] = gid_; - spikevec_time[spikevec_size] = tt; - ++spikevec_size; - spikevec_unlock(); + spikevec_lock(); + assert(spikevec_size < spikevec_buffer_size); + spikevec_gid[spikevec_size] = gid_; + spikevec_time[spikevec_size] = tt; + ++spikevec_size; + spikevec_unlock(); +} + +bool ConditionEvent::check(NrnThread* nt) { + if (value(nt) > 0.0) { + if (flag_ == false) { + flag_ = true; + return true; + } + } else { + flag_ = false; + } + return false; } -void ConditionEvent::check(NrnThread* nt, double tt, double teps) { - if (value() > 0.0) { - if (flag_ == false) { - flag_ = true; - send(tt + teps, net_cvode_instance, nt); - } - }else{ - flag_ = false; - } +ConditionEvent::ConditionEvent() { +} +ConditionEvent::~ConditionEvent() { } - -ConditionEvent::ConditionEvent() {} -ConditionEvent::~ConditionEvent() {} - void DiscreteEvent::send(double tt, NetCvode* ns, NrnThread* nt) { - ns->event(tt, this, nt); + ns->event(tt, this, nt); } void DiscreteEvent::deliver(double tt, NetCvode* ns, NrnThread* nt) { - (void)tt; (void)ns; (void)nt; + (void)tt; + (void)ns; + (void)nt; } void DiscreteEvent::pr(const char* s, double tt, NetCvode* ns) { - (void)ns; - printf("%s DiscreteEvent %.15g\n", s, tt); + (void)ns; + printf("%s DiscreteEvent %.15g\n", s, tt); } void NetCon::send(double tt, NetCvode* ns, NrnThread* nt) { - if (active_ && target_) { + if (active_ && target_) { nrn_assert(PP2NT(target_) == nt); - ns->bin_event(tt, this, PP2NT(target_)); + ns->bin_event(tt, this, PP2NT(target_)); } } - + void NetCon::deliver(double tt, NetCvode* ns, NrnThread* nt) { (void)ns; - assert(target_); - if (PP2NT(target_) != nt) { + nrn_assert(target_); + + if (PP2NT(target_) != nt) printf("NetCon::deliver nt=%d target=%d\n", nt->id, PP2NT(target_)->id); - } - assert(PP2NT(target_) == nt); - int typ = target_->_type; - nt->_t = tt; -//printf("NetCon::deliver t=%g tt=%g %s\n", t, tt, pnt_name(target_)); - POINT_RECEIVE(typ, target_, u.weight_, 0); + nrn_assert(PP2NT(target_) == nt); + int typ = target_->_type; + nt->_t = tt; + + // printf("NetCon::deliver t=%g tt=%g %s\n", t, tt, pnt_name(target_)); + POINT_RECEIVE(typ, target_, u.weight_index_, 0); #ifdef DEBUG - if (errno) { 
- if (nrn_errno_check(typ)) { -hoc_warning("errno set during NetCon deliver to NET_RECEIVE", (char*)0); - } - } + if (errno && nrn_errno_check(typ)) + hoc_warning("errno set during NetCon deliver to NET_RECEIVE", (char*)0); #endif } +void NetCon::pr(const char* s, double tt, NetCvode* ns) { + (void)ns; + Point_process* pp = target_; + printf("%s NetCon target=%s[%d] %.15g\n", s, memb_func[pp->_type].sym, pp->_i_instance, tt); +} void PreSyn::send(double tt, NetCvode* ns, NrnThread* nt) { - record(tt); - { - for (int i = nc_cnt_-1; i >= 0; --i) { - NetCon* d = netcon_in_presyn_order_[nc_index_ + i]; - if (d->active_ && d->target_) { - NrnThread* n = PP2NT(d->target_); - if (nt == n) { - ns->bin_event(tt + d->delay_, d, n); - }else{ - ns->p[n->id].interthread_send(tt + d->delay_, d, n); - } - } - } - } + record(tt); + for (int i = nc_cnt_ - 1; i >= 0; --i) { + NetCon* d = netcon_in_presyn_order_[nc_index_ + i]; + if (d->active_ && d->target_) { + NrnThread* n = PP2NT(d->target_); + + if (nt == n) + ns->bin_event(tt + d->delay_, d, n); + else + ns->p[n->id].interthread_send(tt + d->delay_, d, n); + } + } + #if NRNMPI - if (output_index_ >= 0) { - if (nrn_use_localgid_) { - nrn_outputevent(localgid_, tt); - }else - nrn2ncs_outputevent(output_index_, tt); - } -#endif //NRNMPI -} - + if (output_index_ >= 0) { + if (nrn_use_localgid_) + nrn_outputevent(localgid_, tt); + else + nrn2ncs_outputevent(output_index_, tt); + } +#endif // NRNMPI +} + void InputPreSyn::send(double tt, NetCvode* ns, NrnThread* nt) { - { - for (int i = nc_cnt_-1; i >= 0; --i) { - NetCon* d = netcon_in_presyn_order_[nc_index_ + i]; - if (d->active_ && d->target_) { - NrnThread* n = PP2NT(d->target_); - if (nt == n) { - ns->bin_event(tt + d->delay_, d, n); - }else{ - ns->p[n->id].interthread_send(tt + d->delay_, d, n); - } - } - } - } + for (int i = nc_cnt_ - 1; i >= 0; --i) { + NetCon* d = netcon_in_presyn_order_[nc_index_ + i]; + if (d->active_ && d->target_) { + NrnThread* n = PP2NT(d->target_); + +#if COLLECT_TQueue_STATISTICS + /// TQueue::qtype::spike = 1 + ns->p[nt->id].tqe_->record_stat_event(1, tt); +#endif + + if (nt == n) + ns->bin_event(tt + d->delay_, d, n); + else + ns->p[n->id].interthread_send(tt + d->delay_, d, n); + } + } } void PreSyn::deliver(double, NetCvode*, NrnThread*) { - assert(0); // no PreSyn delay. + assert(0); // no PreSyn delay. } void InputPreSyn::deliver(double, NetCvode*, NrnThread*) { - assert(0); // no InputPreSyn delay. + assert(0); // no InputPreSyn delay. 
} - -SelfEvent::SelfEvent() {} -SelfEvent::~SelfEvent() {} +SelfEvent::SelfEvent() { +} +SelfEvent::~SelfEvent() { +} void SelfEvent::deliver(double tt, NetCvode* ns, NrnThread* nt) { - assert(nt == PP2NT(target_)); - PP2t(target_) = tt; -//printf("SelfEvent::deliver t=%g tt=%g %s\n", PP2t(target), tt, pnt_name(target_)); - call_net_receive(ns); + nrn_assert(nt == PP2NT(target_)); + PP2t(target_) = tt; + // printf("SelfEvent::deliver t=%g tt=%g %s\n", PP2t(target), tt, pnt_name(target_)); + call_net_receive(ns); } - void SelfEvent::call_net_receive(NetCvode* ns) { - POINT_RECEIVE(target_->_type, target_, weight_, flag_); + POINT_RECEIVE(target_->_type, target_, weight_index_, flag_); + #ifdef DEBUG - if (errno) { - if (nrn_errno_check(target_->_type)) { -hoc_warning("errno set during SelfEvent deliver to NET_RECEIVE", (char*)0); - } - } + if (errno && nrn_errno_check(target_->_type)) + hoc_warning("errno set during SelfEvent deliver to NET_RECEIVE", (char*)0); #endif - NetCvodeThreadData& nctd = ns->p[PP2NT(target_)->id]; - --nctd.unreffed_event_cnt_; + + NetCvodeThreadData& nctd = ns->p[PP2NT(target_)->id]; + --nctd.unreffed_event_cnt_; } -void SelfEvent::pr(const char* s, double tt, NetCvode *) { - printf("%s", s); - printf(" SelfEvent target=%s %.15g flag=%g\n", pnt_name(target_), tt, flag_); +void SelfEvent::pr(const char* s, double tt, NetCvode*) { + printf("%s", s); + printf(" SelfEvent target=%s %.15g flag=%g\n", pnt_name(target_), tt, flag_); } void ncs2nrn_integrate(double tstop) { - double ts; - int n = (int)((tstop - nrn_threads->_t)/dt + 1e-9); - if (n > 3) { + double ts; + int n = (int)((tstop - nrn_threads->_t) / dt + 1e-9); + + if (n > 3 && !nrn_have_gaps) { nrn_fixed_step_group_minimal(n); - }else{ + } else { #if NRNMPI - ts = tstop - dt; - assert(nrn_threads->_t <= tstop); - // It may very well be the case that we do not advance at all + ts = tstop - dt; + nrn_assert(nrn_threads->_t <= tstop); + // It may very well be the case that we do not advance at all while (nrn_threads->_t <= ts) { #else - ts = tstop - .5*dt; + ts = tstop - .5 * dt; while (nrn_threads->_t < ts) { #endif nrn_fixed_step_minimal(); - if (stoprun) {break;} - } - } - // handle all the pending flag=1 self events -for (int i=0; i < nrn_nthread; ++i) { assert(nrn_threads[i]._t == nrn_threads->_t);} -} + if (stoprun) + break; + } + } + + // handle all the pending flag=1 self events + for (int i = 0; i < nrn_nthread; ++i) + nrn_assert(nrn_threads[i]._t == nrn_threads->_t); +} // factored this out from deliver_net_events so we can // stay in the cache -void NetCvode::check_thresh(NrnThread* nt) { // for default method - int i; +// net_send_buffer added so checking can be done on gpu +// while event queueing is on cpu. 
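check_thresh below is restructured so that threshold detection can run as a data-parallel loop on the device while event creation stays on the host: crossings are recorded as cell indices in _net_send_buffer (with an atomic capture of the counter), the buffer is copied back, and only then does PreSyn::send enqueue the spikes. A host-only schematic of that two-phase pattern, with simplified names:

    // Rising-edge detector; the real loop carries OpenACC pragmas and keeps an
    // int flag per presyn instead of bool.
    static bool crossed(double v, double threshold, int* flag) {
        if (v > threshold) {
            if (!*flag) {
                *flag = 1;
                return true;
            }
        } else {
            *flag = 0;
        }
        return false;
    }

    // Phase 1 (device in the real code): scan all cells, append indices of
    // cells that just crossed threshold. Phase 2 (host): walk the buffer and
    // call PreSyn::send(t + teps) for each recorded cell.
    static void check_thresh_sketch(int ncell,
                                    const double* v,
                                    const double* threshold,
                                    int* flag,
                                    int* send_buf,
                                    int* send_cnt) {
        *send_cnt = 0;
        for (int i = 0; i < ncell; ++i) {
            if (crossed(v[i], threshold[i], &flag[i])) {
                send_buf[(*send_cnt)++] = i;  // #pragma acc atomic capture on GPU
            }
        }
    }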
+// Remember: passsing reference variable causes cray +// compiler bug + +static bool pscheck(double var, double thresh, int* flag) { + if (var > thresh) { + if (*flag == false) { + *flag = true; + return true; + } + } else { + *flag = false; + } + return false; +} - for (i=0; i < nt->ncell; ++i) { - PreSyn* ps = nt->presyns + i; - assert(ps->thvar_); - ps->check(nt, nt->_t, 1e-10); - } +double PreSyn::value(NrnThread* nt) { + return nt->_actual_v[thvar_index_] - threshold_; } -void NetCvode::deliver_net_events(NrnThread* nt) { // for default method +void NetCvode::check_thresh(NrnThread* nt) { // for default method + int i; + double teps = 1e-10; + + nt->_net_send_buffer_cnt = 0; + int net_send_buf_count = 0; + PreSyn* presyns = nt->presyns; + PreSynHelper* presyns_helper = nt->presyns_helper; + double* actual_v = nt->_actual_v; + +#if defined(_OPENACC) + int stream_id = nt->stream_id; +#endif + + if (nt->ncell == 0) + return; + +//_net_send_buffer_cnt is no longer used in openacc kernel, remove this? +//#ifdef _OPENACC +// if(nt->compute_gpu) +// acc_update_device(&(nt->_net_send_buffer_cnt), sizeof(int)); +//#endif + +// on GPU... + #pragma acc parallel loop present( \ + nt[0 : 1], \ + presyns_helper[0 : nt->n_presyn], \ + presyns[0 : nt->n_presyn], \ + actual_v[0 : nt->end]) \ + copy(net_send_buf_count) if (nt->compute_gpu) async(stream_id) + for (i = 0; i < nt->ncell; ++i) { + PreSyn* ps = presyns + i; + PreSynHelper* psh = presyns_helper + i; + int idx = 0; + int thidx = ps->thvar_index_; + double v = actual_v[thidx]; + double threshold = ps->threshold_; + int* flag = &(psh->flag_); + + if (pscheck(v, threshold, flag)) { +#ifndef _OPENACC + nt->_net_send_buffer_cnt = net_send_buf_count; + if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) { + nt->_net_send_buffer_size *= 2; + nt->_net_send_buffer = + (int*)erealloc(nt->_net_send_buffer, nt->_net_send_buffer_size * sizeof(int)); + } +#endif + + #pragma acc atomic capture + idx = net_send_buf_count++; + + nt->_net_send_buffer[idx] = i; + } + } + + #pragma acc wait(stream_id) + nt->_net_send_buffer_cnt = net_send_buf_count; + + if (nt->_net_send_buffer_cnt) { +#ifdef _OPENACC + int* nsbuffer = nt->_net_send_buffer; +#endif + #pragma acc update host(nsbuffer[0 : nt->_net_send_buffer_cnt]) if (nt->compute_gpu) \ + async(stream_id) + #pragma acc wait(stream_id) + } + + // on CPU... + for (i = 0; i < nt->_net_send_buffer_cnt; ++i) { + PreSyn* ps = nt->presyns + nt->_net_send_buffer[i]; + ps->send(nt->_t + teps, net_cvode_instance, nt); + } +} + +// events including binqueue events up to t+dt/2 +void NetCvode::deliver_net_events(NrnThread* nt) { // for default method + TQItem* q; double tm, tsav; + int tid = nt->id; tsav = nt->_t; - tm = nt->_t + 0.5*nt->_dt; + tm = nt->_t + 0.5 * nt->_dt; +tryagain: + // one of the events on the main queue may be a NetParEvent + // which due to dt round off error can result in an event + // placed on the bin queue to be delivered now, which + // can put 0 delay events on to the main queue. So loop til + // no events. The alternative would be to deliver an idt=0 event + // immediately but that would very much change the sequence + // with respect to what is being done here and it is unclear + // how to fix the value of t there. This can be a do while loop + // but I do not want to affect the case of not using a bin queue. 
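The comment above motivates the retry loop that follows: delivering a NetParEvent can, through dt round-off, drop fresh zero-delay items onto the bin queue for the current step, so the bin must be drained again before advancing. A condensed, self-contained view of that control flow (the container types and helper names are illustrative, not the project's TQueue API):

    #include <deque>
    #include <functional>
    #include <queue>
    #include <vector>

    // Minimal stand-ins: a "bin" of events due this step and a main time queue.
    struct DeliverStepSketch {
        std::deque<double> bin;
        std::priority_queue<double, std::vector<double>, std::greater<double> > main_q;

        void deliver(double /*t*/) { /* NET_RECEIVE etc. */ }

        // Deliver main-queue events up to tm; a NetParEvent among them may push
        // new zero-delay items back into `bin`.
        void deliver_events_until(double tm) {
            while (!main_q.empty() && main_q.top() <= tm) {
                deliver(main_q.top());
                main_q.pop();
            }
        }

        void deliver_step(double t, double dt) {
            double tm = t + 0.5 * dt;
            for (;;) {
                while (!bin.empty()) {      // 1. drain the current bin
                    deliver(bin.front());
                    bin.pop_front();
                }
                deliver_events_until(tm);   // 2. ordinary events up to t + dt/2
                if (bin.empty())
                    break;                  // nothing new was binned; done
            }
            // the real code then shifts the bin window to the next step
        }
    };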
+ + if (nrn_use_bin_queue_) { + while ((q = p[tid].tqe_->dequeue_bin()) != 0) { + DiscreteEvent* db = (DiscreteEvent*)q->data_; + +#if PRINT_EVENT + if (print_event_) { + db->pr("binq deliver", nrn_threads->_t, this); + } +#endif + +#if COLLECT_TQueue_STATISTICS + /// TQueue::qtype::deq = 3 + p[tid].tqe_->record_stat_event(3, q->t_); +#endif + + delete q; + db->deliver(nt->_t, this, nt); + } + // assert(int(tm/nt->_dt)%1000 == p[tid].tqe_->nshift_); + } deliver_events(tm, nt); + if (nrn_use_bin_queue_) { + if (p[tid].tqe_->top()) { + goto tryagain; + } + p[tid].tqe_->shift_bin(tm); + } + nt->_t = tsav; -} + /*before executing on gpu, we have to update the NetReceiveBuffer_t on GPU */ + update_net_receive_buffer(nt); + + for (int i = 0; i < net_buf_receive_cnt_; ++i) { + (*net_buf_receive_[i])(nt); + } +} diff --git a/coreneuron/nrniv/netcvode.h b/coreneuron/nrniv/netcvode.h index 50ca593ad..ae5adb87c 100644 --- a/coreneuron/nrniv/netcvode.h +++ b/coreneuron/nrniv/netcvode.h @@ -31,22 +31,27 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrniv/tqueue.h" -#define PRINT_EVENT 1 +#define PRINT_EVENT 0 + +/// QTYPE options include: spltree, pq_que +/// STL priority queue is used instead of the splay tree by default. +/// TO DO: check if stl queue works with move_event functions. +#define QTYPE pq_que class DiscreteEvent; -class SelfEventPool; class NetCvode; -struct InterThreadEvent; + +struct InterThreadEvent { + DiscreteEvent* de_; + double t_; +}; class NetCvodeThreadData { -public: - int ite_cnt_; - int ite_size_; + public: int unreffed_event_cnt_; - TQueue* tqe_; - InterThreadEvent* inter_thread_events_; + TQueue* tqe_; + std::vector inter_thread_events_; MUTDEC - double immediate_deliver_; NetCvodeThreadData(); virtual ~NetCvodeThreadData(); @@ -55,7 +60,7 @@ class NetCvodeThreadData { }; class NetCvode { -public: + public: int print_event_; int pcnt_; int enqueueing_; @@ -66,13 +71,15 @@ class NetCvode { virtual ~NetCvode(); void p_construct(int); void check_thresh(NrnThread*); - static double eps(double x) { return eps_*fabs(x); } + static double eps(double x) { + return eps_ * fabs(x); + } TQItem* event(double tdeliver, DiscreteEvent*, NrnThread*); void move_event(TQItem*, double, NrnThread*); TQItem* bin_event(double tdeliver, DiscreteEvent*, NrnThread*); - void deliver_net_events(NrnThread*); // for default staggered time step method - void deliver_events(double til, NrnThread*); // for initialization events - bool deliver_event(double til, NrnThread*); //uses TQueue atomically + void deliver_net_events(NrnThread*); // for default staggered time step method + void deliver_events(double til, NrnThread*); // for initialization events + bool deliver_event(double til, NrnThread*); // uses TQueue atomically void clear_events(); void init_events(); void point_receive(int, Point_process*, double*, double); diff --git a/coreneuron/nrniv/netpar.cpp b/coreneuron/nrniv/netpar.cpp index 080c99af0..6bddb3e03 100644 --- a/coreneuron/nrniv/netpar.cpp +++ b/coreneuron/nrniv/netpar.cpp @@ -33,7 +33,7 @@ THE POSSIBILITY OF SUCH DAMAGE. 
#include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/multicore.h" #include "coreneuron/nrnmpi/nrnmpi.h" -#include "coreneuron/nrniv/nrn_assert.h" +#include "coreneuron/nrnoc/nrnoc_decl.h" class PreSyn; class InputPreSyn; @@ -42,16 +42,10 @@ class InputPreSyn; #include "coreneuron/nrniv/netcvode.h" #include "coreneuron/nrniv/nrniv_decl.h" #include "coreneuron/nrniv/ivocvect.h" +#include "coreneuron/nrniv/nrn_assert.h" static double t_exchange_; -static double dt1_; // 1/dt -static void alloc_space(); - -/// Vector of maps for negative presyns -std::vector< std::map > neg_gid2out; -/// Maps for ouput and input presyns -std::map gid2out; -std::map gid2in; +static double dt1_; // 1/dt extern "C" { extern NetCvode* net_cvode_instance; @@ -61,8 +55,6 @@ int nrnmpi_spike_compress(int nspike, bool gid_compress, int xchng_meth); void nrn_spike_exchange_init(); } -static double set_mindelay(double maxdelay); - #if NRNMPI #include "coreneuron/nrnmpi/mpispike.h" @@ -75,42 +67,34 @@ extern void nrnmpi_int_allgather(int*, int*, int); void nrn2ncs_outputevent(int netcon_output_index, double firetime); } - -void nrnmpi_gid_clear(void) -{ - gid2in.clear(); - gid2out.clear(); -} - // for compressed gid info during spike exchange bool nrn_use_localgid_; void nrn_outputevent(unsigned char localgid, double firetime); -std::vector< std::map > localmaps; +std::vector > localmaps; #define NRNSTAT 1 static int nsend_, nsendmax_, nrecv_, nrecv_useful_; #if NRNSTAT -/// Needs further allocation if desired to collect the histogram statistics +/// Needs further allocation if desired to collect the histogram statistics static IvocVect* max_histogram_; -#endif +#endif -static int ocapacity_; // for spikeout_ +static int ocapacity_; // for spikeout_ // require it to be smaller than min_interprocessor_delay. -static double wt_; // wait time for nrnmpi_spike_exchange -static double wt1_; // time to find the PreSyns and send the spikes. +static double wt_; // wait time for nrnmpi_spike_exchange +static double wt1_; // time to find the PreSyns and send the spikes. static bool use_compress_; static int spfixout_capacity_; static int idxout_; static void nrn_spike_exchange_compressed(NrnThread*); -#endif // NRNMPI +#endif // NRNMPI static int active_; static double usable_mindelay_; -static double min_interprocessor_delay_; -static double mindelay_; // the one actually used. Some of our optional algorithms +static double mindelay_; // the one actually used. Some of our optional algorithms static double last_maxstep_arg_; -static NetParEvent* npe_; // nrn_nthread of them -static int n_npe_; // just to compare with nrn_nthread +static NetParEvent* npe_; // nrn_nthread of them +static int n_npe_; // just to compare with nrn_nthread #if NRNMPI // for combination of threads and mpi. 
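The netcvode.h hunk above selects the queue implementation at compile time via QTYPE, defaulting to an STL priority queue (pq_que) instead of the splay tree. The essential ordering contract is just "smallest delivery time first", which the standard container expresses directly; a minimal illustration (not the project's TQueue API):

    #include <cstdio>
    #include <functional>
    #include <queue>
    #include <utility>
    #include <vector>

    typedef std::pair<double, int> TimedEvent;  // (delivery time, event id)

    int main() {
        // min-heap ordered by delivery time, mirroring the pq_que choice
        std::priority_queue<TimedEvent, std::vector<TimedEvent>, std::greater<TimedEvent> > q;
        q.push(TimedEvent(0.250, 1));
        q.push(TimedEvent(0.100, 2));
        q.push(TimedEvent(0.175, 3));
        while (!q.empty()) {
            std::printf("deliver event %d at t=%g\n", q.top().second, q.top().first);
            q.pop();  // events come out in time order: 2, 3, 1
        }
        return 0;
    }

As the header comment itself notes, a plain priority queue does not support relocating an already-enqueued item, which is why compatibility with the move_event path is flagged as a TO DO.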
@@ -119,495 +103,551 @@ static MUTDEC #endif #endif -NetParEvent::NetParEvent(){ - wx_ = ws_ = 0.; - ithread_ = -1; + /// Allocate space for spikes: 200 structs of {int gid; double time} + /// coming from nrnmpi.h and array of int of the global domain size + static void + alloc_mpi_space() { +#if NRNMPI + if (!spikeout_) { + ocapacity_ = 100; + spikeout_ = (NRNMPI_Spike*)emalloc(ocapacity_ * sizeof(NRNMPI_Spike)); + icapacity_ = 100; + spikein_ = (NRNMPI_Spike*)malloc(icapacity_ * sizeof(NRNMPI_Spike)); + nin_ = (int*)emalloc(nrnmpi_numprocs * sizeof(int)); +#if nrn_spikebuf_size > 0 + spbufout_ = (NRNMPI_Spikebuf*)emalloc(sizeof(NRNMPI_Spikebuf)); + spbufin_ = (NRNMPI_Spikebuf*)emalloc(nrnmpi_numprocs * sizeof(NRNMPI_Spikebuf)); +#endif + } +#endif +} + +NetParEvent::NetParEvent() { + wx_ = ws_ = 0.; + ithread_ = -1; } -NetParEvent::~NetParEvent(){ +NetParEvent::~NetParEvent() { } -void NetParEvent::send(double tt, NetCvode* nc, NrnThread* nt){ - nc->event(tt + usable_mindelay_, this, nt); +void NetParEvent::send(double tt, NetCvode* nc, NrnThread* nt) { + nc->event(tt + usable_mindelay_, this, nt); } -void NetParEvent::deliver(double tt, NetCvode* nc, NrnThread* nt){ - net_cvode_instance->deliver_events(tt, nt); - nt->_stop_stepping = 1; - nt->_t = tt; - send(tt, nc, nt); +void NetParEvent::deliver(double tt, NetCvode* nc, NrnThread* nt) { + net_cvode_instance->deliver_events(tt, nt); + nt->_stop_stepping = 1; + nt->_t = tt; + send(tt, nc, nt); } -void NetParEvent::pr(const char* m, double tt, NetCvode*){ - printf("%s NetParEvent %d t=%.15g tt-t=%g\n", m, ithread_, tt, tt - nrn_threads[ithread_]._t); +void NetParEvent::pr(const char* m, double tt, NetCvode*) { + printf("%s NetParEvent %d t=%.15g tt-t=%g\n", m, ithread_, tt, tt - nrn_threads[ithread_]._t); } #if NRNMPI inline static void sppk(unsigned char* c, int gid) { - for (int i = localgid_size_-1; i >= 0; --i) { - c[i] = gid & 255; - gid >>= 8; - } + for (int i = localgid_size_ - 1; i >= 0; --i) { + c[i] = gid & 255; + gid >>= 8; + } } inline static int spupk(unsigned char* c) { - int gid = *c++; - for (int i = 1; i < localgid_size_; ++i) { - gid <<= 8; - gid += *c++; - } - return gid; + int gid = *c++; + for (int i = 1; i < localgid_size_; ++i) { + gid <<= 8; + gid += *c++; + } + return gid; } void nrn_outputevent(unsigned char localgid, double firetime) { - if (!active_) { return; } - MUTLOCK - nout_++; - int i = idxout_; - idxout_ += 2; - if (idxout_ >= spfixout_capacity_) { - spfixout_capacity_ *= 2; - spfixout_ = (unsigned char*)erealloc(spfixout_, spfixout_capacity_*sizeof(unsigned char)); - } - spfixout_[i++] = (unsigned char)((firetime - t_exchange_)*dt1_ + .5); - spfixout_[i] = localgid; -//printf("%d idx=%d lgid=%d firetime=%g t_exchange_=%g [0]=%d [1]=%d\n", nrnmpi_myid, i, (int)localgid, firetime, t_exchange_, (int)spfixout_[i-1], (int)spfixout_[i]); - MUTUNLOCK + if (!active_) { + return; + } + MUTLOCK + nout_++; + int i = idxout_; + idxout_ += 2; + if (idxout_ >= spfixout_capacity_) { + spfixout_capacity_ *= 2; + spfixout_ = (unsigned char*)erealloc(spfixout_, spfixout_capacity_ * sizeof(unsigned char)); + } + spfixout_[i++] = (unsigned char)((firetime - t_exchange_) * dt1_ + .5); + spfixout_[i] = localgid; + // printf("%d idx=%d lgid=%d firetime=%g t_exchange_=%g [0]=%d [1]=%d\n", nrnmpi_myid, i, + // (int)localgid, firetime, t_exchange_, (int)spfixout_[i-1], (int)spfixout_[i]); + MUTUNLOCK } void nrn2ncs_outputevent(int gid, double firetime) { - if (!active_) { return; } - MUTLOCK + if (!active_) { + return; + } + 
MUTLOCK if (use_compress_) { - nout_++; - int i = idxout_; - idxout_ += 1 + localgid_size_; - if (idxout_ >= spfixout_capacity_) { - spfixout_capacity_ *= 2; - spfixout_ = (unsigned char*)erealloc(spfixout_, spfixout_capacity_*sizeof(unsigned char)); - } -//printf("%d nrnncs_outputevent %d %.20g %.20g %d\n", nrnmpi_myid, gid, firetime, t_exchange_, -//(int)((unsigned char)((firetime - t_exchange_)*dt1_ + .5))); - spfixout_[i++] = (unsigned char)((firetime - t_exchange_)*dt1_ + .5); -//printf("%d idx=%d firetime=%g t_exchange_=%g spfixout=%d\n", nrnmpi_myid, i, firetime, t_exchange_, (int)spfixout_[i-1]); - sppk(spfixout_+i, gid); -//printf("%d idx=%d gid=%d spupk=%d\n", nrnmpi_myid, i, gid, spupk(spfixout_+i)); - }else{ + nout_++; + int i = idxout_; + idxout_ += 1 + localgid_size_; + if (idxout_ >= spfixout_capacity_) { + spfixout_capacity_ *= 2; + spfixout_ = + (unsigned char*)erealloc(spfixout_, spfixout_capacity_ * sizeof(unsigned char)); + } + // printf("%d nrnncs_outputevent %d %.20g %.20g %d\n", nrnmpi_myid, gid, firetime, + // t_exchange_, + //(int)((unsigned char)((firetime - t_exchange_)*dt1_ + .5))); + spfixout_[i++] = (unsigned char)((firetime - t_exchange_) * dt1_ + .5); + // printf("%d idx=%d firetime=%g t_exchange_=%g spfixout=%d\n", nrnmpi_myid, i, firetime, + // t_exchange_, (int)spfixout_[i-1]); + sppk(spfixout_ + i, gid); + // printf("%d idx=%d gid=%d spupk=%d\n", nrnmpi_myid, i, gid, spupk(spfixout_+i)); + } else { #if nrn_spikebuf_size == 0 - int i = nout_++; - if (i >= ocapacity_) { - ocapacity_ *= 2; - spikeout_ = (NRNMPI_Spike*)erealloc(spikeout_, ocapacity_*sizeof(NRNMPI_Spike)); - } -//printf("%d cell %d in slot %d fired at %g\n", nrnmpi_myid, gid, i, firetime); - spikeout_[i].gid = gid; - spikeout_[i].spiketime = firetime; + int i = nout_++; + if (i >= ocapacity_) { + ocapacity_ *= 2; + spikeout_ = (NRNMPI_Spike*)erealloc(spikeout_, ocapacity_ * sizeof(NRNMPI_Spike)); + } + // printf("%d cell %d in slot %d fired at %g\n", nrnmpi_myid, gid, i, firetime); + spikeout_[i].gid = gid; + spikeout_[i].spiketime = firetime; #else - int i = nout_++; - if (i >= nrn_spikebuf_size) { - i -= nrn_spikebuf_size; - if (i >= ocapacity_) { - ocapacity_ *= 2; - spikeout_ = (NRNMPI_Spike*)hoc_Erealloc(spikeout_, ocapacity_*sizeof(NRNMPI_Spike)); hoc_malchk(); - } - spikeout_[i].gid = gid; - spikeout_[i].spiketime = firetime; - }else{ - spbufout_->gid[i] = gid; - spbufout_->spiketime[i] = firetime; - } + int i = nout_++; + if (i >= nrn_spikebuf_size) { + i -= nrn_spikebuf_size; + if (i >= ocapacity_) { + ocapacity_ *= 2; + spikeout_ = + (NRNMPI_Spike*)hoc_Erealloc(spikeout_, ocapacity_ * sizeof(NRNMPI_Spike)); + hoc_malchk(); + } + spikeout_[i].gid = gid; + spikeout_[i].spiketime = firetime; + } else { + spbufout_->gid[i] = gid; + spbufout_->spiketime[i] = firetime; + } #endif } - MUTUNLOCK -//printf("%d cell %d in slot %d fired at %g\n", nrnmpi_myid, gid, i, firetime); + MUTUNLOCK + // printf("%d cell %d in slot %d fired at %g\n", nrnmpi_myid, gid, i, firetime); } -#endif // NRNMPI +#endif // NRNMPI static int nrn_need_npe() { - int b = 0; - if (active_) { b = 1; } - if (nrn_nthread > 1) { b = 1; } - if (b) { - if (last_maxstep_arg_ == 0) { - last_maxstep_arg_ = 100.; - } - set_mindelay(last_maxstep_arg_); - }else{ - if (npe_) { - delete [] npe_; - npe_ = nil; - n_npe_ = 0; - } - } - return b; + int b = 0; + if (active_) { + b = 1; + } + if (nrn_nthread > 1) { + b = 1; + } + if (b) { + if (last_maxstep_arg_ == 0) { + last_maxstep_arg_ = 100.; + } + } else { + if (npe_) { + delete[] 
npe_; + npe_ = nil; + n_npe_ = 0; + } + } + return b; } #define TBUFSIZE 0 void nrn_spike_exchange_init() { -//printf("nrn_spike_exchange_init\n"); - if (!nrn_need_npe()) { return; } - alloc_space(); -//printf("nrnmpi_use=%d active=%d\n", nrnmpi_use, active_); + // printf("nrn_spike_exchange_init\n"); + if (!nrn_need_npe()) { + return; + } + alloc_mpi_space(); + // printf("nrnmpi_use=%d active=%d\n", nrnmpi_use, active_); std::map::iterator gid2in_it; - usable_mindelay_ = mindelay_; - if (nrn_nthread > 1) { - usable_mindelay_ -= dt; - } - if ((usable_mindelay_ < 1e-9) || (usable_mindelay_ < dt)) { - if (nrnmpi_myid == 0) { - hoc_execerror("usable mindelay is 0", "(or less than dt for fixed step method)"); - }else{ - return; - } - } + usable_mindelay_ = mindelay_; + if (nrn_nthread > 1) { + usable_mindelay_ -= dt; + } + if ((usable_mindelay_ < 1e-9) || (usable_mindelay_ < dt)) { + if (nrnmpi_myid == 0) { + hoc_execerror("usable mindelay is 0", "(or less than dt for fixed step method)"); + } else { + return; + } + } #if TBUFSIZE - itbuf_ = 0; + itbuf_ = 0; #endif - if (n_npe_ != nrn_nthread) { - if (npe_) { delete [] npe_; } - npe_ = new NetParEvent[nrn_nthread]; - n_npe_ = nrn_nthread; - } - for (int i = 0; i < nrn_nthread; ++i) { - npe_[i].ithread_ = i; - npe_[i].wx_ = 0.; - npe_[i].ws_ = 0.; - npe_[i].send(t, net_cvode_instance, nrn_threads + i); - } + if (n_npe_ != nrn_nthread) { + if (npe_) { + delete[] npe_; + } + npe_ = new NetParEvent[nrn_nthread]; + n_npe_ = nrn_nthread; + } + for (int i = 0; i < nrn_nthread; ++i) { + npe_[i].ithread_ = i; + npe_[i].wx_ = 0.; + npe_[i].ws_ = 0.; + npe_[i].send(t, net_cvode_instance, nrn_threads + i); + } #if NRNMPI if (use_compress_) { - idxout_ = 2; - t_exchange_ = t; - dt1_ = 1./dt; - usable_mindelay_ = floor(mindelay_ * dt1_ + 1e-9) * dt; - assert (usable_mindelay_ >= dt && (usable_mindelay_ * dt1_) < 255); - }else{ + idxout_ = 2; + t_exchange_ = t; + dt1_ = rev_dt; + usable_mindelay_ = floor(mindelay_ * dt1_ + 1e-9) * dt; + assert(usable_mindelay_ >= dt && (usable_mindelay_ * dt1_) < 255); + } else { #if nrn_spikebuf_size > 0 - if (spbufout_) { - spbufout_->nspike = 0; - } + if (spbufout_) { + spbufout_->nspike = 0; + } #endif } - nout_ = 0; - nsend_ = nsendmax_ = nrecv_ = nrecv_useful_ = 0; - if (nrnmpi_numprocs > 0) { - if (nrn_nthread > 0) { + nout_ = 0; + nsend_ = nsendmax_ = nrecv_ = nrecv_useful_ = 0; + if (nrnmpi_numprocs > 0) { + if (nrn_nthread > 0) { #if (USE_PTHREAD || defined(_OPENMP)) - if (!mut_) { - MUTCONSTRUCT(1) - } + if (!mut_) { + MUTCONSTRUCT(1) + } #endif - }else{ - MUTDESTRUCT - } - } -#endif // NRNMPI - //if (nrnmpi_myid == 0){printf("usable_mindelay_ = %g\n", usable_mindelay_);} + } else { + MUTDESTRUCT + } + } +#endif // NRNMPI + // if (nrnmpi_myid == 0){printf("usable_mindelay_ = %g\n", usable_mindelay_);} } #if NRNMPI void nrn_spike_exchange(NrnThread* nt) { - if (!active_) { return; } - if (use_compress_) { nrn_spike_exchange_compressed(nt); return; } + if (!active_) { + return; + } + if (use_compress_) { + nrn_spike_exchange_compressed(nt); + return; + } #if TBUFSIZE - nrnmpi_barrier(); + nrnmpi_barrier(); #endif - double wt; - int i, n; + double wt; + int i, n; std::map::iterator gid2in_it; #if NRNSTAT - nsend_ += nout_; - if (nsendmax_ < nout_) { nsendmax_ = nout_; } + nsend_ += nout_; + if (nsendmax_ < nout_) { + nsendmax_ = nout_; + } #endif #if nrn_spikebuf_size > 0 - spbufout_->nspike = nout_; + spbufout_->nspike = nout_; #endif - wt = nrnmpi_wtime(); + wt = nrn_wtime(); - n = nrnmpi_spike_exchange(); + n = 
nrnmpi_spike_exchange(); - wt_ = nrnmpi_wtime() - wt; - wt = nrnmpi_wtime(); + wt_ = nrn_wtime() - wt; + wt = nrn_wtime(); #if TBUFSIZE - tbuf_[itbuf_++] = (unsigned long)nout_; - tbuf_[itbuf_++] = (unsigned long)n; + tbuf_[itbuf_++] = (unsigned long)nout_; + tbuf_[itbuf_++] = (unsigned long)n; #endif - errno = 0; -//if (n > 0) { -//printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout_, n); -//} - nout_ = 0; - if (n == 0) { + errno = 0; + // if (n > 0) { + // printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout_, n); + //} + nout_ = 0; + if (n == 0) { #if NRNSTAT - if (max_histogram_) { vector_vec(max_histogram_)[0] += 1.; } + if (max_histogram_) { + vector_vec(max_histogram_)[0] += 1.; + } #endif - return; - } + return; + } #if NRNSTAT - nrecv_ += n; - if (max_histogram_) { - int mx = 0; - if (n > 0) { - for (i=nrnmpi_numprocs-1 ; i >= 0; --i) { + nrecv_ += n; + if (max_histogram_) { + int mx = 0; + if (n > 0) { + for (i = nrnmpi_numprocs - 1; i >= 0; --i) { #if nrn_spikebuf_size == 0 - if (mx < nin_[i]) { - mx = nin_[i]; - } + if (mx < nin_[i]) { + mx = nin_[i]; + } #else - if (mx < spbufin_[i].nspike) { - mx = spbufin_[i].nspike; - } + if (mx < spbufin_[i].nspike) { + mx = spbufin_[i].nspike; + } #endif - } - } - int ms = vector_capacity(max_histogram_)-1; - mx = (mx < ms) ? mx : ms; - vector_vec(max_histogram_)[mx] += 1.; - } -#endif // NRNSTAT + } + } + int ms = vector_capacity(max_histogram_) - 1; + mx = (mx < ms) ? mx : ms; + vector_vec(max_histogram_)[mx] += 1.; + } +#endif // NRNSTAT #if nrn_spikebuf_size > 0 - for (i = 0; i < nrnmpi_numprocs; ++i) { - int j; - int nn = spbufin_[i].nspike; - if (nn > nrn_spikebuf_size) { nn = nrn_spikebuf_size; } - for (j=0; j < nn; ++j) { + for (i = 0; i < nrnmpi_numprocs; ++i) { + int j; + int nn = spbufin_[i].nspike; + if (nn > nrn_spikebuf_size) { + nn = nrn_spikebuf_size; + } + for (j = 0; j < nn; ++j) { gid2in_it = gid2in.find(spbufin_[i].gid[j]); if (gid2in_it != gid2in.end()) { InputPreSyn* ps = gid2in_it->second; ps->send(spbufin_[i].spiketime[j], net_cvode_instance, nt); #if NRNSTAT - ++nrecv_useful_; + ++nrecv_useful_; #endif - } - } - } - n = ovfl_; -#endif // nrn_spikebuf_size > 0 - for (i = 0; i < n; ++i) { + } + } + } + n = ovfl_; +#endif // nrn_spikebuf_size > 0 + for (i = 0; i < n; ++i) { gid2in_it = gid2in.find(spikein_[i].gid); if (gid2in_it != gid2in.end()) { InputPreSyn* ps = gid2in_it->second; - ps->send(spikein_[i].spiketime, net_cvode_instance, nt); + ps->send(spikein_[i].spiketime, net_cvode_instance, nt); #if NRNSTAT - ++nrecv_useful_; + ++nrecv_useful_; #endif - } - } - wt1_ = nrnmpi_wtime() - wt; + } + } + wt1_ = nrn_wtime() - wt; } void nrn_spike_exchange_compressed(NrnThread* nt) { - if (!active_) { return; } + if (!active_) { + return; + } #if TBUFSIZE - nrnmpi_barrier(); + nrnmpi_barrier(); #endif - double wt; - int i, n, idx; + double wt; + int i, n, idx; std::map::iterator gid2in_it; #if NRNSTAT - nsend_ += nout_; - if (nsendmax_ < nout_) { nsendmax_ = nout_; } + nsend_ += nout_; + if (nsendmax_ < nout_) { + nsendmax_ = nout_; + } #endif - assert(nout_ < 0x10000); - spfixout_[1] = (unsigned char)(nout_ & 0xff); - spfixout_[0] = (unsigned char)(nout_>>8); - - wt = nrnmpi_wtime(); - n = nrnmpi_spike_exchange_compressed(); - wt_ = nrnmpi_wtime() - wt; - wt = nrnmpi_wtime(); + assert(nout_ < 0x10000); + spfixout_[1] = (unsigned char)(nout_ & 0xff); + spfixout_[0] = (unsigned char)(nout_ >> 8); + + wt = nrn_wtime(); + n = nrnmpi_spike_exchange_compressed(); + wt_ = nrn_wtime() - 
wt; + wt = nrn_wtime(); #if TBUFSIZE - tbuf_[itbuf_++] = (unsigned long)nout_; - tbuf_[itbuf_++] = (unsigned long)n; -#endif - errno = 0; -//if (n > 0) { -//printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout_, n); -//} - nout_ = 0; - idxout_ = 2; - if (n == 0) { + tbuf_[itbuf_++] = (unsigned long)nout_; + tbuf_[itbuf_++] = (unsigned long)n; +#endif + errno = 0; + // if (n > 0) { + // printf("%d nrn_spike_exchange sent %d received %d\n", nrnmpi_myid, nout_, n); + //} + nout_ = 0; + idxout_ = 2; + if (n == 0) { #if NRNSTAT - if (max_histogram_) { vector_vec(max_histogram_)[0] += 1.; } + if (max_histogram_) { + vector_vec(max_histogram_)[0] += 1.; + } #endif - t_exchange_ = nrn_threads->_t; - return; - } + t_exchange_ = nrn_threads->_t; + return; + } #if NRNSTAT - nrecv_ += n; - if (max_histogram_) { - int mx = 0; - if (n > 0) { - for (i=nrnmpi_numprocs-1 ; i >= 0; --i) { - if (mx < nin_[i]) { - mx = nin_[i]; - } - } - } - int ms = vector_capacity(max_histogram_)-1; - mx = (mx < ms) ? mx : ms; - vector_vec(max_histogram_)[mx] += 1.; - } -#endif // NRNSTAT + nrecv_ += n; + if (max_histogram_) { + int mx = 0; + if (n > 0) { + for (i = nrnmpi_numprocs - 1; i >= 0; --i) { + if (mx < nin_[i]) { + mx = nin_[i]; + } + } + } + int ms = vector_capacity(max_histogram_) - 1; + mx = (mx < ms) ? mx : ms; + vector_vec(max_histogram_)[mx] += 1.; + } +#endif // NRNSTAT if (nrn_use_localgid_) { - int idxov = 0; - for (i = 0; i < nrnmpi_numprocs; ++i) { - int j, nnn; - int nn = nin_[i]; - if (nn) { - if (i == nrnmpi_myid) { // skip but may need to increment idxov. - if (nn > ag_send_nspike_) { - idxov += (nn - ag_send_nspike_)*(1 + localgid_size_); - } - continue; - } - std::map gps = localmaps[i]; - if (nn > ag_send_nspike_) { - nnn = ag_send_nspike_; - }else{ - nnn = nn; - } - idx = 2 + i*ag_send_size_; - for (j=0; j < nnn; ++j) { - // order is (firetime,gid) pairs. - double firetime = spfixin_[idx++]*dt + t_exchange_; - int lgid = (int)spfixin_[idx]; - idx += localgid_size_; - gid2in_it = gps.find(lgid); - if (gid2in_it != gps.end()) { - InputPreSyn* ps = gid2in_it->second; - ps->send(firetime + 1e-10, net_cvode_instance, nt); + int idxov = 0; + for (i = 0; i < nrnmpi_numprocs; ++i) { + int j, nnn; + int nn = nin_[i]; + if (nn) { + if (i == nrnmpi_myid) { // skip but may need to increment idxov. + if (nn > ag_send_nspike_) { + idxov += (nn - ag_send_nspike_) * (1 + localgid_size_); + } + continue; + } + std::map gps = localmaps[i]; + if (nn > ag_send_nspike_) { + nnn = ag_send_nspike_; + } else { + nnn = nn; + } + idx = 2 + i * ag_send_size_; + for (j = 0; j < nnn; ++j) { + // order is (firetime,gid) pairs. 
+ double firetime = spfixin_[idx++] * dt + t_exchange_; + int lgid = (int)spfixin_[idx]; + idx += localgid_size_; + gid2in_it = gps.find(lgid); + if (gid2in_it != gps.end()) { + InputPreSyn* ps = gid2in_it->second; + ps->send(firetime + 1e-10, net_cvode_instance, nt); #if NRNSTAT - ++nrecv_useful_; + ++nrecv_useful_; +#endif + } + } + for (; j < nn; ++j) { + double firetime = spfixin_ovfl_[idxov++] * dt + t_exchange_; + int lgid = (int)spfixin_ovfl_[idxov]; + idxov += localgid_size_; + gid2in_it = gps.find(lgid); + if (gid2in_it != gps.end()) { + InputPreSyn* ps = gid2in_it->second; + ps->send(firetime + 1e-10, net_cvode_instance, nt); +#if NRNSTAT + ++nrecv_useful_; #endif - } - } - for ( ; j < nn; ++j) { - double firetime = spfixin_ovfl_[idxov++]*dt + t_exchange_; - int lgid = (int)spfixin_ovfl_[idxov]; - idxov += localgid_size_; - gid2in_it = gps.find(lgid); - if (gid2in_it != gps.end()) { - InputPreSyn* ps = gid2in_it->second; - ps->send(firetime+1e-10, net_cvode_instance, nt); + } + } + } + } + } else { + for (i = 0; i < nrnmpi_numprocs; ++i) { + int j; + int nn = nin_[i]; + if (nn > ag_send_nspike_) { + nn = ag_send_nspike_; + } + idx = 2 + i * ag_send_size_; + for (j = 0; j < nn; ++j) { + // order is (firetime,gid) pairs. + double firetime = spfixin_[idx++] * dt + t_exchange_; + int gid = spupk(spfixin_ + idx); + idx += localgid_size_; + gid2in_it = gid2in.find(gid); + if (gid2in_it != gid2in.end()) { + InputPreSyn* ps = gid2in_it->second; + ps->send(firetime + 1e-10, net_cvode_instance, nt); #if NRNSTAT - ++nrecv_useful_; + ++nrecv_useful_; #endif - } - } - } - } - }else{ - for (i = 0; i < nrnmpi_numprocs; ++i) { - int j; - int nn = nin_[i]; - if (nn > ag_send_nspike_) { nn = ag_send_nspike_; } - idx = 2 + i*ag_send_size_; - for (j=0; j < nn; ++j) { - // order is (firetime,gid) pairs. 
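The compressed exchange path decodes each spike as one byte of time offset (in dt units relative to t_exchange_) followed by localgid_size_ bytes of gid, which sppk/spupk pack and unpack big-endian. A self-contained sketch of that encoding (the constant is illustrative):

    #include <assert.h>

    #define LOCALGID_SIZE 1 /* one byte when every rank has at most 256 output gids */

    static void pack_gid(unsigned char* c, int gid) { /* mirrors sppk() */
        for (int i = LOCALGID_SIZE - 1; i >= 0; --i) {
            c[i] = (unsigned char)(gid & 255);
            gid >>= 8;
        }
    }

    static int unpack_gid(const unsigned char* c) { /* mirrors spupk() */
        int gid = *c++;
        for (int i = 1; i < LOCALGID_SIZE; ++i) {
            gid = (gid << 8) + *c++;
        }
        return gid;
    }

    static unsigned char pack_time(double firetime, double t_exchange, double dt1) {
        /* dt1 = 1/dt; the offset must fit in a byte, hence the requirement
           elsewhere in the patch that usable_mindelay_ * dt1 stays below 255. */
        double steps = (firetime - t_exchange) * dt1 + 0.5;
        assert(steps >= 0.0 && steps < 256.0);
        return (unsigned char)steps;
    }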
- double firetime = spfixin_[idx++]*dt + t_exchange_; - int gid = spupk(spfixin_ + idx); - idx += localgid_size_; + } + } + } + n = ovfl_; + idx = 0; + for (i = 0; i < n; ++i) { + double firetime = spfixin_ovfl_[idx++] * dt + t_exchange_; + int gid = spupk(spfixin_ovfl_ + idx); + idx += localgid_size_; gid2in_it = gid2in.find(gid); if (gid2in_it != gid2in.end()) { InputPreSyn* ps = gid2in_it->second; - ps->send(firetime+1e-10, net_cvode_instance, nt); + ps->send(firetime + 1e-10, net_cvode_instance, nt); #if NRNSTAT - ++nrecv_useful_; + ++nrecv_useful_; #endif - } - } - } - n = ovfl_; - idx = 0; - for (i = 0; i < n; ++i) { - double firetime = spfixin_ovfl_[idx++]*dt + t_exchange_; - int gid = spupk(spfixin_ovfl_ + idx); - idx += localgid_size_; - gid2in_it = gid2in.find(gid); - if (gid2in_it != gid2in.end()) { - InputPreSyn* ps = gid2in_it->second; - ps->send(firetime+1e-10, net_cvode_instance, nt); -#if NRNSTAT - ++nrecv_useful_; -#endif - } - } + } + } } - t_exchange_ = nrn_threads->_t; - wt1_ = nrnmpi_wtime() - wt; + t_exchange_ = nrn_threads->_t; + wt1_ = nrn_wtime() - wt; } static void mk_localgid_rep() { int i, k; - // how many gids are there on this machine - // and can they be compressed into one byte - int ngid = 0; + // how many gids are there on this machine + // and can they be compressed into one byte + int ngid = 0; std::map::iterator gid2out_it; std::map::iterator gid2in_it; - for(gid2out_it = gid2out.begin(); gid2out_it != gid2out.end(); ++gid2out_it) { + for (gid2out_it = gid2out.begin(); gid2out_it != gid2out.end(); ++gid2out_it) { if (gid2out_it->second->output_index_ >= 0) { - ++ngid; - } - } - - int ngidmax = nrnmpi_int_allmax(ngid); - if (ngidmax > 256) { - //do not compress - return; - } - localgid_size_ = sizeof(unsigned char); - nrn_use_localgid_ = true; - - // allocate Allgather receive buffer (send is the nrnmpi_myid one) - int* rbuf = new int[nrnmpi_numprocs*(ngidmax + 1)]; - int* sbuf = new int[ngidmax + 1]; - - sbuf[0] = ngid; - ++sbuf; - ngid = 0; - // define the local gid and fill with the gids on this machine - for(gid2out_it = gid2out.begin(); gid2out_it != gid2out.end(); ++gid2out_it) { + ++ngid; + } + } + + int ngidmax = nrnmpi_int_allmax(ngid); + if (ngidmax > 256) { + // do not compress + return; + } + localgid_size_ = sizeof(unsigned char); + nrn_use_localgid_ = true; + + // allocate Allgather receive buffer (send is the nrnmpi_myid one) + int* rbuf = new int[nrnmpi_numprocs * (ngidmax + 1)]; + int* sbuf = new int[ngidmax + 1]; + + sbuf[0] = ngid; + ++sbuf; + ngid = 0; + // define the local gid and fill with the gids on this machine + for (gid2out_it = gid2out.begin(); gid2out_it != gid2out.end(); ++gid2out_it) { if (gid2out_it->second->output_index_ >= 0) { gid2out_it->second->localgid_ = (unsigned char)ngid; sbuf[ngid] = gid2out_it->second->output_index_; - ++ngid; - } - } - --sbuf; - - // exchange everything - nrnmpi_int_allgather(sbuf, rbuf, ngidmax+1); - delete [] sbuf; - errno = 0; - - // create the maps - // there is a lot of potential for efficiency here. i.e. use of - // perfect hash functions, or even simple Vectors. + ++ngid; + } + } + --sbuf; + + // exchange everything + nrnmpi_int_allgather(sbuf, rbuf, ngidmax + 1); + delete[] sbuf; + errno = 0; + + // create the maps + // there is a lot of potential for efficiency here. i.e. use of + // perfect hash functions, or even simple Vectors. 
localmaps.clear(); localmaps.resize(nrnmpi_numprocs); - // fill in the maps - for (i = 0; i < nrnmpi_numprocs; ++i) if (i != nrnmpi_myid) { - sbuf = rbuf + i*(ngidmax + 1); - ngid = *(sbuf++); - for (k=0; k < ngid; ++k) { - gid2in_it = gid2in.find(int(sbuf[k])); - if (gid2in_it != gid2in.end()) { - localmaps[i][k] = gid2in_it->second; - } - } - } + // fill in the maps + for (i = 0; i < nrnmpi_numprocs; ++i) + if (i != nrnmpi_myid) { + sbuf = rbuf + i * (ngidmax + 1); + ngid = *(sbuf++); + for (k = 0; k < ngid; ++k) { + gid2in_it = gid2in.find(int(sbuf[k])); + if (gid2in_it != gid2in.end()) { + localmaps[i][k] = gid2in_it->second; + } + } + } - // cleanup - delete [] rbuf; + // cleanup + delete[] rbuf; } -#endif // NRNMPI +#endif // NRNMPI // may stimulate a gid for a cell not owned by this cpu. This allows // us to run single cells or subnets and stimulate exactly according to @@ -624,260 +664,145 @@ static void mk_localgid_rep() { void nrn_fake_fire(int gid, double spiketime, int fake_out) { std::map::iterator gid2in_it; gid2in_it = gid2in.find(gid); - if (gid2in_it != gid2in.end()) - { + if (gid2in_it != gid2in.end()) { InputPreSyn* psi = gid2in_it->second; assert(psi); -//printf("nrn_fake_fire %d %g\n", gid, spiketime); + // printf("nrn_fake_fire %d %g\n", gid, spiketime); psi->send(spiketime, net_cvode_instance, nrn_threads); #if NRNSTAT ++nrecv_useful_; #endif - }else if (fake_out) - { + } else if (fake_out) { std::map::iterator gid2out_it; gid2out_it = gid2out.find(gid); - if (gid2out_it != gid2out.end()) - { - PreSyn* ps = gid2out_it->second; - assert(ps); -//printf("nrn_fake_fire fake_out %d %g\n", gid, spiketime); - ps->send(spiketime, net_cvode_instance, nrn_threads); + if (gid2out_it != gid2out.end()) { + PreSyn* ps = gid2out_it->second; + assert(ps); + // printf("nrn_fake_fire fake_out %d %g\n", gid, spiketime); + ps->send(spiketime, net_cvode_instance, nrn_threads); #if NRNSTAT - ++nrecv_useful_; + ++nrecv_useful_; #endif } } - } - -void netpar_tid_gid2ps_alloc(int nth) { - // nth is same as ngroup in nrn_setup.cpp, not necessarily nrn_nthread. - neg_gid2out.resize(nth); +static int timeout_ = 0; +int nrn_set_timeout(int timeout) { + int tt; + tt = timeout_; + timeout_ = timeout; + return tt; } -void netpar_tid_gid2ps_free() { - neg_gid2out.clear(); -} +void BBS_netpar_solve(double tstop) { + double time = nrn_wtime(); -void netpar_tid_gid2ps(int tid, int gid, PreSyn** ps, InputPreSyn** psi){ - /// for gid < 0 returns the PreSyn* in the thread (tid) specific map. 
- *ps = NULL; - *psi = NULL; - std::map::iterator gid2out_it; - if (gid >= 0) { - gid2out_it = gid2out.find(gid); - if (gid2out_it != gid2out.end()) { - *ps = gid2out_it->second; - }else{ - std::map::iterator gid2in_it; - gid2in_it = gid2in.find(gid); - if (gid2in_it != gid2in.end()) { - *psi = gid2in_it->second; +#if NRNMPI + double mt, md; + tstopunset; + mt = dt; + md = mindelay_ - 1e-10; + if (md < mt) { + if (nrnmpi_myid == 0) { + hoc_execerror("mindelay is 0", "(or less than dt for fixed step method)"); + } else { + return; } - } - }else{ - gid2out_it = neg_gid2out[tid].find(gid); - if (gid2out_it != neg_gid2out[tid].end()) { - *ps = gid2out_it->second; } - } -} - -void netpar_tid_set_gid2node(int tid, int gid, int nid, PreSyn* ps) { - if (gid >= 0) { - /// Allocate space for spikes: 200 structs of {int gid; double time} - /// coming from nrnmpi.h and array of int of the global domain size - alloc_space(); -#if NRNMPI - if (nid == nrnmpi_myid) { -#else - { -#endif - char m[200]; - if (gid2in.find(gid) != gid2in.end()) { - sprintf(m, "gid=%d already exists as an input port", gid); - hoc_execerror(m, "Setup all the output ports on this process before using them as input ports."); - } - if (gid2out.find(gid) != gid2out.end()) { - sprintf(m, "gid=%d already exists on this process as an output port", gid); - hoc_execerror(m, 0); - } - gid2out[gid] = ps; - ps->gid_ = gid; - ps->output_index_ = gid; - } - }else{ - nrn_assert(nid == nrnmpi_myid); - nrn_assert(neg_gid2out[tid].find(gid) == neg_gid2out[tid].end()); - neg_gid2out[tid][gid] = ps; - } -} - -void nrn_reset_gid2out(void) { - gid2out.clear(); -} - -void nrn_reset_gid2in(void) { - gid2in.clear(); -} - -static void alloc_space() { -#if NRNMPI - if (!spikeout_) { - ocapacity_ = 100; - spikeout_ = (NRNMPI_Spike*)emalloc(ocapacity_*sizeof(NRNMPI_Spike)); - icapacity_ = 100; - spikein_ = (NRNMPI_Spike*)malloc(icapacity_*sizeof(NRNMPI_Spike)); - nin_ = (int*)emalloc(nrnmpi_numprocs*sizeof(int)); -#if nrn_spikebuf_size > 0 -spbufout_ = (NRNMPI_Spikebuf*)emalloc(sizeof(NRNMPI_Spikebuf)); -spbufin_ = (NRNMPI_Spikebuf*)emalloc(nrnmpi_numprocs*sizeof(NRNMPI_Spikebuf)); -#endif + nrn_timeout(timeout_); + ncs2nrn_integrate(tstop * (1. + 1e-11)); + nrn_spike_exchange(nrn_threads); + nrn_timeout(0); + if (npe_) { + npe_[0].wx_ = npe_[0].ws_ = 0.; + }; + // printf("%d netpar_solve exit t=%g tstop=%g mindelay_=%g\n",nrnmpi_myid, t, tstop, mindelay_); + nrnmpi_barrier(); +#else // not NRNMPI + ncs2nrn_integrate(tstop); #endif - } -} - - -void nrn_cleanup_presyn(DiscreteEvent*) { - // for multi-send, need to cleanup the list of hosts here -} + tstopunset; - -int input_gid_register(int gid) { - alloc_space(); - if (gid2out.find(gid) != gid2out.end()) { - return 0; - }else if (gid2in.find(gid) != gid2in.end()) { - return 0; - } - gid2in[gid] = NULL; - return 1; + if (nrnmpi_myid == 0) { + printf("\nSolver Time : %g\n", nrn_wtime() - time); + } } -int input_gid_associate(int gid, InputPreSyn* psi) { - std::map::iterator gid2in_it; - gid2in_it = gid2in.find(gid); - if (gid2in_it != gid2in.end()) { - if (gid2in_it->second) { - return 0; - } - gid2in_it->second = psi; - psi->gid_ = gid; - return 1; - } - return 0; -} +double set_mindelay(double maxdelay) { + double mindelay = maxdelay; + last_maxstep_arg_ = maxdelay; + // if all==1 then minimum delay of all NetCon no matter the source. 
+ // except if src in same thread as NetCon + int all = (nrn_nthread > 1); + // minumum delay of all NetCon having an InputPreSyn source -static int timeout_ = 0; -int nrn_set_timeout(int timeout) { - int tt; - tt = timeout_; - timeout_ = timeout; - return tt; -} + /** we have removed nt_ from PreSyn. Build local map of PreSyn + * and NrnThread which will be used to find out if src in same thread as NetCon */ + std::map presynmap; -void BBS_netpar_solve(double tstop) { + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& nt = nrn_threads[ith]; + for (int i = 0; i < nt.n_presyn; ++i) { + presynmap[nt.presyns + i] = nrn_threads + ith; + } + } - double time = nrnmpi_wtime(); + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& nt = nrn_threads[ith]; + for (int i = 0; i < nt.n_netcon; ++i) { + NetCon* nc = nt.netcons + i; + int chk = 0; // ignore nc.delay_ + int gid = netcon_srcgid[ith][i]; + PreSyn* ps; + InputPreSyn* psi; + netpar_tid_gid2ps(ith, gid, &ps, &psi); + if (psi) { + chk = 1; + } else if (all) { + chk = 1; + // but ignore if src in same thread as NetCon + if (ps && presynmap[ps] == &nt) { + chk = 0; + } + } + if (chk && nc->delay_ < mindelay) { + mindelay = nc->delay_; + } + } + } #if NRNMPI - double mt, md; - tstopunset; - mt = dt ; md = mindelay_ - 1e-10; - if (md < mt) { - if (nrnmpi_myid == 0) { - hoc_execerror("mindelay is 0", "(or less than dt for fixed step method)"); - }else{ - return; - } - } - - nrn_timeout(timeout_); - ncs2nrn_integrate(tstop*(1.+1e-11)); - nrn_spike_exchange(nrn_threads); - nrn_timeout(0); - if (npe_) { - npe_[0].wx_ = npe_[0].ws_ = 0.; - }; -//printf("%d netpar_solve exit t=%g tstop=%g mindelay_=%g\n",nrnmpi_myid, t, tstop, mindelay_); -#else // not NRNMPI - ncs2nrn_integrate(tstop); -#endif - tstopunset; - - nrnmpi_barrier(); - if ( nrnmpi_myid == 0 ) { - printf( " Solver Time : %g\n", nrnmpi_wtime() - time ); + if (nrnmpi_use) { + active_ = 1; + } + if (use_compress_) { + if (mindelay / dt > 255) { + mindelay = 255 * dt; + } } -} - -static double set_mindelay(double maxdelay) { - double mindelay = maxdelay; - last_maxstep_arg_ = maxdelay; - - // if all==1 then minimum delay of all NetCon no matter the source. 
- // except if src in same thread as NetCon - int all = (nrn_nthread > 1); - // minumum delay of all NetCon having an InputPreSyn source - for (int ith = 0; ith < nrn_nthread; ++ith) { - NrnThread* nt = nrn_threads + ith; - for (int i = 0; i < nt->n_netcon; ++i) { - NetCon& nc = nt->netcons[i]; - int chk = 0; // ignore nc.delay_ - if (nc.src_ && nc.src_->type() == InputPreSynType) { - chk = 1; - }else if (all) { - chk = 1; - // but ignore if src in same thread as NetCon - if (nc.src_ && nc.src_->type() == PreSynType - && ((PreSyn*)nc.src_)->nt_ == nt) { - chk = 0; - } - } - if (chk && nc.delay_ < mindelay) { - mindelay = nc.delay_; - } - } - } -#if NRNMPI - if (nrnmpi_use) {active_ = 1;} - if (use_compress_) { - if (mindelay/dt > 255) { - mindelay = 255*dt; - } - } - -//printf("%d netpar_mindelay local %g now calling nrnmpi_mindelay\n", nrnmpi_myid, mindelay); -// double st = time(); - mindelay_ = nrnmpi_mindelay(mindelay); - min_interprocessor_delay_ = mindelay_; -// add_wait_time(st); -//printf("%d local min=%g global min=%g\n", nrnmpi_myid, mindelay, mindelay_); - errno = 0; - return mindelay; + // printf("%d netpar_mindelay local %g now calling nrnmpi_mindelay\n", nrnmpi_myid, mindelay); + // double st = time(); + mindelay_ = nrnmpi_mindelay(mindelay); + // add_wait_time(st); + // printf("%d local min=%g global min=%g\n", nrnmpi_myid, mindelay, mindelay_); + errno = 0; + return mindelay; #else - mindelay_ = mindelay; - min_interprocessor_delay_ = mindelay_; - return mindelay; -#endif //NRNMPI -} - -double BBS_netpar_mindelay(double maxdelay) { - double tt = set_mindelay(maxdelay); - return tt; + mindelay_ = mindelay; + return mindelay; +#endif // NRNMPI } void BBS_netpar_spanning_statistics(int* nsend, int* nsendmax, int* nrecv, int* nrecv_useful) { #if NRNMPI - *nsend = nsend_; - *nsendmax = nsendmax_; - *nrecv = nrecv_; - *nrecv_useful = nrecv_useful_; + *nsend = nsend_; + *nsendmax = nsendmax_; + *nrecv = nrecv_; + *nrecv_useful = nrecv_useful_; #endif } @@ -918,62 +843,54 @@ two phase multisend distributes the injection. int nrnmpi_spike_compress(int nspike, bool gid_compress, int xchng_meth) { #if NRNMPI - if (nrnmpi_numprocs < 2) { return 0; } - assert(xchng_meth == 0); - if (nspike >= 0) { - ag_send_nspike_ = 0; - if (spfixout_) { free(spfixout_); spfixout_ = 0; } - if (spfixin_) { free(spfixin_); spfixin_ = 0; } - if (spfixin_ovfl_) { free(spfixin_ovfl_); spfixin_ovfl_ = 0; } + if (nrnmpi_numprocs < 2) { + return 0; + } + nrn_assert(xchng_meth == 0); + if (nspike >= 0) { + ag_send_nspike_ = 0; + if (spfixout_) { + free(spfixout_); + spfixout_ = 0; + } + if (spfixin_) { + free(spfixin_); + spfixin_ = 0; + } + if (spfixin_ovfl_) { + free(spfixin_ovfl_); + spfixin_ovfl_ = 0; + } localmaps.clear(); - } - if (nspike == 0) { // turn off - use_compress_ = false; - nrn_use_localgid_ = false; - }else if (nspike > 0) { // turn on - use_compress_ = true; - ag_send_nspike_ = nspike; - nrn_use_localgid_ = false; - if (gid_compress) { - // we can only do this after everything is set up - mk_localgid_rep(); - if (!nrn_use_localgid_ && nrnmpi_myid == 0) { -printf("Notice: gid compression did not succeed. 
Probably more than 255 cells on one cpu.\n"); - } - } - if (!nrn_use_localgid_) { - localgid_size_ = sizeof(unsigned int); - } - ag_send_size_ = 2 + ag_send_nspike_*(1 + localgid_size_); - spfixout_capacity_ = ag_send_size_ + 50*(1 + localgid_size_); - spfixout_ = (unsigned char*)emalloc(spfixout_capacity_); - spfixin_ = (unsigned char*)emalloc(nrnmpi_numprocs*ag_send_size_); - ovfl_capacity_ = 100; - spfixin_ovfl_ = (unsigned char*)emalloc(ovfl_capacity_*(1 + localgid_size_)); - } - return ag_send_nspike_; + } + if (nspike == 0) { // turn off + use_compress_ = false; + nrn_use_localgid_ = false; + } else if (nspike > 0) { // turn on + use_compress_ = true; + ag_send_nspike_ = nspike; + nrn_use_localgid_ = false; + if (gid_compress) { + // we can only do this after everything is set up + mk_localgid_rep(); + if (!nrn_use_localgid_ && nrnmpi_myid == 0) { + printf( + "Notice: gid compression did not succeed. Probably more than 255 cells on one " + "cpu.\n"); + } + } + if (!nrn_use_localgid_) { + localgid_size_ = sizeof(unsigned int); + } + ag_send_size_ = 2 + ag_send_nspike_ * (1 + localgid_size_); + spfixout_capacity_ = ag_send_size_ + 50 * (1 + localgid_size_); + spfixout_ = (unsigned char*)emalloc(spfixout_capacity_); + spfixin_ = (unsigned char*)emalloc(nrnmpi_numprocs * ag_send_size_); + ovfl_capacity_ = 100; + spfixin_ovfl_ = (unsigned char*)emalloc(ovfl_capacity_ * (1 + localgid_size_)); + } + return ag_send_nspike_; #else - return 0; -#endif -} - - -/// Approximate count of number of bytes for the gid2out map -size_t output_presyn_size(void) { - if (gid2out.empty()) { return 0; } - size_t nbyte = sizeof(gid2out) + sizeof(int)*gid2out.size() + sizeof(PreSyn*)*gid2out.size(); -#ifdef DEBUG - printf(" gid2out table bytes=~%ld size=%d\n", nbyte, gid2out.size()); + return 0; #endif - return nbyte; } - -size_t input_presyn_size(void) { - if (gid2in.empty()) { return 0; } - size_t nbyte = sizeof(gid2in) + sizeof(int)*gid2in.size() + sizeof(InputPreSyn*)*gid2in.size(); -#ifdef DEBUG - printf(" gid2in table bytes=~%ld size=%d\n", nbyte, gid2in->size()); -#endif - return nbyte; -} - diff --git a/coreneuron/nrniv/node_permute.cpp b/coreneuron/nrniv/node_permute.cpp new file mode 100644 index 000000000..fed731017 --- /dev/null +++ b/coreneuron/nrniv/node_permute.cpp @@ -0,0 +1,310 @@ +/* +Permute nodes. + +To make gaussian elimination on gpu more efficient. + +Permutation vector p[i] applied to a data vector, moves the data_original[i] +to data[p[i]]. +That suffices for node properties such as area[i], a[i], b[i]. e.g. + area[p[i]] <- area_original[i] + +Notice that p on the left side is a forward permutation. On the right side +it serves as the inverse permutation. +area_original[i] <- area_permuted[p[i]] + +but things +get a bit more complicated when the data is an integer index into the +original data. + +For example: + +parent[i] needs to be transformed so that +parent[p[i]] <- p[parent_original[i]] except that if parent_original[j] = -1 + then parent[p[j]] = -1 + +membrane mechanism nodelist ( a subset of nodes) needs to be at least +minimally transformed so that +nodelist_new[k] <- p[nodelist_original[k]] +This does not affect the order of the membrane mechanism property data. + +However, computation is more efficient to permute (sort) nodelist_new so that +it follows as much as possible the permuted node ordering, ie in increasing +node order. 
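For a small illustrative example (three nodes, values invented): with
p = {2, 0, 1}, i.e. original node i moves to position p[i],
  area_original   = {a0, a1, a2}   becomes   area   = {a1, a2, a0}
  parent_original = {-1,  0,  1}   becomes   parent = { 2,  0, -1}
since parent[p[1]] = p[parent_original[1]] = p[0] = 2, parent[p[2]] = p[1] = 0,
and the root entry (parent_original[0] = -1) stays -1 at its new position p[0] = 2.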
Consider this further mechanism specific nodelist permutation, +which is to be applied to the above nodelist_new, to be p_m, which has the same +size as nodelist. ie. +nodelist[p_m[k]] <- nodelist_new[k]. + +Notice the similarity to the parent case... +nodelist[p_m[k]] = p[nodelist_original[k]] + +and now the membrane mechanism node data, does need to be permuted to have an +order consistent with the new nodelist. Since there are nm instances of the +mechanism each with sz data values (consider AoS layout). +The data permutation is +for k=[0:nm] for isz=[0:sz] + data_m[p_m[k]*sz + isz] = data_m_original[k*sz + isz] + +For an SoA layout the indexing is k + isz*nm (where nm may include padding). + +A more complicated case is a mechanisms dparam array (nm instances each with +dsz values) Some of those values are indices into another mechanism (eg +pointers to ion properties) or voltage or area depending on the semantics of +the value. We can use the above data_m permutation but then need to update +the values according to the permutation of the object the value indexes into. +Consider the permutation of the target object to be p_t . Then a value +iold = pdata_m(k, isz) - data_t in AoS format +refers to k_t = iold % sz_t and isz_t = iold - k_t*sz_t +and for a target in SoA format isz_t = iold % nm_t and k_t = iold - isz_t*nm_t +ie k_t_new = p_m_t[k_t] so, for AoS, inew = k_t_new*sz_t + isz_t +or , for SoA, inew = k_t_new + isz_t*nm_t +so pdata_m(k, isz) = inew + data_t + + +*/ + +#include +#include +#include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrniv/node_permute.h" +#include "coreneuron/nrnoc/nrnoc_decl.h" +#include "coreneuron/nrniv/nrniv_decl.h" +#include "coreneuron/nrniv/nrn_assert.h" +#include +#include +#include + +template +void permute(T* data, int cnt, int sz, int layout, int* p) { + // data(p[icnt], isz) <- data(icnt, isz) + // this does not change data, merely permutes it. + // assert len(p) == cnt + if (!p) { + return; + } + int n = cnt * sz; + if (n < 1) { + return; + } + + if (layout == 0) { // for SoA, n might be larger due to cnt padding + n = nrn_soa_padded_size(cnt, layout) * sz; + } + + T* data_orig = new T[n]; + for (int i = 0; i < n; ++i) { + data_orig[i] = data[i]; + } + + for (int icnt = 0; icnt < cnt; ++icnt) { + for (int isz = 0; isz < sz; ++isz) { + // note that when layout==0, nrn_i_layout takes into account SoA padding. 
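            // i  = flat index of (instance icnt, variable isz) under the original ordering;
            // ip = flat index of that same variable once the instance moves to p[icnt].
            // The assignment below therefore scatters each original value to its permuted slot.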
+ int i = nrn_i_layout(icnt, cnt, isz, sz, layout); + int ip = nrn_i_layout(p[icnt], cnt, isz, sz, layout); + data[ip] = data_orig[i]; + } + } + + delete[] data_orig; +} + +static void invert_permute(int* p, int n) { + int* newp = new int[n]; + for (int i = 0; i < n; ++i) { + newp[p[i]] = i; + } + for (int i = 0; i < n; ++i) { + p[i] = newp[i]; + } + delete[] newp; +} + +void update_pdata_values(Memb_list* ml, int type, NrnThread& nt) { + // assumes AoS to SoA transformation already made since we are using + // nrn_i_layout to determine indices into both ml->pdata and into target data + int psz = nrn_prop_dparam_size_[type]; + if (psz == 0) { + return; + } + if (nrn_is_artificial_[type]) { + return; + } + int* semantics = memb_func[type].dparam_semantics; + if (!semantics) { + return; + } + int* pdata = ml->pdata; + int layout = nrn_mech_data_layout_[type]; + int cnt = ml->nodecount; + // ml padding does not matter (but target padding does matter) + + // interesting semantics are -1 (area), -5 (pointer), or 0-999 (ion variables) + for (int i = 0; i < psz; ++i) { + int s = semantics[i]; + if (s == -1) { // area + int area0 = nt._actual_area - nt._data; // includes padding if relevant + int* p_target = nt._permute; + for (int iml = 0; iml < cnt; ++iml) { + int* pd = pdata + nrn_i_layout(iml, cnt, i, psz, layout); + // *pd is the original integer into nt._data . Needs to be replaced + // by the permuted value + + // This is ok whether or not area changed by padding? + // since old *pd updated appropriately by earlier AoS to SoA + // transformation + int ix = *pd - area0; // original integer into area array. + nrn_assert((ix >= 0) && (ix < nt.end)); + int ixnew = p_target[ix]; + *pd = ixnew + area0; + } + } else if (s == -5) { // assume pointer to membrane voltage + int v0 = nt._actual_v - nt._data; + // same as for area semantics + int* p_target = nt._permute; + for (int iml = 0; iml < cnt; ++iml) { + int* pd = pdata + nrn_i_layout(iml, cnt, i, psz, layout); + int ix = *pd - v0; // original integer into area array. 
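                // (here *pd indexes the voltage array: v0 is the offset of _actual_v in
                // nt._data, and ix is remapped through nt._permute just as in the area case)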
+ nrn_assert((ix >= 0) && (ix < nt.end)); + int ixnew = p_target[ix]; + *pd = ixnew + v0; + } + } else if (s >= 0 && s < 1000) { // ion + int etype = s; + int elayout = nrn_mech_data_layout_[etype]; + Memb_list* eml = nt._ml_list[etype]; + int edata0 = eml->data - nt._data; + int ecnt = eml->nodecount; + int esz = nrn_prop_param_size_[etype]; + int* p_target = eml->_permute; + for (int iml = 0; iml < cnt; ++iml) { + int* pd = pdata + nrn_i_layout(iml, cnt, i, psz, layout); + int ix = *pd - edata0; + // from ix determine i_ecnt and i_esz (need to permute i_ecnt) + int i_ecnt, i_esz, padded_ecnt; + if (elayout == 1) { // AoS + padded_ecnt = ecnt; + i_ecnt = ix / esz; + i_esz = ix % esz; + } else { // SoA + assert(elayout == 0); + padded_ecnt = nrn_soa_padded_size(ecnt, elayout); + i_ecnt = ix % padded_ecnt; + i_esz = ix / padded_ecnt; + } + int i_ecnt_new = p_target[i_ecnt]; + int ix_new = nrn_i_layout(i_ecnt_new, ecnt, i_esz, esz, elayout); + *pd = ix_new + edata0; + } + } + } +} + +void node_permute(int* vec, int n, int* permute) { + for (int i = 0; i < n; ++i) { + if (vec[i] >= 0) { + vec[i] = permute[vec[i]]; + } + } +} + +void permute_ptr(int* vec, int n, int* p) { + permute(vec, n, 1, 1, p); +} + +void permute_data(double* vec, int n, int* p) { + permute(vec, n, 1, 1, p); +} + +void permute_ml(Memb_list* ml, int type, NrnThread& nt) { + int sz = nrn_prop_param_size_[type]; + int psz = nrn_prop_dparam_size_[type]; + int layout = nrn_mech_data_layout_[type]; + permute(ml->data, ml->nodecount, sz, layout, ml->_permute); + permute(ml->pdata, ml->nodecount, psz, layout, ml->_permute); + + update_pdata_values(ml, type, nt); +} + +int nrn_index_permute(int ix, int type, Memb_list* ml) { + int* p = ml->_permute; + if (!p) { + return ix; + } + int layout = nrn_mech_data_layout_[type]; + if (layout == 1) { + int sz = nrn_prop_param_size_[type]; + int i_cnt = ix / sz; + int i_sz = ix % sz; + return p[i_cnt] * sz + i_sz; + } else { + assert(layout == 0); + int padded_cnt = nrn_soa_padded_size(ml->nodecount, layout); + int i_cnt = ix % padded_cnt; + int i_sz = ix / padded_cnt; + return i_sz * padded_cnt + p[i_cnt]; + } +} + +#if 0 +static void pr(const char* s, int* x, int n) { + printf("%s:", s); + for (int i=0; i < n; ++i) { + printf(" %d %d", i, x[i]); + } + printf("\n"); +} + +static void pr(const char* s, double* x, int n) { + printf("%s:", s); + for (int i=0; i < n; ++i) { + printf(" %d %g", i, x[i]); + } + printf("\n"); +} +#endif + +// note that sort_indices has the sense of an inverse permutation in that +// the value of sort_indices[0] is the index with the smallest value in the +// indices array + +static bool nrn_index_sort_cmp(const std::pair& a, const std::pair& b) { + bool result = false; + if (a.first < b.first) { + result = true; + } else if (a.first == b.first) { + if (a.second < b.second) { + result = true; + } + } + return result; +} + +int* nrn_index_sort(int* values, int n) { + std::vector > vi(n); + for (int i = 0; i < n; ++i) { + vi[i].first = values[i]; + vi[i].second = i; + } + std::sort(vi.begin(), vi.end(), nrn_index_sort_cmp); + int* sort_indices = new int[n]; + for (int i = 0; i < n; ++i) { + sort_indices[i] = vi[i].second; + } + return sort_indices; +} + +void permute_nodeindices(Memb_list* ml, int* p) { + // nodeindices values are permuted according to p (that per se does + // not affect vec). + + node_permute(ml->nodeindices, ml->nodecount, p); + + // Then the new node indices are sorted by + // increasing index. 
Instances using the same node stay in same + // original relative order so that their contributions to rhs, d (if any) + // remain in same order (except for gpu parallelism). + // That becomes ml->_permute + + ml->_permute = nrn_index_sort(ml->nodeindices, ml->nodecount); + invert_permute(ml->_permute, ml->nodecount); + permute_ptr(ml->nodeindices, ml->nodecount, ml->_permute); +} diff --git a/coreneuron/nrniv/node_permute.h b/coreneuron/nrniv/node_permute.h new file mode 100644 index 000000000..01db957b3 --- /dev/null +++ b/coreneuron/nrniv/node_permute.h @@ -0,0 +1,17 @@ +#ifndef node_permute_h +#define node_permute_h + +// determine ml->_permute and permute the ml->nodeindices accordingly +void permute_nodeindices(Memb_list* ml, int* permute); + +// vec values >= 0 updated according to permutation +void node_permute(int* vec, int n, int* permute); + +// moves values to new location but does not change those values +void permute_ptr(int* vec, int n, int* permute); + +void permute_data(double* vec, int n, int* permute); +void permute_ml(Memb_list* ml, int type, NrnThread& nt); +int nrn_index_permute(int, int type, Memb_list* ml); + +#endif diff --git a/coreneuron/nrniv/nrn_acc_manager.cpp b/coreneuron/nrniv/nrn_acc_manager.cpp new file mode 100644 index 000000000..f061268e9 --- /dev/null +++ b/coreneuron/nrniv/nrn_acc_manager.cpp @@ -0,0 +1,982 @@ +#include +#include + +#include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrniv/netcon.h" +#include "coreneuron/nrniv/nrn_acc_manager.h" +#include "coreneuron/nrniv/nrniv_decl.h" +#include "coreneuron/nrniv/vrecitem.h" +#include "coreneuron/nrniv/profiler_interface.h" +#include "coreneuron/nrniv/cellorder.h" +#include "coreneuron/nrniv/cuda_profile.h" +#include "coreneuron/scopmath_core/newton_struct.h" + +#ifdef _OPENACC +#include +#endif + +#ifdef CRAYPAT +#include +#endif + +extern InterleaveInfo* interleave_info; +void copy_ivoc_vect_to_device(IvocVect*& iv, IvocVect*& div); + +/* note: threads here are corresponding to global nrn_threads array */ +void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { +#ifdef _OPENACC + + if (nthreads <= 0) { + printf("\n Warning: No threads to copy on GPU! "); + return; + } + + /** @todo: currently only checking nvidia gpu */ + int num_gpus = acc_get_num_devices(acc_device_nvidia); + if (num_gpus == 0) { + printf("\n WARNING: Enabled GPU execution but couldn't find NVIDIA GPU! \n"); + } + + int i; + NrnThread* d_threads; + + /* @todo: why dt is not setup at this moment? */ + for (i = 0; i < nthreads; i++) { + (threads + i)->_dt = dt; + /* this thread will be computed on GPU */ + (threads + i)->compute_gpu = 1; + } + + /* -- copy NrnThread to device. this needs to be contigious vector because offset is used to + * find + * corresponding NrnThread using Point_process in NET_RECEIVE block + */ + d_threads = (NrnThread*)acc_copyin(threads, sizeof(NrnThread) * nthreads); + + printf("\n --- Copying to Device! --- "); + + if (interleave_info == NULL) { + printf("\n Warning: No permutation data? 
Required for linear algebra!"); + } + + /* pointers for data struct on device, starting with d_ */ + + for (i = 0; i < nthreads; i++) { + NrnThread* nt = threads + i; // NrnThread on host + NrnThread* d_nt = d_threads + i; // NrnThread on device + + if (nt->end <= 0) { + // this is an empty thread + continue; + } + + double* d__data; // nrn_threads->_data on device + + printf("\n -----------COPYING %d'th NrnThread TO DEVICE --------------- \n", i); +#if defined(CUDA_PROFILING) + print_gpu_memory_usage(); +#endif + + /* -- copy _data to device -- */ + + /*copy all double data for thread */ + d__data = (double*)acc_copyin(nt->_data, nt->_ndata * sizeof(double)); + + /* Here is the example of using OpenACC data enter/exit + * Remember that we are not allowed to use nt->_data but we have to use: + * double *dtmp = nt->_data; // now use dtmp! + #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id) + #pragma acc wait(nt->stream_id) + */ + + /*update d_nt._data to point to device copy */ + acc_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + + /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ + double* dptr; + + /* for padding, we have to recompute ne */ + int ne = nrn_soa_padded_size(nt->end, 0); + + dptr = d__data + 0 * ne; + acc_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + + dptr = d__data + 1 * ne; + acc_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + + dptr = d__data + 2 * ne; + acc_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + + dptr = d__data + 3 * ne; + acc_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + + dptr = d__data + 4 * ne; + acc_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + + dptr = d__data + 5 * ne; + acc_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + + int* d_v_parent_index = (int*)acc_copyin(nt->_v_parent_index, nt->end * sizeof(int)); + acc_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + + /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ + Memb_list** d_ml_list = + (Memb_list**)acc_copyin(nt->_ml_list, n_memb_func * sizeof(Memb_list*)); + acc_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + + /* -- copy NrnThreadMembList list ml to device -- */ + + NrnThreadMembList* tml; + NrnThreadMembList* d_tml; + NrnThreadMembList* d_last_tml; + + Memb_list* d_ml; + int first_tml = 1; + size_t offset = 6 * ne; + + for (tml = nt->tml; tml; tml = tml->next) { + /*copy tml to device*/ + /*QUESTIONS: does tml will point to NULL as in host ? : I assume so!*/ + d_tml = (NrnThreadMembList*)acc_copyin(tml, sizeof(NrnThreadMembList)); + + /*first tml is pointed by nt */ + if (first_tml) { + acc_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + first_tml = 0; + } else { + /*rest of tml forms linked list */ + acc_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + } + + // book keeping for linked-list + d_last_tml = d_tml; + + /* now for every tml, there is a ml. 
copy that and setup pointer */ + d_ml = (Memb_list*)acc_copyin(tml->ml, sizeof(Memb_list)); + acc_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); + + /* setup nt._ml_list */ + acc_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); + + int type = tml->index; + int n = tml->ml->nodecount; + int szp = nrn_prop_param_size_[type]; + int szdp = nrn_prop_dparam_size_[type]; + int is_art = nrn_is_artificial_[type]; + int layout = nrn_mech_data_layout_[type]; + + offset = nrn_soa_padded_size(offset, layout); + + dptr = d__data + offset; + + acc_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); + + offset += nrn_soa_padded_size(n, layout) * szp; + + if (!is_art) { + int* d_nodeindices = (int*)acc_copyin(tml->ml->nodeindices, sizeof(int) * n); + acc_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); + } + + if (szdp) { + int pcnt = nrn_soa_padded_size(n, layout) * szdp; + int* d_pdata = (int*)acc_copyin(tml->ml->pdata, sizeof(int) * pcnt); + acc_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); + } + + int ts = memb_func[type].thread_size_; + if (ts) { + ThreadDatum* td = + (ThreadDatum*)acc_copyin(tml->ml->_thread, ts * sizeof(ThreadDatum)); + acc_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); + } + + NetReceiveBuffer_t *nrb, *d_nrb; + int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; + double *d_nrb_t, *d_nrb_flag; + + // net_receive buffer associated with mechanism + nrb = tml->ml->_net_receive_buffer; + + // if net receive buffer exist for mechanism + if (nrb) { + d_nrb = (NetReceiveBuffer_t*)acc_copyin(nrb, sizeof(NetReceiveBuffer_t)); + acc_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb, + sizeof(NetReceiveBuffer_t*)); + + d_pnt_index = (int*)acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + + d_weight_index = (int*)acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + + d_nrb_t = (double*)acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + + d_nrb_flag = (double*)acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + + d_displ = (int*)acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + + d_nrb_index = (int*)acc_copyin(nrb->_nrb_index, sizeof(int) * (nrb->_size + 1)); + acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + } + + /* copy NetSendBuffer_t on to GPU */ + NetSendBuffer_t* nsb; + nsb = tml->ml->_net_send_buffer; + + if (nsb) { + NetSendBuffer_t* d_nsb; + int* d_iptr; + double* d_dptr; + + d_nsb = (NetSendBuffer_t*)acc_copyin(nsb, sizeof(NetSendBuffer_t)); + acc_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); + + d_iptr = (int*)acc_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); + acc_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); + + d_iptr = (int*)acc_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); + acc_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); + + d_iptr = (int*)acc_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); + acc_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); + + d_iptr = (int*)acc_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); + 
acc_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); + + d_dptr = (double*)acc_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); + acc_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); + + d_dptr = (double*)acc_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); + acc_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); + } + } + + if (nt->shadow_rhs_cnt) { + double* d_shadow_ptr; + + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + + /* copy shadow_rhs to device and fix-up the pointer */ + d_shadow_ptr = (double*)acc_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); + acc_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + + /* copy shadow_d to device and fix-up the pointer */ + d_shadow_ptr = (double*)acc_copyin(nt->_shadow_d, pcnt * sizeof(double)); + acc_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + } + + if (nt->n_pntproc) { + /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU + */ + Point_process* pntptr = + (Point_process*)acc_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + acc_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); + } + + if (nt->n_weight) { + /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ + double* d_weights = (double*)acc_copyin(nt->weights, sizeof(double) * nt->n_weight); + acc_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + } + + if (nt->_nvdata) { + /* copy vdata which is setup in bbcore_read. This contains cuda allocated + * nrnran123_State * */ + void** d_vdata = (void**)acc_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); + acc_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + } + + if (nt->n_presyn) { + /* copy presyn vector used for spike exchange, note we have added new PreSynHelper due + * to issue + * while updating PreSyn objects which has virtual base class. May be this is issue due + * to + * VTable and alignment */ + PreSynHelper* d_presyns_helper = + (PreSynHelper*)acc_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + acc_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); + PreSyn* d_presyns = (PreSyn*)acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + acc_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + } + + if (nt->_net_send_buffer_size) { + /* copy send_receive buffer */ + int* d_net_send_buffer = + (int*)acc_copyin(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + acc_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + } + + if (nt->n_vecplay) { + /* copy VecPlayContinuous instances */ + + printf("\n Warning: VectorPlay used but NOT implemented on GPU! 
"); + + /** just empty containers */ + void** d_vecplay = (void**)acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + acc_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + + for (int i = 0; i < nt->n_vecplay; i++) { + VecPlayContinuous* vecplay_instance = (VecPlayContinuous*)nt->_vecplay[i]; + + /** just VecPlayContinuous object */ + void* d_p = (void*)acc_copyin(vecplay_instance, sizeof(VecPlayContinuous)); + acc_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); + + VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*)d_p; + + /** copy y_, t_ and discon_indices_ */ + copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_); + copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); + copy_ivoc_vect_to_device(vecplay_instance->discon_indices_, + d_vecplay_instance->discon_indices_); + + /** copy PlayRecordEvent : todo: verify this */ + PlayRecordEvent* d_e_ = + (PlayRecordEvent*)acc_copyin(vecplay_instance->e_, sizeof(PlayRecordEvent)); + acc_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, + sizeof(VecPlayContinuous*)); + acc_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + + /** copy pd_ : note that it's pointer inside ml->data and hence data itself is + * already on GPU */ + double* d_pd_ = (double*)acc_deviceptr(vecplay_instance->pd_); + acc_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + } + } + + if (nt->_permute) { + if (use_interleave_permute == 1) { + /* todo: not necessary to setup pointers, just copy it */ + InterleaveInfo* info = interleave_info + i; + InterleaveInfo* d_info = (InterleaveInfo*)acc_copyin(info, sizeof(InterleaveInfo)); + int* d_ptr = NULL; + + d_ptr = (int*)acc_copyin(info->stride, sizeof(int) * (info->nstride + 1)); + acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + + d_ptr = (int*)acc_copyin(info->firstnode, sizeof(int) * nt->ncell); + acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + + d_ptr = (int*)acc_copyin(info->lastnode, sizeof(int) * nt->ncell); + acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + + d_ptr = (int*)acc_copyin(info->cellsize, sizeof(int) * nt->ncell); + acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + + } else if (use_interleave_permute == 2) { + /* todo: not necessary to setup pointers, just copy it */ + InterleaveInfo* info = interleave_info + i; + InterleaveInfo* d_info = (InterleaveInfo*)acc_copyin(info, sizeof(InterleaveInfo)); + int* d_ptr = NULL; + + d_ptr = (int*)acc_copyin(info->stride, sizeof(int) * info->nstride); + acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + + d_ptr = (int*)acc_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); + acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + + d_ptr = (int*)acc_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); + acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + + d_ptr = (int*)acc_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); + acc_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + + d_ptr = (int*)acc_copyin(info->cellsize, sizeof(int) * info->nwarp); + acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + } else { + printf("\n ERROR: only --cell_permute = [12] implemented"); + abort(); + } + } else { + printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i); + } + + printf("\n Compute thread on GPU? : %s, Stream : %d\n", (nt->compute_gpu) ? 
"Yes" : "No", + nt->stream_id); + } + + if (nrn_ion_global_map_size) { + double** d_data = + (double**)acc_copyin(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + for (int j = 0; j < nrn_ion_global_map_size; j++) { + if (nrn_ion_global_map[j]) { + /* @todo: fix this constant size 3 :( */ + double* d_mechmap = (double*)acc_copyin(nrn_ion_global_map[j], 3 * sizeof(double)); + acc_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + } + } + } +#else + (void)threads; + (void)nthreads; +#endif +} + +void copy_ivoc_vect_to_device(IvocVect*& iv, IvocVect*& div) { +#ifdef _OPENACC + if (iv) { + IvocVect* d_iv = (IvocVect*)acc_copyin(iv, sizeof(IvocVect)); + acc_memcpy_to_device(&div, &d_iv, sizeof(IvocVect*)); + + size_t n = iv->size(); + if (n) { + double* d_data = (double*)acc_copyin(iv->data(), sizeof(double) * n); + acc_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); + } + } +#else + (void)iv; + (void)div; +#endif +} + +void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { + NetReceiveBuffer_t* nrb = ml->_net_receive_buffer; + if (!nrb) { + return; + } + +#ifdef _OPENACC + if (nt->compute_gpu) { + // free existing vectors in buffers on gpu + acc_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); + acc_delete(nrb->_weight_index, nrb->_size * sizeof(int)); + acc_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); + acc_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); + acc_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); + acc_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + } +#endif + + // Reallocate host + nrb->_size *= 2; + nrb->_pnt_index = (int*)erealloc(nrb->_pnt_index, nrb->_size * sizeof(int)); + nrb->_weight_index = (int*)erealloc(nrb->_weight_index, nrb->_size * sizeof(int)); + nrb->_nrb_t = (double*)erealloc(nrb->_nrb_t, nrb->_size * sizeof(double)); + nrb->_nrb_flag = (double*)erealloc(nrb->_nrb_flag, nrb->_size * sizeof(double)); + nrb->_displ = (int*)erealloc(nrb->_displ, (nrb->_size + 1) * sizeof(int)); + nrb->_nrb_index = (int*)erealloc(nrb->_nrb_index, nrb->_size * sizeof(int)); + +#ifdef _OPENACC + if (nt->compute_gpu) { + int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; + double *d_nrb_t, *d_nrb_flag; + + // update device copy + acc_update_device(nrb, sizeof(NetReceiveBuffer_t)); + + NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*)acc_deviceptr(nrb); + + // recopy the vectors in the buffer + d_pnt_index = (int*)acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + + d_weight_index = (int*)acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + + d_nrb_t = (double*)acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + + d_nrb_flag = (double*)acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + + d_displ = (int*)acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + + d_nrb_index = (int*)acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); + acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + } +#endif +} + +typedef std::pair NRB_P; + +struct comp { + bool operator()(const NRB_P& a, const NRB_P& b) { + if (a.first == b.first) { + return a.second > b.second; // same instances in original net_receive order + } + 
return a.first > b.first; + } +}; + +static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) { + if (nrb->_cnt == 0) { + nrb->_displ_cnt = 0; + return; + } + + std::priority_queue, comp> nrbq; + + for (int i = 0; i < nrb->_cnt; ++i) { + nrbq.push(NRB_P(nrb->_pnt_index[i], i)); + } + + int displ_cnt = 0; + int index_cnt = 0; + int last_instance_index = -1; + nrb->_displ[0] = 0; + + while (!nrbq.empty()) { + const NRB_P& p = nrbq.top(); + nrb->_nrb_index[index_cnt++] = p.second; + if (p.first != last_instance_index) { + ++displ_cnt; + } + nrb->_displ[displ_cnt] = index_cnt; + last_instance_index = p.first; + nrbq.pop(); + } + nrb->_displ_cnt = displ_cnt; +} + +/* when we execute NET_RECEIVE block on GPU, we provide the index of synapse instances + * which we need to execute during the current timestep. In order to do this, we have + * update NetReceiveBuffer_t object to GPU. When size of cpu buffer changes, we set + * reallocated to true and hence need to reallocate buffer on GPU and then need to copy + * entire buffer. If reallocated is 0, that means buffer size is not changed and hence + * only need to copy _size elements to GPU. + * Note: this is very preliminary implementation, optimisations will be done after first + * functional version. + */ +void update_net_receive_buffer(NrnThread* nt) { + NrnThreadMembList* tml; + + for (tml = nt->tml; tml; tml = tml->next) { + // net_receive buffer to copy + NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer; + + // if net receive buffer exist for mechanism + if (nrb && nrb->_cnt) { + // instance order to avoid race. setup _displ and _nrb_index + net_receive_buffer_order(nrb); + +#ifdef _OPENACC + if (nt->compute_gpu) { + // note that dont update nrb otherwise we loose pointers + + /* update scalar elements */ + acc_update_device(&nrb->_cnt, sizeof(int)); + acc_update_device(&nrb->_displ_cnt, sizeof(int)); + + acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt); + acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt); + acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt); + acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt); + acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1)); + acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt); + } +#endif + } + } +} + +void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { +#ifdef _OPENACC + if (!nt->compute_gpu) + return; + + if (nsb->_cnt) { + acc_update_self(nsb->_sendtype, sizeof(int) * nsb->_cnt); + acc_update_self(nsb->_vdata_index, sizeof(int) * nsb->_cnt); + acc_update_self(nsb->_pnt_index, sizeof(int) * nsb->_cnt); + acc_update_self(nsb->_weight_index, sizeof(int) * nsb->_cnt); + acc_update_self(nsb->_nsb_t, sizeof(double) * nsb->_cnt); + acc_update_self(nsb->_nsb_flag, sizeof(double) * nsb->_cnt); + } +#else + (void)nt; + (void)nsb; +#endif +} + +void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { +#ifdef _OPENACC + + printf("\n --- Copying to Host! 
--- \n"); + + int i; + NetReceiveBuffer_t* nrb; + + for (i = 0; i < nthreads; i++) { + NrnThread* nt = threads + i; + + if (nt->compute_gpu && (nt->end > 0)) { + /* -- copy data to host -- */ + + int ne = nrn_soa_padded_size(nt->end, 0); + + acc_update_self(nt->_actual_rhs, ne * sizeof(double)); + acc_update_self(nt->_actual_d, ne * sizeof(double)); + acc_update_self(nt->_actual_a, ne * sizeof(double)); + acc_update_self(nt->_actual_b, ne * sizeof(double)); + acc_update_self(nt->_actual_v, ne * sizeof(double)); + acc_update_self(nt->_actual_area, ne * sizeof(double)); + + /* @todo: nt._ml_list[tml->index] = tml->ml; */ + + /* -- copy NrnThreadMembList list ml to host -- */ + NrnThreadMembList* tml; + for (tml = nt->tml; tml; tml = tml->next) { + Memb_list* ml = tml->ml; + + acc_update_self(&tml->index, sizeof(int)); + acc_update_self(&ml->nodecount, sizeof(int)); + + int type = tml->index; + int n = ml->nodecount; + int szp = nrn_prop_param_size_[type]; + int szdp = nrn_prop_dparam_size_[type]; + int is_art = nrn_is_artificial_[type]; + int layout = nrn_mech_data_layout_[type]; + + int pcnt = nrn_soa_padded_size(n, layout) * szp; + + acc_update_self(ml->data, pcnt * sizeof(double)); + + if (!is_art) { + acc_update_self(ml->nodeindices, n * sizeof(int)); + } + + if (szdp) { + int pcnt = nrn_soa_padded_size(n, layout) * szdp; + acc_update_self(ml->pdata, pcnt * sizeof(int)); + } + + nrb = tml->ml->_net_receive_buffer; + + if (nrb) { + acc_update_self(&nrb->_cnt, sizeof(int)); + acc_update_self(&nrb->_size, sizeof(int)); + acc_update_self(&nrb->_pnt_offset, sizeof(int)); + acc_update_self(&nrb->_displ_cnt, sizeof(int)); + + acc_update_self(nrb->_pnt_index, sizeof(int) * nrb->_size); + acc_update_self(nrb->_weight_index, sizeof(int) * nrb->_size); + acc_update_self(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + acc_update_self(nrb->_nrb_index, sizeof(int) * nrb->_size); + } + } + + if (nt->shadow_rhs_cnt) { + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + acc_update_self(nt->_shadow_rhs, pcnt * sizeof(double)); + /* copy shadow_d to host */ + acc_update_self(nt->_shadow_d, pcnt * sizeof(double)); + } + + if (nt->n_pntproc) { + acc_update_self(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + } + + if (nt->n_weight) { + acc_update_self(nt->weights, sizeof(double) * nt->n_weight); + } + + /* dont update vdata, its pointer array + if(nt->_nvdata) { + acc_update_self(nt->_vdata, sizeof(double)*nt->_nvdata); + } + */ + } + } +#else + (void)threads; + (void)nthreads; +#endif +} + +void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { +#ifdef _OPENACC + + printf("\n --- Copying to Device! 
--- \n"); + + int i; + NetReceiveBuffer_t* nrb; + + for (i = 0; i < nthreads; i++) { + NrnThread* nt = threads + i; + + if (nt->compute_gpu && (nt->end > 0)) { + /* -- copy data to device -- */ + + int ne = nrn_soa_padded_size(nt->end, 0); + + acc_update_device(nt->_actual_rhs, ne * sizeof(double)); + acc_update_device(nt->_actual_d, ne * sizeof(double)); + acc_update_device(nt->_actual_a, ne * sizeof(double)); + acc_update_device(nt->_actual_b, ne * sizeof(double)); + acc_update_device(nt->_actual_v, ne * sizeof(double)); + acc_update_device(nt->_actual_area, ne * sizeof(double)); + + /* @todo: nt._ml_list[tml->index] = tml->ml; */ + + /* -- copy NrnThreadMembList list ml to host -- */ + NrnThreadMembList* tml; + for (tml = nt->tml; tml; tml = tml->next) { + Memb_list* ml = tml->ml; + int type = tml->index; + int n = ml->nodecount; + int szp = nrn_prop_param_size_[type]; + int szdp = nrn_prop_dparam_size_[type]; + int is_art = nrn_is_artificial_[type]; + int layout = nrn_mech_data_layout_[type]; + + int pcnt = nrn_soa_padded_size(n, layout) * szp; + + acc_update_device(ml->data, pcnt * sizeof(double)); + + if (!is_art) { + acc_update_device(ml->nodeindices, n * sizeof(int)); + } + + if (szdp) { + int pcnt = nrn_soa_padded_size(n, layout) * szdp; + acc_update_device(ml->pdata, pcnt * sizeof(int)); + } + + nrb = tml->ml->_net_receive_buffer; + + if (nrb) { + acc_update_device(&nrb->_cnt, sizeof(int)); + acc_update_device(&nrb->_size, sizeof(int)); + acc_update_device(&nrb->_pnt_offset, sizeof(int)); + acc_update_device(&nrb->_displ_cnt, sizeof(int)); + + acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_size); + acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_size); + acc_update_device(nrb->_displ, sizeof(int) * (nrb->_size + 1)); + acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_size); + } + } + + if (nt->shadow_rhs_cnt) { + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + acc_update_device(nt->_shadow_rhs, pcnt * sizeof(double)); + /* copy shadow_d to host */ + acc_update_device(nt->_shadow_d, pcnt * sizeof(double)); + } + + if (nt->n_pntproc) { + acc_update_device(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + } + + if (nt->n_weight) { + acc_update_device(nt->weights, sizeof(double) * nt->n_weight); + } + + /* don't and don't update vdata, its pointer array + if(nt->_nvdata) { + acc_update_device(nt->_vdata, sizeof(double)*nt->_nvdata); + } + */ + } + } +#else + (void)threads; + (void)nthreads; +#endif +} + +#ifdef __cplusplus +extern "C" { +#endif + +void update_matrix_from_gpu(NrnThread* _nt) { +#ifdef _OPENACC + if (_nt->compute_gpu && (_nt->end > 0)) { + /* before copying, make sure all computations in the stream are completed */ + #pragma acc wait(_nt->stream_id) + + /* openacc routine doesn't allow asyn, use pragma */ + // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double)); + + /* RHS and D are contigious, copy them in one go! + * NOTE: in pragma you have to give actual pointer like below and not nt->rhs... 
+ */ + double* rhs = _nt->_actual_rhs; + int ne = nrn_soa_padded_size(_nt->end, 0); + + #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id) + #pragma acc wait(_nt->stream_id) + } +#else + (void)_nt; +#endif +} + +void update_matrix_to_gpu(NrnThread* _nt) { +#ifdef _OPENACC + if (_nt->compute_gpu && (_nt->end > 0)) { + /* before copying, make sure all computations in the stream are completed */ + #pragma acc wait(_nt->stream_id) + + /* while discussion with Michael we found that RHS is also needed on + * gpu because nrn_cap_jacob uses rhs which is being updated on GPU + */ + // printf("\n Copying voltage to GPU ... "); + double* v = _nt->_actual_v; + double* rhs = _nt->_actual_rhs; + int ne = nrn_soa_padded_size(_nt->end, 0); + + #pragma acc update device(v[0 : ne]) async(_nt->stream_id) + #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id) + #pragma acc wait(_nt->stream_id) + } +#else + (void)_nt; +#endif +} + +#ifdef __cplusplus +} +#endif + +void finalize_data_on_device() { + /*@todo: when we have used random123 on gpu and we do this finalize, + I am seeing cuCtxDestroy returned CUDA_ERROR_INVALID_CONTEXT error. + THis might be due to the fact that the cuda apis (e.g. free is not + called yet for Ramdom123 data / streams etc. So handle this better! + */ + return; + +#ifdef _OPENACC + acc_shutdown(acc_device_default); +#endif +} + +void nrn_newtonspace_copyto_device(NewtonSpace* ns) { +#ifdef _OPENACC + if (nrn_threads[0].compute_gpu == 0) { + return; + } + + int n = ns->n * ns->n_instance; + // actually, the values of double do not matter, only the pointers. + NewtonSpace* d_ns = (NewtonSpace*)acc_copyin(ns, sizeof(NewtonSpace)); + + double* pd; + int* pint; + double** ppd; + + pd = (double*)acc_copyin(ns->delta_x, n * sizeof(double)); + acc_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + + pd = (double*)acc_copyin(ns->high_value, n * sizeof(double)); + acc_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + + pd = (double*)acc_copyin(ns->low_value, n * sizeof(double)); + acc_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + + pd = (double*)acc_copyin(ns->rowmax, n * sizeof(double)); + acc_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + + pint = (int*)acc_copyin(ns->perm, n * sizeof(int)); + acc_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + + ppd = (double**)acc_copyin(ns->jacobian, ns->n * sizeof(double*)); + acc_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + + // the actual jacobian doubles were allocated as a single array + double* d_jacdat = (double*)acc_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + + for (int i = 0; i < ns->n; ++i) { + pd = d_jacdat + i * n; + acc_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + } +#endif +} + +void nrn_sparseobj_copyto_device(SparseObj* so) { +#ifdef _OPENACC + if (nrn_threads[0].compute_gpu == 0) { + return; + } + + unsigned n1 = so->neqn + 1; + SparseObj* d_so = (SparseObj*)acc_copyin(so, sizeof(SparseObj)); + // only pointer fields in SparseObj that need setting up are + // rowst, diag, rhs, ngetcall, coef_list + // only pointer fields in Elm that need setting up are + // r_down, c_right, value + // do not care about the Elm* ptr value, just the space. 
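    // Pointer fix-up is done in two passes below: the first pass copies every Elm
    // and its per-instance value array, linking only the neighbours that already
    // have a device copy (rowst/diag entries, c_left, r_up); the second pass
    // revisits each Elm to patch r_down and c_right once all elements exist on
    // the device.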
+ + Elm** ppelm; + Elm* pelm; + unsigned* pu; + double* pd; + double** ppd; + + Elm** d_rowst = (Elm**)acc_copyin(so->rowst, n1 * sizeof(Elm*)); + acc_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + + Elm** d_diag = (Elm**)acc_copyin(so->diag, n1 * sizeof(Elm*)); + acc_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + + pu = (unsigned*)acc_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); + acc_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + + pd = (double*)acc_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); + acc_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + + double** d_coef_list = + (double**)acc_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); + acc_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + + // Fill in relevant Elm pointer values + + for (unsigned irow = 1; irow < n1; ++irow) { + for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { + Elm* pelm = (Elm*)acc_copyin(elm, sizeof(Elm)); + + if (elm == so->rowst[irow]) { + acc_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + } else { + Elm* d_e = (Elm*)acc_deviceptr(elm->c_left); + acc_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + } + + if (elm->col == elm->row) { + acc_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + } + + if (irow > 1) { + if (elm->r_up) { + Elm* d_e = (Elm*)acc_deviceptr(elm->r_up); + acc_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + } + } + + pd = (double*)acc_copyin(elm->value, so->_cntml_padded * sizeof(double)); + acc_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + } + } + + // visit all the Elm again and fill in pelm->r_down and pelm->c_left + for (unsigned irow = 1; irow < n1; ++irow) { + for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { + pelm = (Elm*)acc_deviceptr(elm); + if (elm->r_down) { + Elm* d_e = (Elm*)acc_deviceptr(elm->r_down); + acc_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + } + if (elm->c_right) { + Elm* d_e = (Elm*)acc_deviceptr(elm->c_right); + acc_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); + } + } + } + + // Fill in the d_so->coef_list + for (unsigned i = 0; i < so->coef_list_size; ++i) { + pd = (double*)acc_deviceptr(so->coef_list[i]); + acc_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + } +#endif +} diff --git a/coreneuron/nrniv/nrn_acc_manager.h b/coreneuron/nrniv/nrn_acc_manager.h new file mode 100644 index 000000000..d0cc70ae8 --- /dev/null +++ b/coreneuron/nrniv/nrn_acc_manager.h @@ -0,0 +1,31 @@ +#ifndef _nrn_device_manager_ +#define _nrn_device_manager_ + +#if defined(_OPENACC) +#include +#endif + +#include "coreneuron/nrnoc/multicore.h" + +void setup_nrnthreads_on_device(NrnThread* threads, int nthreads); +void update_nrnthreads_on_host(NrnThread* threads, int nthreads); +void update_nrnthreads_on_device(NrnThread* threads, int nthreads); +void modify_data_on_device(NrnThread* threads, int nthreads); +void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads); +void finalize_data_on_device(); + +#ifdef __cplusplus +extern "C" { +#endif + +void update_matrix_from_gpu(NrnThread* _nt); +void update_matrix_to_gpu(NrnThread* _nt); +void update_net_receive_buffer(NrnThread* _nt); +void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml); +void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); + +#ifdef __cplusplus +} +#endif + +#endif // _nrn_device_manager_ diff --git a/coreneuron/nrniv/nrn_assert.h b/coreneuron/nrniv/nrn_assert.h index 
f92e73497..b8d57c1a1 100644 --- a/coreneuron/nrniv/nrn_assert.h +++ b/coreneuron/nrniv/nrn_assert.h @@ -42,15 +42,16 @@ THE POSSIBILITY OF SUCH DAMAGE. */ /** Emit formatted message to stderr, then abort(). */ -static void abortf(const char *fmt,...) { +static void abortf(const char* fmt, ...) { va_list va; - va_start(va,fmt); - vfprintf(stderr,fmt,va); + va_start(va, fmt); + vfprintf(stderr, fmt, va); va_end(va); abort(); } /** assert()-like macro, independent of NDEBUG status */ -#define nrn_assert(x) ((x) || (abortf("%s:%d: Assertion '%s' failed.\n",__FILE__,__LINE__,#x),0)) +#define nrn_assert(x) \ + ((x) || (abortf("%s:%d: Assertion '%s' failed.\n", __FILE__, __LINE__, #x), 0)) #endif diff --git a/coreneuron/nrniv/nrn_datareader.cpp b/coreneuron/nrniv/nrn_datareader.cpp index 2a70eb30e..51b88612c 100644 --- a/coreneuron/nrniv/nrn_datareader.cpp +++ b/coreneuron/nrniv/nrn_datareader.cpp @@ -26,51 +26,74 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include #include "coreneuron/nrniv/nrn_datareader.h" -data_reader::data_reader(const char *filename, bool reorder) { - this->open(filename,reorder); +data_reader::data_reader(const char* filename, bool reorder) { + this->open(filename, reorder); checkpoint(0); } -void data_reader::open(const char *filename, bool reorder) { - reorder_on_read=reorder; +void data_reader::open(const char* filename, bool reorder) { + reorder_on_read = reorder; close(); F.open(filename); - nrn_assert(!F.fail()); } -static const int max_line_length=100; +static const int max_line_length = 100; int data_reader::read_int() { char line_buf[max_line_length]; - F.getline(line_buf,sizeof(line_buf)); + F.getline(line_buf, sizeof(line_buf)); nrn_assert(!F.fail()); int i; - int n_scan=sscanf(line_buf,"%d",&i); - nrn_assert(n_scan==1); - + int n_scan = sscanf(line_buf, "%d", &i); + nrn_assert(n_scan == 1); + return i; } +void data_reader::read_mapping_count(int* gid, + int* seg, + int* soma, + int* axon, + int* dend, + int* apical, + int* compartment) { + char line_buf[max_line_length]; + + F.getline(line_buf, sizeof(line_buf)); + nrn_assert(!F.fail()); + + /** mapping file has extra strings, ignore those */ + int n_scan = sscanf(line_buf, "%d %d %*s %d %*s %d %*s %d %*s %d %*s %d", gid, seg, soma, axon, + dend, apical, compartment); + + nrn_assert(n_scan == 7); +} + void data_reader::read_checkpoint_assert() { char line_buf[max_line_length]; - F.getline(line_buf,sizeof(line_buf)); + F.getline(line_buf, sizeof(line_buf)); nrn_assert(!F.fail()); int i; - int n_scan=sscanf(line_buf,"chkpnt %d\n",&i); - nrn_assert(n_scan==1); - nrn_assert(i==chkpnt); + int n_scan = sscanf(line_buf, "chkpnt %d\n", &i); + if (n_scan != 1) { + fprintf(stderr, "no chkpnt line for %d\n", chkpnt); + } + nrn_assert(n_scan == 1); + if (i != chkpnt) { + fprintf(stderr, "file chkpnt %d != expected %d\n", i, chkpnt); + } + nrn_assert(i == chkpnt); ++chkpnt; } - void data_reader::close() { F.close(); } - diff --git a/coreneuron/nrniv/nrn_datareader.h b/coreneuron/nrniv/nrn_datareader.h index 78d96a8dc..6c19dfd46 100644 --- a/coreneuron/nrniv/nrn_datareader.h +++ b/coreneuron/nrniv/nrn_datareader.h @@ -48,11 +48,10 @@ THE POSSIBILITY OF SUCH DAMAGE. * and read_dbl_array() methods use new []. */ - class data_reader { std::ifstream F; //!< File stream associated with reader. bool reorder_on_read; //!< True if we need to reorder for native endiannes. - int chkpnt; //!< Current checkpoint number state. 
+ int chkpnt; //!< Current checkpoint number state. /** Read a checkpoint line, bump our chkpnt counter, and assert equality. * @@ -63,22 +62,32 @@ class data_reader { void read_checkpoint_assert(); // private copy constructor, assignment: data_reader is not copyable. - data_reader(const data_reader &); - data_reader &operator=(const data_reader &); + data_reader(const data_reader&); + data_reader& operator=(const data_reader&); + + public: + data_reader() : reorder_on_read(false), chkpnt(0) { + } -public: - data_reader(): reorder_on_read(false),chkpnt(0) {} + explicit data_reader(const char* filename, bool reorder = false); - explicit data_reader(const char *filename,bool reorder=false); - /** Preserving chkpnt state, move to a new file. */ - void open(const char *filename, bool reorder); + void open(const char* filename, bool reorder); + + /** Is the file not open */ + bool fail() const { + return F.fail(); + } /** Query chkpnt state. */ - int checkpoint() const { return chkpnt; } + int checkpoint() const { + return chkpnt; + } /** Explicitly override chkpnt state. */ - void checkpoint(int c) { chkpnt=c; } + void checkpoint(int c) { + chkpnt = c; + } /** Parse a single integer entry. * @@ -89,9 +98,41 @@ class data_reader { */ int read_int(); + /** Parse a neuron mapping count entries + * + * Reads neuron mapping info which is represented by + * gid, #segments, #somas, #axons, #dendrites, #apicals, #total compartments + */ + void read_mapping_count(int* gid, + int* seg, + int* soma, + int* axon, + int* dend, + int* apical, + int* compartment); + + /** Parse a neuron section segment mapping + * + * Read count no of mappings for section to segment + */ + template + void read_mapping_info(T* mapinfo, int count) { + const int max_line_length = 1000; + char line_buf[max_line_length]; + + for (int i = 0; i < count; i++) { + F.getline(line_buf, sizeof(line_buf)); + nrn_assert(!F.fail()); + int sec, seg, n_scan; + n_scan = sscanf(line_buf, "%d %d", &sec, &seg); + nrn_assert(n_scan == 2); + mapinfo->add_segment(sec, seg); + } + } + /** Defined flag values for parse_array() */ - typedef enum parse_action { read,seek } parse_action; - + typedef enum parse_action { read, seek } parse_action; + /** Generic parse function for an array of fixed length. * * \tparam T the array element type: may be \c int or \c double. @@ -109,20 +150,22 @@ class data_reader { * representation of the writing process. */ template - inline T* parse_array(T* p,size_t count,parse_action flag){ - if (count>0 && flag!=seek) nrn_assert(p!=0); - + inline T* parse_array(T* p, size_t count, parse_action flag) { + if (count > 0 && flag != seek) + nrn_assert(p != 0); + read_checkpoint_assert(); switch (flag) { - case seek: - F.seekg(count*sizeof(T),std::ios_base::cur); - break; - case read: - F.read((char *)p,count*sizeof(T)); - if (reorder_on_read) endian::swap_endian_range(p,p+count); - break; + case seek: + F.seekg(count * sizeof(T), std::ios_base::cur); + break; + case read: + F.read((char*)p, count * sizeof(T)); + if (reorder_on_read) + endian::swap_endian_range(p, p + count); + break; } - + nrn_assert(!F.fail()); return p; } @@ -131,15 +174,18 @@ class data_reader { /** Read and optionally allocate an integer array of fixed length. */ template - inline T* read_array(T* p,size_t count) { return parse_array(p,count,read); } + inline T* read_array(T* p, size_t count) { + return parse_array(p, count, read); + } /** Allocate and read an integer array of fixed length. 
*/ template - inline T* read_array(size_t count) { return parse_array(new T[count],count,read); } - + inline T* read_array(size_t count) { + return parse_array(new T[count], count, read); + } + /** Close currently open file. */ void close(); }; - -#endif // ifndef nrn_datareader_h +#endif // ifndef nrn_datareader_h diff --git a/coreneuron/nrniv/nrn_setup.cpp b/coreneuron/nrniv/nrn_setup.cpp index 864ba7401..f537a983c 100644 --- a/coreneuron/nrniv/nrn_setup.cpp +++ b/coreneuron/nrniv/nrn_setup.cpp @@ -25,21 +25,26 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include "coreneuron/utils/randoms/nrnran123.h" +#include +#include +#include +#include #include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/multicore.h" #include "coreneuron/nrniv/nrniv_decl.h" #include "coreneuron/nrnoc/nrnoc_decl.h" #include "coreneuron/nrniv/vrecitem.h" -#include "coreneuron/utils/randoms/nrnran123.h" #include "coreneuron/utils/sdprintf.h" -#include "coreneuron/nrniv/nrn_datareader.h" #include "coreneuron/nrniv/nrn_assert.h" #include "coreneuron/nrniv/nrnmutdec.h" #include "coreneuron/nrniv/memory.h" #include "coreneuron/nrniv/nrn_setup.h" -#include -#include -#include +#include "coreneuron/nrniv/partrans.h" +#include "coreneuron/nrniv/nrnoptarg.h" +#include "coreneuron/nrniv/node_permute.h" +#include "coreneuron/nrniv/cellorder.h" +#include "coreneuron/utils/reports/nrnreport.h" // file format defined in cooperation with nrncore/src/nrniv/nrnbbcore_write.cpp // single integers are ascii one per line. arrays are binary int or double @@ -59,7 +64,7 @@ THE POSSIBILITY OF SUCH DAMAGE. // when _1.dat is read and then destroy it after _2.dat // is finished using it. An earlier implementation which attempted to // encode the thread number into the negative gid -// (i.e -ith - nth*(type +1000*index)) failed due to not large enough +// (i.e -ith - nth*(type +1000*index)) failed due to not large enough // integer domain size. // // _2.dat @@ -108,6 +113,36 @@ THE POSSIBILITY OF SUCH DAMAGE. // files with the first containing output_gids and netcon_srcgid which are // stored in the nt.presyns array and nt.netcons array respectively +int nrn_setup_multiple = 1; /* default */ +int nrn_setup_extracon = 0; /* default */ +static int maxgid; +// no gid in any file can be greater than maxgid. maxgid will be set so that +// maxgid * nrn_setup_multiple < 0x7fffffff + +// nrn_setup_extracon extra connections per NrnThread. +// i.e. nrn_setup_extracon * nrn_setup_multiple * nrn_nthread +// extra connections on this process. +// The targets of the connections on a NrnThread are randomly selected +// (with replacement) from the set of ProbAMPANMDA_EMS on the thread. +// (This synapse type is not strictly appropriate to be used as +// a generalized synapse with multiple input streams since some of its +// range variables store quantities that should be stream specific +// and therefore should be stored in the NetCon weight vector. But it +// might be good enough for our purposes. In any case, we'd like to avoid +// creating new POINT_PROCESS instances with all the extra complexities +// involved in adjusting the data arrays.) +// The nrn_setup_extracon value is used to allocate the appropriae +// amount of extra space for NrnThread.netcons and NrnThread.weights +// +// The most difficult problem is to augment the rank wide inputpresyn_ list. 
+// We wish to randomly choose source gids for the extracon NetCons from the +// set of gids not in "multiple" instance of the model the NrnThread is a +// member of. We need to take into account the possibilty of multiple +// NrnThread in multiple "multiple" instances having extra NetCon with the +// same source gid. That some of the source gids may be already be +// associated with already existing PreSyn on this rank is a minor wrinkle. +// This is done between phase1 and phase2 during the call to +// determine_inputpresyn(). // If MATRIX_LAYOUT is 1 then a,b,d,rhs,v,area is not padded using NRN_SOA_PAD // When MATRIX_LAYOUT is 0 then mechanism pdata index values into _actual_v @@ -125,340 +160,558 @@ THE POSSIBILITY OF SUCH DAMAGE. #define NRN_SOA_PAD 4 #endif #if !defined(NRN_SOA_BYTE_ALIGN) -// for layout 0, every range variable array must be aligned by at least 16 bytes (the size of the simd memory bus) -#define NRN_SOA_BYTE_ALIGN (2*sizeof(double)) +// for layout 0, every range variable array must be aligned by at least 16 bytes (the size of the +// simd memory bus) +#define NRN_SOA_BYTE_ALIGN (2 * sizeof(double)) #endif -static MUTDEC -static void determine_inputpresyn(void); -static size_t model_size(void); +static MUTDEC static size_t model_size(void); + +/// Vector of maps for negative presyns +std::vector > neg_gid2out; +/// Maps for ouput and input presyns +std::map gid2out; +std::map gid2in; -static int n_inputpresyn_; -static InputPreSyn* inputpresyn_; // the global array of instances. +/// InputPreSyn.nc_index_ to + InputPreSyn.nc_cnt_ give the NetCon* +std::vector netcon_in_presyn_order_; -// InputPreSyn.nc_index_ to + InputPreSyn.nc_cnt_ give the NetCon* -NetCon** netcon_in_presyn_order_; +/// Only for setup vector of netcon source gids +std::vector netcon_srcgid; // Wrap read_phase1 and read_phase2 calls to allow using nrn_multithread_job. // Args marshaled by store_phase_args are used by phase1_wrapper // and phase2_wrapper. -static void store_phase_args(int ngroup, int* gidgroups, data_reader* file_reader, - const char* path, int byte_swap) { - ngroup_w = ngroup; - gidgroups_w = gidgroups; - file_reader_w = file_reader; - path_w = path; - byte_swap_w = (bool) byte_swap; +static void store_phase_args(int ngroup, + int* gidgroups, + int* imult, + data_reader* file_reader, + const char* path, + int byte_swap) { + ngroup_w = ngroup; + gidgroups_w = gidgroups; + imult_w = imult; + file_reader_w = file_reader; + path_w = path; + byte_swap_w = (bool)byte_swap; } /* read files.dat file and distribute cellgroups to all mpi ranks */ -void nrn_read_filesdat(int &ngrp, int * &grp, const char *filesdat) -{ - FILE *fp = fopen( filesdat, "r" ); - - if ( !fp ) { - nrnmpi_fatal_error( "No input file with nrnthreads, exiting..." 
); +void nrn_read_filesdat(int& ngrp, int*& grp, int multiple, int*& imult, const char* filesdat) { + FILE* fp = fopen(filesdat, "r"); + + if (!fp) { + nrn_fatal_error("No input file with nrnthreads, exiting..."); } - int iNumFiles; - nrn_assert( fscanf( fp, "%d\n", &iNumFiles ) == 1 ); + int iNumFiles = 0; + nrn_assert(fscanf(fp, "%d\n", &iNumFiles) == 1); - if ( nrnmpi_numprocs > iNumFiles ) { - nrnmpi_fatal_error( "The number of CPUs cannot exceed the number of input files" ); + // temporary strategem to figure out if model uses gap junctions while + // being backward compatible + if (iNumFiles == -1) { + nrn_assert(fscanf(fp, "%d\n", &iNumFiles) == 1); + nrn_have_gaps = 1; + if (nrnmpi_myid == 0) { + printf("Model uses gap junctions\n"); + } + } + + if (nrnmpi_numprocs > iNumFiles) { + nrn_fatal_error("The number of CPUs cannot exceed the number of input files"); } ngrp = 0; - grp = new int[iNumFiles / nrnmpi_numprocs + 1]; + grp = new int[iNumFiles * multiple / nrnmpi_numprocs + 1]; + imult = new int[iNumFiles * multiple / nrnmpi_numprocs + 1]; // irerate over gids in files.dat - for ( int iNum = 0; iNum < iNumFiles; ++iNum ) { + for (int iNum = 0; iNum < iNumFiles * multiple; ++iNum) { int iFile; - nrn_assert( fscanf( fp, "%d\n", &iFile ) == 1 ); - if ( ( iNum % nrnmpi_numprocs ) == nrnmpi_myid ) { + nrn_assert(fscanf(fp, "%d\n", &iFile) == 1); + if ((iNum % nrnmpi_numprocs) == nrnmpi_myid) { grp[ngrp] = iFile; + imult[ngrp] = iNum / iNumFiles; ngrp++; } + if ((iNum + 1) % iNumFiles == 0) { + rewind(fp); + fscanf(fp, "%*d\n"); + } } - fclose( fp ); + fclose(fp); } -void nrn_setup(const char *path, const char *filesdat, int byte_swap, int threading) { +void read_phase1(data_reader& F, int imult, NrnThread& nt) { + assert(!F.fail()); + int zz = imult * maxgid; // offset for each gid + nt.n_presyn = F.read_int(); /// Number of PreSyn-s in NrnThread nt + nt.n_netcon = F.read_int(); /// Number of NetCon-s in NrnThread nt + nt.presyns = new PreSyn[nt.n_presyn]; + nt.netcons = new NetCon[nt.n_netcon + nrn_setup_extracon]; + nt.presyns_helper = (PreSynHelper*)ecalloc(nt.n_presyn, sizeof(PreSynHelper)); + + /// Checkpoint in bluron is defined for both phase 1 and phase 2 since they are written together + /// output_gid has all of output PreSyns, netcon_srcgid is created for NetCons which might be + /// 10k times more than output_gid. + int* output_gid = F.read_array(nt.n_presyn); + // the extra netcon_srcgid will be filled in later + netcon_srcgid[nt.id] = new int[nt.n_netcon + nrn_setup_extracon]; + F.read_array(netcon_srcgid[nt.id], nt.n_netcon); + F.close(); - /// Number of local cell groups - int ngroup = 0; +#if 0 + // for checking whether negative gids fit into the gid space + // not used for now since negative gids no longer encode the thread id. + double dmaxint = 1073741824.; //2^30 + for (;;) { + if (dmaxint*2. == double(int(dmaxint*2.))) { + dmaxint *= 2.; + }else{ + if (dmaxint*2. - 1. == double(int(dmaxint*2. - 1.))) { + dmaxint = 2.*dmaxint - 1.; + break; + } + } + } +#endif - /// Array of cell group numbers (indices) - int *gidgroups = NULL; + // offset the (non-negative) gids according to multiple + // make sure everything fits into gid space. 
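The comment above introduces the loop that follows, which shifts every non-negative gid into the range owned by duplicate instance imult. A minimal sketch of that arithmetic with made-up values, assuming only that maxgid is chosen as 0x7fffffff / nrn_setup_multiple as in nrn_setup():

    #include <cassert>
    #include <cstdio>

    int main() {
        const int nrn_setup_multiple = 4;            // number of duplicate instances
        const int maxgid = 0x7fffffff / nrn_setup_multiple;
        int gid = 123456;                            // a gid read from the dataset
        for (int imult = 0; imult < nrn_setup_multiple; ++imult) {
            assert(gid < maxgid);                    // same check as in read_phase1
            int shifted = gid + imult * maxgid;      // disjoint gid range per instance
            std::printf("instance %d -> gid %d\n", imult, shifted);
        }
        return 0;
    }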
+ for (int i = 0; i < nt.n_presyn; ++i) { + if (output_gid[i] >= 0) { + nrn_assert(output_gid[i] < maxgid); + output_gid[i] += zz; + } + } + int* nc_srcgid = netcon_srcgid[nt.id]; + for (int i = 0; i < nt.n_netcon; ++i) { + if (nc_srcgid[i] >= 0) { + nrn_assert(nc_srcgid[i] < maxgid); + nc_srcgid[i] += zz; + } + } - double time = nrnmpi_wtime(); + for (int i = 0; i < nt.n_presyn; ++i) { + int gid = output_gid[i]; + if (gid == -1) + continue; - nrn_read_filesdat(ngroup, gidgroups, filesdat); + // Note that the negative (type, index) + // coded information goes into the neg_gid2out[tid] hash table. + // See netpar.cpp for the netpar_tid_... function implementations. + // Both that table and the process wide gid2out table can be deleted + // before the end of setup - assert(ngroup > 0); - MUTCONSTRUCT(1) - // temporary bug work around. If any process has multiple threads, no - // process can have a single thread. So, for now, if one thread, make two. - // Fortunately, empty threads work fine. - /// Allocate NrnThread* nrn_threads of size ngroup (minimum 2) - nrn_threads_create(ngroup == 1?2:ngroup, threading); // serial/parallel threads + MUTLOCK + /// Put gid into the gid2out hash table with correspondent output PreSyn + /// Or to the negative PreSyn map + PreSyn* ps = nt.presyns + i; + if (gid >= 0) { + char m[200]; + if (gid2in.find(gid) != gid2in.end()) { + sprintf(m, "gid=%d already exists as an input port", gid); + hoc_execerror( + m, + "Setup all the output ports on this process before using them as input ports."); + } + if (gid2out.find(gid) != gid2out.end()) { + sprintf(m, "gid=%d already exists on this process as an output port", gid); + hoc_execerror(m, 0); + } + gid2out[gid] = ps; + ps->gid_ = gid; + ps->output_index_ = gid; + } else { + nrn_assert(neg_gid2out[nt.id].find(gid) == neg_gid2out[nt.id].end()); + neg_gid2out[nt.id][gid] = ps; + } + MUTUNLOCK - /// Reserve vector of maps of size ngroup for negative gid-s - /// std::vector< std::map > neg_gid2out; - netpar_tid_gid2ps_alloc(ngroup); + if (gid < 0) { + nt.presyns[i].output_index_ = -1; + } + } + delete[] output_gid; + + if (nrn_setup_extracon > 0) { + // very simplistic + // Use this threads positive source gids - zz in nt.netcon order as the + // source gids for extracon. + // The edge cases are: + // The 0th duplicate uses uses source gids for the last duplicate. + // If there are fewer positive source gids than extracon, then keep + // rotating through the nt.netcon . + // If there are no positive source gids, use a source gid of -1. + // Would not be difficult to modify so that random positive source was + // used, and/or random connect to another duplicate. + // Note that we increment the nt.n_netcon at the end of this function. + int sidoffset; // how much to increment the corresponding positive gid + // like ring connectivity + if (imult > 0) { + sidoffset = -maxgid; + } else if (nrn_setup_multiple > 1) { + sidoffset = (nrn_setup_multiple - 1) * maxgid; + } else { + sidoffset = 0; + } + // set up the extracon srcgid_ + int* nc_srcgid = netcon_srcgid[nt.id]; + int j = 0; // rotate through the n_netcon netcon_srcgid + for (int i = 0; i < nrn_setup_extracon; ++i) { + int sid = -1; + for (int k = 0; k < nt.n_netcon; ++k) { + // potentially rotate j through the entire n_netcon but no further + sid = nc_srcgid[j]; + j = (j + 1) % nt.n_netcon; + if (sid >= 0) { + break; + } + } + if (sid < 0) { // only connect to real cells. 
+ sid = -1; + } else { + sid += sidoffset; + } + nc_srcgid[i + nt.n_netcon] = sid; + } + // finally increment the n_netcon + nt.n_netcon += nrn_setup_extracon; + } +} - // bug fix. gid2out is cumulative over all threads and so do not - // know how many there are til after phase1 - // A process's complete set of output gids and allocation of each thread's - // nt.presyns and nt.netcons arrays. - // Generates the gid2out map which is needed - // to later count the required number of InputPreSyn - /// gid2out - map of output presyn-s - /// std::map gid2out; - nrn_reset_gid2out(); +void netpar_tid_gid2ps(int tid, int gid, PreSyn** ps, InputPreSyn** psi) { + /// for gid < 0 returns the PreSyn* in the thread (tid) specific map. + *ps = NULL; + *psi = NULL; + std::map::iterator gid2out_it; + if (gid >= 0) { + gid2out_it = gid2out.find(gid); + if (gid2out_it != gid2out.end()) { + *ps = gid2out_it->second; + } else { + std::map::iterator gid2in_it; + gid2in_it = gid2in.find(gid); + if (gid2in_it != gid2in.end()) { + *psi = gid2in_it->second; + } + } + } else { + gid2out_it = neg_gid2out[tid].find(gid); + if (gid2out_it != neg_gid2out[tid].end()) { + *ps = gid2out_it->second; + } + } +} - data_reader *file_reader=new data_reader[ngroup]; +void determine_inputpresyn() { + // allocate the process wide InputPreSyn array + // all the output_gid have been registered and associated with PreSyn. + // now count the needed InputPreSyn by filling the netpar::gid2in map + gid2in.clear(); - /* nrn_multithread_job supports serial, pthread, and openmp. */ - store_phase_args(ngroup, gidgroups, file_reader, path, byte_swap); - coreneuron::phase_wrapper<(coreneuron::phase)1>(); /// If not the xlc compiler, it should be coreneuron::phase::one + // now have to fill the new table + // do not need to worry about negative gid overlap since only use + // it to search for PreSyn in this thread. - // from the netpar::gid2out map and the netcon_srcgid array, - // fill the netpar::gid2in, and from the number of entries, - // allocate the process wide InputPreSyn array - determine_inputpresyn(); + std::vector inputpresyn_; + std::map::iterator gid2out_it; + std::map::iterator gid2in_it; + + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& nt = nrn_threads[ith]; + // associate gid with InputPreSyn and increase PreSyn and InputPreSyn count + nt.n_input_presyn = 0; + for (int i = 0; i < nt.n_netcon; ++i) { + int gid = netcon_srcgid[ith][i]; + if (gid >= 0) { + /// If PreSyn or InputPreSyn is already in the map + gid2out_it = gid2out.find(gid); + if (gid2out_it != gid2out.end()) { + /// Increase PreSyn count + ++gid2out_it->second->nc_cnt_; + continue; + } + gid2in_it = gid2in.find(gid); + if (gid2in_it != gid2in.end()) { + /// Increase InputPreSyn count + ++gid2in_it->second->nc_cnt_; + continue; + } + + /// Create InputPreSyn and increase its count + InputPreSyn* psi = new InputPreSyn; + ++psi->nc_cnt_; + gid2in[gid] = psi; + inputpresyn_.push_back(psi); + ++nt.n_input_presyn; + } else { + gid2out_it = neg_gid2out[nt.id].find(gid); + if (gid2out_it != neg_gid2out[nt.id].end()) { + /// Increase negative PreSyn count + ++gid2out_it->second->nc_cnt_; + } + } + } + } + // now, we can opportunistically create the NetCon* pointer array + // to save some memory overhead for + // "large number of small array allocation" by + // counting the number of NetCons each PreSyn and InputPreSyn point to. 
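netpar_tid_gid2ps() and the counting loop above resolve a source gid the same way: a non-negative gid is looked up first among this process's output PreSyns (gid2out), then among already-created InputPreSyns (gid2in), while a negative gid is only meaningful within its own thread's neg_gid2out map. A small self-contained sketch of that precedence, using plain ints in place of PreSyn/InputPreSyn pointers (all names here are illustrative):

    #include <cstdio>
    #include <map>
    #include <vector>

    // 0 = not found, 1 = found as output PreSyn, 2 = found as InputPreSyn
    int resolve(int tid, int gid,
                const std::map<int, int>& out,                    // stands in for gid2out
                const std::map<int, int>& in,                     // stands in for gid2in
                const std::vector<std::map<int, int> >& neg) {    // stands in for neg_gid2out
        if (gid >= 0) {
            if (out.count(gid)) return 1;   // an output port on this process wins
            if (in.count(gid)) return 2;    // otherwise an existing input port
            return 0;                       // unknown: caller would create an InputPreSyn
        }
        return neg[tid].count(gid) ? 1 : 0; // negative gids are thread-local only
    }

    int main() {
        std::map<int, int> out, in;
        std::vector<std::map<int, int> > neg(2);
        out[7] = 1; in[42] = 1; neg[0][-3] = 1;
        std::printf("%d %d %d %d\n",
                    resolve(0, 7, out, in, neg),    // 1
                    resolve(0, 42, out, in, neg),   // 2
                    resolve(0, -3, out, in, neg),   // 1
                    resolve(1, -3, out, in, neg));  // 0: other thread's map
        return 0;
    }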
+ // Conceivably the nt.netcons could become a process global array + // in which case the NetCon* pointer array could become an integer index + // array. More speculatively, the index array could be eliminated itself + // if the process global NetCon array were ordered properly but that + // would interleave NetCon from different threads. Not a problem for + // serial threads but the reordering would propagate to nt.pntprocs + // if the NetCon data pointers are also replaced by integer indices. + + // First, allocate the pointer array. + int n_nc = 0; + for (int ith = 0; ith < nrn_nthread; ++ith) { + n_nc += nrn_threads[ith].n_netcon; + } + netcon_in_presyn_order_.resize(n_nc); + n_nc = 0; + + // fill the indices with the offset values and reset the nc_cnt_ + // such that we use the nc_cnt_ in the following loop to assign the NetCon + // to the right place + // for PreSyn + int offset = 0; + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& nt = nrn_threads[ith]; + for (int i = 0; i < nt.n_presyn; ++i) { + PreSyn& ps = nt.presyns[i]; + ps.nc_index_ = offset; + offset += ps.nc_cnt_; + ps.nc_cnt_ = 0; + } + } + // for InputPreSyn + for (size_t i = 0; i < inputpresyn_.size(); ++i) { + InputPreSyn* psi = inputpresyn_[i]; + psi->nc_index_ = offset; + offset += psi->nc_cnt_; + psi->nc_cnt_ = 0; + } + inputpresyn_.clear(); + + // fill the netcon_in_presyn_order and recompute nc_cnt_ + // note that not all netcon_in_presyn will be filled if there are netcon + // with no presyn (ie. netcon_srcgid[nt.id][i] = -1) but that is ok since they are + // only used via ps.nc_index_ and ps.nc_cnt_; + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& nt = nrn_threads[ith]; + for (int i = 0; i < nt.n_netcon; ++i) { + NetCon* nc = nt.netcons + i; + int gid = netcon_srcgid[ith][i]; + PreSyn* ps; + InputPreSyn* psi; + netpar_tid_gid2ps(ith, gid, &ps, &psi); + if (ps) { + netcon_in_presyn_order_[ps->nc_index_ + ps->nc_cnt_] = nc; + ++ps->nc_cnt_; + ++n_nc; + } else if (psi) { + netcon_in_presyn_order_[psi->nc_index_ + psi->nc_cnt_] = nc; + ++psi->nc_cnt_; + ++n_nc; + } + } + } - // read the rest of the gidgroup's data and complete the setup for each - // thread. - /* nrn_multithread_job supports serial, pthread, and openmp. */ - coreneuron::phase_wrapper<(coreneuron::phase)2>(); + /// Resize the vector to its actual size of the netcons put in it + netcon_in_presyn_order_.resize(n_nc); +} - /// Generally, tables depend on a few parameters. And if those parameters change, - /// then the table needs to be recomputed. This is obviously important in NEURON - /// since the user can change those parameters at any time. However, there is no - /// c example for CoreNEURON so can't see what it looks like in that context. - /// Boils down to setting up a function pointer of the function _check_table_thread(), - /// which is only executed by StochKV.c. - nrn_mk_table_check(); // was done in nrn_thread_memblist_setup in multicore.c +/// Clean up +void setup_cleanup() { + for (int ith = 0; ith < nrn_nthread; ++ith) { + if (netcon_srcgid[ith]) + delete[] netcon_srcgid[ith]; + } + netcon_srcgid.clear(); + neg_gid2out.clear(); +} - delete [] file_reader; +void nrn_setup(cn_input_params& input_params, const char* filesdat, int byte_swap) { + /// Number of local cell groups + int ngroup = 0; - netpar_tid_gid2ps_free(); + /// Array of cell group numbers (indices) + int* gidgroups = NULL; - if (nrn_nthread > 1) { - // NetCvode construction assumed one thread. 
Need nrn_nthread instances - // of NetCvodeThreadData - nrn_p_construct(); - } + /// Array of duplicate indices. Normally, with nrn_setup_multiple=1, + // they are ngroup values of 0. + int* imult = NULL; - model_size(); - delete []gidgroups; + double time = nrn_wtime(); - if ( nrnmpi_myid == 0 ) { - printf( " Nrn Setup Done (time: %g)\n", nrnmpi_wtime() - time ); - } + maxgid = 0x7fffffff / nrn_setup_multiple; + nrn_read_filesdat(ngroup, gidgroups, nrn_setup_multiple, imult, filesdat); -} + MUTCONSTRUCT(1) + // temporary bug work around. If any process has multiple threads, no + // process can have a single thread. So, for now, if one thread, make two. + // Fortunately, empty threads work fine. + // Allocate NrnThread* nrn_threads of size ngroup (minimum 2) + // Note that rank with 0 dataset/cellgroup works fine + nrn_threads_create(ngroup <= 1 ? 2 : ngroup, + input_params.threading); // serial/parallel threads -void setup_ThreadData(NrnThread& nt) { - for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) { - Memb_func& mf = memb_func[tml->index]; - Memb_list* ml = tml->ml; - if (mf.thread_size_) { - ml->_thread = (ThreadDatum*)ecalloc(mf.thread_size_, sizeof(ThreadDatum)); - if (mf.thread_mem_init_) { - MUTLOCK - (*mf.thread_mem_init_)(ml->_thread); - MUTUNLOCK - } + if (use_solve_interleave) { + create_interleave_info(); } - else ml->_thread = NULL; - } -} + /// Reserve vector of maps of size ngroup for negative gid-s + /// std::vector< std::map > neg_gid2out; + neg_gid2out.resize(ngroup); + + // bug fix. gid2out is cumulative over all threads and so do not + // know how many there are til after phase1 + // A process's complete set of output gids and allocation of each thread's + // nt.presyns and nt.netcons arrays. + // Generates the gid2out map which is needed + // to later count the required number of InputPreSyn + /// gid2out - map of output presyn-s + /// std::map gid2out; + gid2out.clear(); + + netcon_srcgid.resize(nrn_nthread); + for (int i = 0; i < nrn_nthread; ++i) + netcon_srcgid[i] = NULL; + + data_reader* file_reader = new data_reader[ngroup]; + + /* nrn_multithread_job supports serial, pthread, and openmp. */ + store_phase_args(ngroup, gidgroups, imult, file_reader, input_params.datpath, byte_swap); + + // gap junctions + if (nrn_have_gaps) { + assert(nrn_setup_multiple == 1); + nrn_partrans::transfer_thread_data_ = new nrn_partrans::TransferThreadData[nrn_nthread]; + nrn_partrans::setup_info_ = new nrn_partrans::SetupInfo[ngroup]; + coreneuron::phase_wrapper(); + nrn_partrans::gap_mpi_setup(ngroup); + } -void read_phase1(data_reader &F, NrnThread& nt) { - nt.n_presyn = F.read_int(); /// Number of PreSyn-s in NrnThread nt - nt.n_netcon = F.read_int(); /// Number of NetCon-s in NrnThread nt - nt.presyns = new PreSyn[nt.n_presyn]; - nt.netcons = new NetCon[nt.n_netcon]; + coreneuron::phase_wrapper<( + coreneuron::phase)1>(); /// If not the xlc compiler, it should be coreneuron::phase::one - /// Checkpoint in bluron is defined for both phase 1 and phase 2 since they are written together - /// output_gid has all of output PreSyns, netcon_srcgid is created for NetCons, which might be - /// 10k times more than output_gid. 
- int* output_gid = F.read_array(nt.n_presyn); - int* netcon_srcgid = F.read_array(nt.n_netcon); - F.close(); + // from the gid2out map and the netcon_srcgid array, + // fill the gid2in, and from the number of entries, + // allocate the process wide InputPreSyn array + determine_inputpresyn(); -#if 0 - // for checking whether negative gids fit into the gid space - // not used for now since negative gids no longer encode the thread id. - double dmaxint = 1073741824.; //2^30 - for (;;) { - if (dmaxint*2. == double(int(dmaxint*2.))) { - dmaxint *= 2.; - }else{ - if (dmaxint*2. - 1. == double(int(dmaxint*2. - 1.))) { - dmaxint = 2.*dmaxint - 1.; - break; - } - } - } + // read the rest of the gidgroup's data and complete the setup for each + // thread. + /* nrn_multithread_job supports serial, pthread, and openmp. */ + coreneuron::phase_wrapper<(coreneuron::phase)2>(); + + if (input_params.report) + coreneuron::phase_wrapper<(coreneuron::phase)3>(); + + double mindelay = set_mindelay(input_params.maxdelay); + input_params.set_mindelay(mindelay); + setup_cleanup(); + +#if INTERLEAVE_DEBUG + mk_cell_indices(); #endif - for (int i=0; i < nt.n_presyn; ++i) { - int gid = output_gid[i]; - // Note that the negative (type, index) - // coded information goes into the neg_gid2out[tid] hash table. - // See netpar.cpp for the netpar_tid_... function implementations. - // Both that table and the process wide gid2out table can be deleted - // before the end of setup - - MUTLOCK - /// Put gid into the gid2out hash table with correspondent output PreSyn - netpar_tid_set_gid2node(nt.id, gid, nrnmpi_myid, nt.presyns + i); - MUTUNLOCK - - if (gid < 0) { - nt.presyns[i].output_index_ = -1; + /// Generally, tables depend on a few parameters. And if those parameters change, + /// then the table needs to be recomputed. This is obviously important in NEURON + /// since the user can change those parameters at any time. However, there is no + /// c example for CoreNEURON so can't see what it looks like in that context. + /// Boils down to setting up a function pointer of the function _check_table_thread(), + /// which is only executed by StochKV.c. + nrn_mk_table_check(); // was done in nrn_thread_memblist_setup in multicore.c + + delete[] file_reader; + + if (nrn_nthread > 1) { + // NetCvode construction assumed one thread. Need nrn_nthread instances + // of NetCvodeThreadData + nrn_p_construct(); + } + + model_size(); + delete[] gidgroups; + + if (nrnmpi_myid == 0) { + printf(" Nrn Setup Done (time: %g)\n", nrn_wtime() - time); } - nt.presyns[i].nt_ = &nt; - } - delete [] output_gid; - // encode netcon_srcgid_ values in nt.netcons - // which allows immediate freeing of that array. - for (int i=0; i < nt.n_netcon; ++i) { - nt.netcons[i].u.srcgid_ = netcon_srcgid[i]; - // do not need to worry about negative gid overlap since only use - // it to search for PreSyn in this thread. - } - delete [] netcon_srcgid; } -void determine_inputpresyn() { - /// THIS WHOLE FUNCTION NEEDS SERIOUS OPTIMIZATION! - // all the output_gid have been registered and associated with PreSyn. 
- // now count the needed InputPreSyn by filling the netpar::gid2in map - nrn_reset_gid2in(); - - // now have to fill the new table - int n_psi = 0; - for (int ith = 0; ith < nrn_nthread; ++ith) { - NrnThread& nt = nrn_threads[ith]; - for (int i = 0; i < nt.n_netcon; ++i) { - int gid = nt.netcons[i].u.srcgid_; - if (gid >= 0) { - n_psi += input_gid_register(gid); - } +void setup_ThreadData(NrnThread& nt) { + for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) { + Memb_func& mf = memb_func[tml->index]; + Memb_list* ml = tml->ml; + if (mf.thread_size_) { + ml->_thread = (ThreadDatum*)ecalloc(mf.thread_size_, sizeof(ThreadDatum)); + if (mf.thread_mem_init_) { + MUTLOCK (*mf.thread_mem_init_)(ml->_thread); + MUTUNLOCK + } + } else + ml->_thread = NULL; } - } +} - n_inputpresyn_ = n_psi; - inputpresyn_ = new InputPreSyn[n_psi]; +void read_phasegap(data_reader& F, int imult, NrnThread& nt) { + nrn_assert(imult == 0); + nrn_partrans::SetupInfo& si = nrn_partrans::setup_info_[nt.id]; + si.ntar = 0; + si.nsrc = 0; - // associate gid with InputPreSyn - n_psi = 0; - for (int ith = 0; ith < nrn_nthread; ++ith) { - NrnThread& nt = nrn_threads[ith]; - for (int i = 0; i < nt.n_netcon; ++i) { - int gid = nt.netcons[i].u.srcgid_; - if (gid >= 0) { - n_psi += input_gid_associate(gid, inputpresyn_ + n_psi); - } + if (F.fail()) { + return; } - } - // now, we can opportunistically create the NetCon* pointer array - // to save some memory overhead for - // "large number of small array allocation" by - // counting the number of NetCons each PreSyn and InputPreSyn point to, - // and instead of a NetCon** InputPreSyn.dil_, merely use a - // int InputPreSyn.nc_index_ into the pointer array. - // Conceivably the nt.netcons could become a process global array - // in which case the NetCon* pointer array could become an integer index - // array. More speculatively, the index array could be eliminated itself - // if the process global NetCon array were ordered properly but that - // would interleave NetCon from different threads. Not a problem for - // serial threads but the reordering would propagate to nt.pntprocs - // if the NetCon data pointers are also replaced by integer indices. - - // First, allocate the pointer array. - int n_nc = 0; - for (int ith = 0; ith < nrn_nthread; ++ith) { - n_nc += nrn_threads[ith].n_netcon; - } - netcon_in_presyn_order_ = new NetCon*[n_nc]; + int chkpntsave = F.checkpoint(); + F.checkpoint(0); - // count the NetCon each PreSyn and InputPresyn points to. 
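Both the old and the new determine_inputpresyn() build the single flat NetCon* array with the same three-pass scheme: count how many NetCons each (Input)PreSyn owns, turn the counts into running offsets (nc_index_), then reset the counts and reuse them as fill cursors. A compact sketch of that pattern on plain ints, independent of the PreSyn types:

    #include <cstdio>
    #include <vector>

    int main() {
        // owner[i] = which "presyn" the i-th netcon belongs to (illustrative data)
        std::vector<int> owner = {2, 0, 2, 1, 2, 0};
        const int npresyn = 3;

        std::vector<int> cnt(npresyn, 0), index(npresyn, 0);
        for (size_t i = 0; i < owner.size(); ++i)     // pass 1: count
            ++cnt[owner[i]];

        int offset = 0;
        for (int p = 0; p < npresyn; ++p) {           // pass 2: offsets, reset counts
            index[p] = offset;
            offset += cnt[p];
            cnt[p] = 0;
        }

        std::vector<int> order(owner.size());
        for (size_t i = 0; i < owner.size(); ++i) {   // pass 3: fill in presyn order
            int p = owner[i];
            order[index[p] + cnt[p]++] = (int)i;      // analogous to netcon_in_presyn_order_
        }

        for (size_t i = 0; i < order.size(); ++i)
            std::printf("%d ", order[i]);             // netcons grouped by presyn
        std::printf("\n");
        return 0;
    }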
- for (int ith = 0; ith < nrn_nthread; ++ith) { - NrnThread& nt = nrn_threads[ith]; - for (int i = 0; i < nt.n_netcon; ++i) { - int gid = nt.netcons[i].u.srcgid_; - PreSyn* ps; InputPreSyn* psi; - netpar_tid_gid2ps(ith, gid, &ps, &psi); - if (ps) { - ++ps->nc_cnt_; - }else if (psi) { - ++psi->nc_cnt_; - } - } - } - // fill the indices with the offset values and reset the nc_cnt_ - int offset = 0; - for (int ith = 0; ith < nrn_nthread; ++ith) { - NrnThread& nt = nrn_threads[ith]; - for (int i = 0; i < nt.n_presyn; ++i) { - PreSyn& ps = nt.presyns[i]; - ps.nc_index_ = offset; - offset += ps.nc_cnt_; - ps.nc_cnt_ = 0; - } + si.ntar = F.read_int(); + si.nsrc = F.read_int(); + si.type = F.read_int(); + si.ix_vpre = F.read_int(); + si.sid_target = F.read_array(si.ntar); + si.sid_src = F.read_array(si.nsrc); + si.v_indices = F.read_array(si.nsrc); + + F.checkpoint(chkpntsave); + +#if 0 + printf("%d read_phasegap tid=%d type=%d %s ix_vpre=%d nsrc=%d ntar=%d\n", + nrnmpi_myid, nt.id, si.type, memb_func[si.type].sym, si.ix_vpre, + si.nsrc, si.ntar); + for (int i=0; i < si.nsrc; ++i) { + printf("sid_src %d %d\n", si.sid_src[i], si.v_indices[i]); } - for (int i=0; i < n_inputpresyn_; ++i) { - InputPreSyn& psi = inputpresyn_[i]; - psi.nc_index_ = offset; - offset += psi.nc_cnt_; - psi.nc_cnt_ = 0; - } - // fill the netcon_in_presyn_order and recompute nc_cnt_ - // note that not all netcon_in_presyn will be filled if there are netcon - // with no presyn (ie. nc->u.srcgid_ = -1) but that is ok since they are - // only used via ps.nc_index_ and ps.nc_cnt_; - for (int ith = 0; ith < nrn_nthread; ++ith) { - NrnThread& nt = nrn_threads[ith]; - for (int i = 0; i < nt.n_netcon; ++i) { - NetCon* nc = nt.netcons + i; - int gid = nc->u.srcgid_; - PreSyn* ps; InputPreSyn* psi; - netpar_tid_gid2ps(ith, gid, &ps, &psi); - if (ps) { - netcon_in_presyn_order_[ps->nc_index_ + ps->nc_cnt_] = nc; - nc->src_ = ps; // maybe nc->src_ is not needed - ++ps->nc_cnt_; - }else if (psi) { - netcon_in_presyn_order_[psi->nc_index_ + psi->nc_cnt_] = nc; - nc->src_ = psi; // maybe nc->src_ is not needed - ++psi->nc_cnt_; - } - } + for (int i=0; i (cnt, layout); + return coreneuron::soa_padded_size(cnt, layout); } static size_t nrn_soa_byte_align(size_t i) { - if (LAYOUT == 0) { - size_t dbl_align = NRN_SOA_BYTE_ALIGN/sizeof(double); - size_t rem = i%dbl_align; - if (rem) { - i += dbl_align - rem; + if (LAYOUT == 0) { + size_t dbl_align = NRN_SOA_BYTE_ALIGN / sizeof(double); + size_t rem = i % dbl_align; + if (rem) { + i += dbl_align - rem; + } + assert((i * sizeof(double)) % NRN_SOA_BYTE_ALIGN == 0); } - assert((i*sizeof(double))%NRN_SOA_BYTE_ALIGN == 0); - } - return i; + return i; } // file data is AoS. ie. @@ -468,41 +721,43 @@ static size_t nrn_soa_byte_align(size_t i) { // alignment requirements. Ie. i_instance + i_item*align_cnt. 
int nrn_param_layout(int i, int mtype, Memb_list* ml) { - int layout = nrn_mech_data_layout_[mtype]; - if (layout == 1) { return i; } - assert(layout == 0); - int sz = nrn_prop_param_size_[mtype]; - int cnt = ml->nodecount; - int i_cnt = i / sz; - int i_sz = i % sz; - return nrn_i_layout(i_cnt, cnt, i_sz, sz, layout); + int layout = nrn_mech_data_layout_[mtype]; + if (layout == 1) { + return i; + } + assert(layout == 0); + int sz = nrn_prop_param_size_[mtype]; + int cnt = ml->nodecount; + int i_cnt = i / sz; + int i_sz = i % sz; + return nrn_i_layout(i_cnt, cnt, i_sz, sz, layout); } int nrn_i_layout(int icnt, int cnt, int isz, int sz, int layout) { - if (layout == 1) { - return icnt*sz + isz; - }else if (layout == 0) { - int padded_cnt = nrn_soa_padded_size(cnt, layout); // may want to factor out to save time - return icnt + isz*padded_cnt; - } - assert(0); - return 0; + if (layout == 1) { + return icnt * sz + isz; + } else if (layout == 0) { + int padded_cnt = nrn_soa_padded_size(cnt, layout); // may want to factor out to save time + return icnt + isz * padded_cnt; + } + assert(0); + return 0; } -template -inline void mech_layout(data_reader &F, T* data, int cnt, int sz, int layout){ - if (layout == 1) { /* AoS */ - F.read_array(data, cnt*sz); - }else if (layout == 0) { /* SoA */ - int align_cnt = nrn_soa_padded_size(cnt, layout); - T* d = F.read_array(cnt*sz); - for (int i=0; i < cnt; ++i) { - for (int j=0; j < sz; ++j) { - data[i + j*align_cnt] = d[i*sz + j]; - } +template +inline void mech_layout(data_reader& F, T* data, int cnt, int sz, int layout) { + if (layout == 1) { /* AoS */ + F.read_array(data, cnt * sz); + } else if (layout == 0) { /* SoA */ + int align_cnt = nrn_soa_padded_size(cnt, layout); + T* d = F.read_array(cnt * sz); + for (int i = 0; i < cnt; ++i) { + for (int j = 0; j < sz; ++j) { + data[i + j * align_cnt] = d[i * sz + j]; + } + } + delete[] d; } - delete [] d; - } } /* nrn_threads_free() presumes all NrnThread and NrnThreadMembList data is @@ -510,632 +765,1001 @@ inline void mech_layout(data_reader &F, T* data, int cnt, int sz, int layout){ * things up first. 
*/ void nrn_cleanup() { + gid2in.clear(); + gid2out.clear(); + + for (int it = 0; it < nrn_nthread; ++it) { + NrnThread* nt = nrn_threads + it; + NrnThreadMembList* next_tml = NULL; + for (NrnThreadMembList* tml = nt->tml; tml; tml = next_tml) { + Memb_list* ml = tml->ml; + + ml->data = NULL; // this was pointing into memory owned by nt + free(ml->pdata); + ml->pdata = NULL; + free(ml->nodeindices); + ml->nodeindices = NULL; + if (ml->_permute) { + delete[] ml->_permute; + ml->_permute = NULL; + } - nrnmpi_gid_clear(); + NetReceiveBuffer_t* nrb = ml->_net_receive_buffer; + if (nrb) { + if (nrb->_size) { + free(nrb->_pnt_index); + free(nrb->_weight_index); + free(nrb->_nrb_t); + free(nrb->_nrb_flag); + free(nrb->_displ); + free(nrb->_nrb_index); + } + free(nrb); + } - for (int it = 0; it < nrn_nthread; ++it) { - NrnThread* nt = nrn_threads + it; - NrnThreadMembList * next_tml = NULL; - for (NrnThreadMembList *tml = nt->tml; tml; tml = next_tml) { - Memb_list* ml = tml->ml; + NetSendBuffer_t* nsb = ml->_net_send_buffer; + if (nsb) { + if (nsb->_size) { + free(nsb->_sendtype); + free(nsb->_vdata_index); + free(nsb->_pnt_index); + free(nsb->_weight_index); + free(nsb->_nsb_t); + free(nsb->_nsb_flag); + } + free(nsb); + } - ml->data = NULL; // this was pointing into memory owned by nt - delete[] ml->pdata; - ml->pdata = NULL; - delete[] ml->nodeindices; - ml->nodeindices = NULL; + if (tml->dependencies) + free(tml->dependencies); - if(tml->dependencies) - free(tml->dependencies); + next_tml = tml->next; + free(tml->ml); + free(tml); + } - next_tml = tml->next; - free(tml->ml); - free(tml); - } + nt->_actual_rhs = NULL; + nt->_actual_d = NULL; + nt->_actual_a = NULL; + nt->_actual_b = NULL; - nt->_actual_rhs = NULL; - nt->_actual_d = NULL; - nt->_actual_a = NULL; - nt->_actual_b = NULL; + free(nt->_v_parent_index); + nt->_v_parent_index = NULL; + free(nt->_data); + nt->_data = NULL; - delete[] nt->_v_parent_index; - nt->_v_parent_index = NULL; + free(nt->_idata); + nt->_idata = NULL; - free(nt->_data); - nt->_data = NULL; + free(nt->_vdata); + nt->_vdata = NULL; - free(nt->_idata); - nt->_idata = NULL; + if (nt->_permute) { + delete[] nt->_permute; + nt->_permute = NULL; + } - free(nt->_vdata); - nt->_vdata = NULL; + if (nt->presyns_helper) { + free(nt->presyns_helper); + nt->_permute = NULL; + } - if (nt->pntprocs) { - delete[] nt->pntprocs; - nt->pntprocs = NULL; - } + if (nt->pntprocs) { + delete[] nt->pntprocs; + nt->pntprocs = NULL; + } - if (nt->presyns) { - delete [] nt->presyns; - nt->presyns = NULL; - } + if (nt->presyns) { + delete[] nt->presyns; + nt->presyns = NULL; + } - if (nt->netcons) { - delete [] nt->netcons; - nt->netcons = NULL; - } + if (nt->pnt2presyn_ix) { + for (int i = 0; i < nrn_has_net_event_cnt_; ++i) { + if (nt->pnt2presyn_ix[i]) { + free(nt->pnt2presyn_ix[i]); + } + } + free(nt->pnt2presyn_ix); + } - if (nt->weights) { - delete [] nt->weights; - nt->weights = NULL; - } + if (nt->netcons) { + delete[] nt->netcons; + nt->netcons = NULL; + } - if (nt->_shadow_rhs) { - free(nt->_shadow_rhs); - nt->_shadow_rhs = NULL; - } + if (nt->weights) { + delete[] nt->weights; + nt->weights = NULL; + } - if (nt->_shadow_d) { - free(nt->_shadow_d); - nt->_shadow_d = NULL; - } + if (nt->_shadow_rhs) { + free(nt->_shadow_rhs); + nt->_shadow_rhs = NULL; + } - free(nt->_ml_list); - } + if (nt->_shadow_d) { + free(nt->_shadow_d); + nt->_shadow_d = NULL; + } - delete [] inputpresyn_; - delete [] netcon_in_presyn_order_; + if (nt->_net_send_buffer_size) { + free(nt->_net_send_buffer); + 
nt->_net_send_buffer = NULL; + nt->_net_send_buffer_size = 0; + } - nrn_threads_free(); -} + // mapping information is available only for non-empty NrnThread + if (nt->mapping && nt->ncell) { + delete ((NeuronGroupMappingInfo*)nt->mapping); + } -void read_phase2(data_reader &F, NrnThread& nt) { - NrnThreadMembList* tml; - int n_outputgid = F.read_int(); - nrn_assert(n_outputgid > 0); // avoid n_outputgid unused warning - nt.ncell = F.read_int(); - nt.end = F.read_int(); - int nmech = F.read_int(); - - /// Checkpoint in bluron is defined for both phase 1 and phase 2 since they are written together - //printf("ncell=%d end=%d nmech=%d\n", nt.ncell, nt.end, nmech); - //printf("nart=%d\n", nart); - NrnThreadMembList* tml_last = NULL; - nt._ml_list = (Memb_list**)ecalloc(n_memb_func, sizeof(Memb_list*)); - - // local unpadded copy needed for updating pdata value indices into nt._data - // only field used is the data (as though unpadded aos) - // can be freed after that update. - Memb_list* unpadded_ml_list = (Memb_list*)ecalloc(n_memb_func, sizeof(Memb_list)); - - int shadow_rhs_cnt = 0; - for (int i=0; i < nmech; ++i) { - tml = (NrnThreadMembList*)emalloc(sizeof(NrnThreadMembList)); - tml->ml = (Memb_list*)emalloc(sizeof(Memb_list)); - tml->next = NULL; - tml->index = F.read_int(); - tml->ml->nodecount = F.read_int(); - tml->ml->_nodecount_padded = nrn_soa_padded_size(tml->ml->nodecount, nrn_mech_data_layout_[tml->index]); - if (memb_func[tml->index].is_point && nrn_is_artificial_[tml->index] == 0){ - // Avoid race for multiple PointProcess instances in same compartment. - if (tml->ml->nodecount > shadow_rhs_cnt) { - shadow_rhs_cnt = tml->ml->nodecount; - } - } - nt._ml_list[tml->index] = tml->ml; - //printf("index=%d nodecount=%d membfunc=%s\n", tml->index, tml->ml->nodecount, memb_func[tml->index].sym?memb_func[tml->index].sym:"None"); - if (nt.tml) { - tml_last->next = tml; - }else{ - nt.tml = tml; + free(nt->_ml_list); } - tml_last = tml; - } + netcon_in_presyn_order_.clear(); - if (shadow_rhs_cnt) { - nt._shadow_rhs = (double*)coreneuron::ecalloc_align(nrn_soa_padded_size(shadow_rhs_cnt,0), - NRN_SOA_BYTE_ALIGN, sizeof(double)); - nt._shadow_d = (double*)coreneuron::ecalloc_align(nrn_soa_padded_size(shadow_rhs_cnt,0), - NRN_SOA_BYTE_ALIGN, sizeof(double)); - } + nrn_threads_free(); +} + +void read_phase2(data_reader& F, int imult, NrnThread& nt) { + assert(!F.fail()); + nrn_assert(imult >= 0); // avoid imult unused warning + NrnThreadMembList* tml; + int n_outputgid = F.read_int(); + nrn_assert(n_outputgid > 0); // avoid n_outputgid unused warning + nt.ncell = F.read_int(); + nt.end = F.read_int(); + int nmech = F.read_int(); + + /// Checkpoint in bluron is defined for both phase 1 and phase 2 since they are written together + // printf("ncell=%d end=%d nmech=%d\n", nt.ncell, nt.end, nmech); + // printf("nart=%d\n", nart); + NrnThreadMembList* tml_last = NULL; + nt._ml_list = (Memb_list**)ecalloc(n_memb_func, sizeof(Memb_list*)); + + // local unpadded copy needed for updating pdata value indices into nt._data + // only field used is the data (as though unpadded aos) + // can be freed after that update. + Memb_list* unpadded_ml_list = (Memb_list*)ecalloc(n_memb_func, sizeof(Memb_list)); + + int shadow_rhs_cnt = 0; + nt.shadow_rhs_cnt = 0; + + nt.stream_id = 0; + nt.compute_gpu = 0; + +/* read_phase2 is being called from openmp region + * and hence we can set the stream equal to current thread id. 
+ * In fact we could set gid as stream_id when we will have nrn threads + * greater than number of omp threads. + */ +#if defined(_OPENMP) + nt.stream_id = omp_get_thread_num(); +#endif - nt._ndata = F.read_int(); - nt._nidata = F.read_int(); - nt._nvdata = F.read_int(); - nt.n_weight = F.read_int(); - - nt._data = NULL; // allocated below after padding - - if (nt._nidata) nt._idata = (int*)ecalloc(nt._nidata, sizeof(int)); - else nt._idata = NULL; - // see patternstim.cpp - int zzz = (&nt == nrn_threads) ? nrn_extra_thread0_vdata : 0; - if (nt._nvdata+zzz) - nt._vdata = (void**)ecalloc(nt._nvdata + zzz, sizeof(void*)); - else - nt._vdata = NULL; - //printf("_ndata=%d _nidata=%d _nvdata=%d\n", nt._ndata, nt._nidata, nt._nvdata); - - // The data format begins with the matrix data - int ne = nrn_soa_padded_size(nt.end, MATRIX_LAYOUT); - size_t offset = 6*ne; - size_t unpadded_offset = 6*nt.end; - - // Memb_list.data points into the nt.data array. - // Also count the number of Point_process - int npnt = 0; - for (tml = nt.tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - int type = tml->index; - int layout = nrn_mech_data_layout_[type]; - int n = ml->nodecount; - int sz = nrn_prop_param_size_[type]; - offset = nrn_soa_byte_align(offset); - ml->data = (double*)0 + offset; // adjust below since nt._data not allocated - unpadded_ml_list[type].data = (double*)0 + unpadded_offset; - offset += nrn_soa_padded_size(n, layout)*sz; - unpadded_offset += n*sz; - if (pnt_map[type] > 0) { - npnt += n; + for (int i = 0; i < nmech; ++i) { + tml = (NrnThreadMembList*)emalloc(sizeof(NrnThreadMembList)); + tml->ml = (Memb_list*)ecalloc(1, sizeof(Memb_list)); + tml->ml->_net_receive_buffer = NULL; + tml->ml->_net_send_buffer = NULL; + tml->ml->_permute = NULL; + tml->next = NULL; + tml->index = F.read_int(); + if (memb_func[tml->index].alloc == NULL) { + hoc_execerror(memb_func[tml->index].sym, "mechanism does not exist"); + } + tml->ml->nodecount = F.read_int(); + if (!memb_func[tml->index].sym) { + printf("%s (type %d) is not available\n", nrn_get_mechname(tml->index), tml->index); + exit(1); + } + tml->ml->_nodecount_padded = + nrn_soa_padded_size(tml->ml->nodecount, nrn_mech_data_layout_[tml->index]); + if (memb_func[tml->index].is_point && nrn_is_artificial_[tml->index] == 0) { + // Avoid race for multiple PointProcess instances in same compartment. 
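Each Memb_list also gets a _nodecount_padded next to nodecount: for the SoA layout (0) the instance count is rounded up so every range-variable column starts on the same aligned stride, while the AoS layout (1) needs no padding. A minimal sketch, assuming the rule is simply "round up to a multiple of NRN_SOA_PAD"; the actual coreneuron::soa_padded_size() used by nrn_soa_padded_size() is the authoritative version and may apply further alignment:

    #include <cassert>

    static const int NRN_SOA_PAD_SKETCH = 4;  // mirrors the NRN_SOA_PAD default

    int padded_size(int cnt, int layout) {
        if (layout == 1)
            return cnt;                                            // AoS: no padding
        int rem = cnt % NRN_SOA_PAD_SKETCH;
        return rem ? cnt + (NRN_SOA_PAD_SKETCH - rem) : cnt;       // SoA: round up
    }

    int main() {
        assert(padded_size(6, 1) == 6);   // AoS unchanged
        assert(padded_size(6, 0) == 8);   // SoA padded to a multiple of 4
        assert(padded_size(8, 0) == 8);   // already aligned
        return 0;
    }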
+ if (tml->ml->nodecount > shadow_rhs_cnt) { + shadow_rhs_cnt = tml->ml->nodecount; + } + } + nt._ml_list[tml->index] = tml->ml; + // printf("index=%d nodecount=%d membfunc=%s\n", tml->index, tml->ml->nodecount, + // memb_func[tml->index].sym?memb_func[tml->index].sym:"None"); + if (nt.tml) { + tml_last->next = tml; + } else { + nt.tml = tml; + } + tml_last = tml; } - } - nt.pntprocs = new Point_process[npnt]; // includes acell with and without gid - nt.n_pntproc = npnt; - //printf("offset=%ld ndata=%ld\n", offset, nt._ndata); - // assert(offset == nt._ndata); // not with alignment - nt._ndata = offset; - - // now that we know the effect of padding, we can allocate data space, - // fill matrix, and adjust Memb_list data pointers - nt._data = (double*)coreneuron::ecalloc_align(nt._ndata, NRN_SOA_BYTE_ALIGN, sizeof(double)); - nt._actual_rhs = nt._data + 0*ne; - nt._actual_d = nt._data + 1*ne; - nt._actual_a = nt._data + 2*ne; - nt._actual_b = nt._data + 3*ne; - nt._actual_v = nt._data + 4*ne; - nt._actual_area = nt._data + 5*ne; - for (tml= nt.tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - ml->data = nt._data + (ml->data - (double*)0); - } - // matrix info - nt._v_parent_index = F.read_array(nt.end); - F.read_array(nt._actual_a, nt.end); - F.read_array(nt._actual_b, nt.end); - F.read_array(nt._actual_area, nt.end); - F.read_array(nt._actual_v, nt.end); - - Memb_list** mlmap = new Memb_list*[n_memb_func]; - int synoffset = 0; - int* pnt_offset = new int[n_memb_func]; - - // All the mechanism data and pdata. - // Also fill in mlmap and pnt_offset - // Complete spec of Point_process except for the acell presyn_ field. - for (tml = nt.tml; tml; tml = tml->next) { - int type = tml->index; - Memb_list* ml = tml->ml; - mlmap[type] = ml; - int is_art = nrn_is_artificial_[type]; - int n = ml->nodecount; - int szp = nrn_prop_param_size_[type]; - int szdp = nrn_prop_dparam_size_[type]; - - if (!is_art) { - ml->nodeindices = F.read_array(ml->nodecount); - } else { - ml->nodeindices = NULL; + if (shadow_rhs_cnt) { + nt._shadow_rhs = (double*)coreneuron::ecalloc_align(nrn_soa_padded_size(shadow_rhs_cnt, 0), + NRN_SOA_BYTE_ALIGN, sizeof(double)); + nt._shadow_d = (double*)coreneuron::ecalloc_align(nrn_soa_padded_size(shadow_rhs_cnt, 0), + NRN_SOA_BYTE_ALIGN, sizeof(double)); + nt.shadow_rhs_cnt = shadow_rhs_cnt; } - - int layout = nrn_mech_data_layout_[type]; - mech_layout(F, ml->data, n, szp, layout); - - if (szdp) { - ml->pdata = new int[nrn_soa_padded_size(n, layout)*szdp]; - mech_layout(F, ml->pdata, n, szdp, layout); - }else{ - ml->pdata = NULL; + + nt._ndata = F.read_int(); + nt._nidata = F.read_int(); + nt._nvdata = F.read_int(); + nt.n_weight = F.read_int(); + + nt._data = NULL; // allocated below after padding + nt.mapping = NULL; // section segment mapping + + if (nt._nidata) + nt._idata = (int*)ecalloc(nt._nidata, sizeof(int)); + else + nt._idata = NULL; + // see patternstim.cpp + int extra_nv = (&nt == nrn_threads) ? nrn_extra_thread0_vdata : 0; + if (nt._nvdata + extra_nv) + nt._vdata = (void**)ecalloc(nt._nvdata + extra_nv, sizeof(void*)); + else + nt._vdata = NULL; + // printf("_ndata=%d _nidata=%d _nvdata=%d\n", nt._ndata, nt._nidata, nt._nvdata); + + // The data format begins with the matrix data + int ne = nrn_soa_padded_size(nt.end, MATRIX_LAYOUT); + size_t offset = 6 * ne; + size_t unpadded_offset = 6 * nt.end; + + // Memb_list.data points into the nt.data array. 
+ // Also count the number of Point_process + int npnt = 0; + for (tml = nt.tml; tml; tml = tml->next) { + Memb_list* ml = tml->ml; + int type = tml->index; + int layout = nrn_mech_data_layout_[type]; + int n = ml->nodecount; + int sz = nrn_prop_param_size_[type]; + offset = nrn_soa_byte_align(offset); + ml->data = (double*)0 + offset; // adjust below since nt._data not allocated + unpadded_ml_list[type].data = (double*)0 + unpadded_offset; + offset += nrn_soa_padded_size(n, layout) * sz; + unpadded_offset += n * sz; + if (pnt_map[type] > 0) { + npnt += n; + } } - if (pnt_map[type] > 0) { // POINT_PROCESS mechanism including acell - int cnt = ml->nodecount; - Point_process* pnt = NULL; - pnt = nt.pntprocs + synoffset; - pnt_offset[type] = synoffset; - synoffset += cnt; - for (int i=0; i < cnt; ++i) { - Point_process* pp = pnt + i; - pp->_type = type; - pp->_i_instance = i; - nt._vdata[ml->pdata[nrn_i_layout(i, cnt, 1, szdp, layout)]] = pp; - pp->_presyn = NULL; - pp->_tid = nt.id; - } + nt.pntprocs = new Point_process[npnt]; // includes acell with and without gid + nt.n_pntproc = npnt; + // printf("offset=%ld ndata=%ld\n", offset, nt._ndata); + // assert(offset == nt._ndata); // not with alignment + nt._ndata = offset; + + // now that we know the effect of padding, we can allocate data space, + // fill matrix, and adjust Memb_list data pointers + nt._data = (double*)coreneuron::ecalloc_align(nt._ndata, NRN_SOA_BYTE_ALIGN, sizeof(double)); + nt._actual_rhs = nt._data + 0 * ne; + nt._actual_d = nt._data + 1 * ne; + nt._actual_a = nt._data + 2 * ne; + nt._actual_b = nt._data + 3 * ne; + nt._actual_v = nt._data + 4 * ne; + nt._actual_area = nt._data + 5 * ne; + for (tml = nt.tml; tml; tml = tml->next) { + Memb_list* ml = tml->ml; + ml->data = nt._data + (ml->data - (double*)0); } - } - // Some pdata may index into data which has been reordered from AoS to - // SoA. The three possibilities are if semantics is -1 (area), -5 (pointer), - // or 0-999 (ion variables). Note that pdata has a layout and the - // type block in nt.data into which it indexes, has a layout. - for (tml = nt.tml; tml; tml = tml->next) { - int type = tml->index; - int layout = nrn_mech_data_layout_[type]; - int* pdata = tml->ml->pdata; - int cnt = tml->ml->nodecount; - int szdp = nrn_prop_dparam_size_[type]; - int* semantics = memb_func[type].dparam_semantics; - - // ignore ARTIFICIAL_CELL (has useless area pointer with semantics=-1) - if (nrn_is_artificial_[type]) { continue; } - - if( szdp ) { - if(!semantics) continue; // temporary for HDFReport, Binreport which will be skipped in bbcore_write of HBPNeuron - nrn_assert(semantics); - } + // matrix info + nt._v_parent_index = (int*)coreneuron::ecalloc_align(nt.end, NRN_SOA_BYTE_ALIGN, sizeof(int)); + ; + F.read_array(nt._v_parent_index, nt.end); + + F.read_array(nt._actual_a, nt.end); + F.read_array(nt._actual_b, nt.end); + F.read_array(nt._actual_area, nt.end); + F.read_array(nt._actual_v, nt.end); - for (int i=0; i < szdp; ++i) { - int s = semantics[i]; - if (s == -1) { // area - int area0 = nt._actual_area - nt._data; - for (int iml=0; iml < cnt; ++iml) { - int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout); - int ix = *pd - (5*nt.end); // unpadded area is 6th vector from beginning - nrn_assert((ix >= 0) && (ix < nt.end)); - *pd = area0 + ix; + Memb_list** mlmap = new Memb_list*[n_memb_func]; + int synoffset = 0; + int* pnt_offset = new int[n_memb_func]; + + // All the mechanism data and pdata. 
+ // Also fill in mlmap and pnt_offset + // Complete spec of Point_process except for the acell presyn_ field. + for (tml = nt.tml; tml; tml = tml->next) { + int type = tml->index; + Memb_list* ml = tml->ml; + mlmap[type] = ml; + int is_art = nrn_is_artificial_[type]; + int n = ml->nodecount; + int szp = nrn_prop_param_size_[type]; + int szdp = nrn_prop_dparam_size_[type]; + + if (!is_art) { + ml->nodeindices = + (int*)coreneuron::ecalloc_align(ml->nodecount, NRN_SOA_BYTE_ALIGN, sizeof(int)); + F.read_array(ml->nodeindices, ml->nodecount); + } else { + ml->nodeindices = NULL; } - }else if (s == -5) { //pointer assumes a pointer to membrane voltage - int v0 = nt._actual_v - nt._data; - for (int iml=0; iml < cnt; ++iml) { - int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout); - int ix = *pd - (4*nt.end); // unpadded voltage is 5th vector from beginning - nrn_assert((ix >= 0) && (ix < nt.end)); - *pd = v0 + ix; + + int layout = nrn_mech_data_layout_[type]; + mech_layout(F, ml->data, n, szp, layout); + + if (szdp) { + ml->pdata = (int*)coreneuron::ecalloc_align(nrn_soa_padded_size(n, layout) * szdp, + NRN_SOA_BYTE_ALIGN, sizeof(int)); + mech_layout(F, ml->pdata, n, szdp, layout); + } else { + ml->pdata = NULL; } - }else if (s >=0 && s < 1000) { //ion - int etype = s; - int elayout = nrn_mech_data_layout_[etype]; - if (elayout == 1) { continue; } /* ion is AoS so nothing to do */ - assert(elayout == 0); - /* ion is SoA so must recalculate pdata values */ - Memb_list* eml = mlmap[etype]; - int edata0 = eml->data - nt._data; - int unpadded_edata0 = unpadded_ml_list[etype].data - (double*)0; - int ecnt = eml->nodecount; - int esz = nrn_prop_param_size_[etype]; - for (int iml=0; iml < cnt; ++iml) { - int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout); - int ix = *pd - unpadded_edata0; - nrn_assert((ix >= 0) && (ix < ecnt*esz)); - /* Original pd order assumed ecnt groups of esz */ - *pd = edata0 + nrn_param_layout(ix, etype, eml); + if (pnt_map[type] > 0) { // POINT_PROCESS mechanism including acell + int cnt = ml->nodecount; + Point_process* pnt = NULL; + pnt = nt.pntprocs + synoffset; + pnt_offset[type] = synoffset; + synoffset += cnt; + for (int i = 0; i < cnt; ++i) { + Point_process* pp = pnt + i; + pp->_type = type; + pp->_i_instance = i; + nt._vdata[ml->pdata[nrn_i_layout(i, cnt, 1, szdp, layout)]] = pp; + pp->_tid = nt.id; + } } - } } - } - // unpadded_ml_list no longer needed - free(unpadded_ml_list); - /* here we setup the mechanism dependencies. if there is a mechanism dependency - * then we allocate an array for tml->dependencies otherwise set it to NULL. - * In order to find out the "real" dependencies i.e. dependent mechanism - * exist at the same compartment, we compare the nodeindices of mechanisms - * returned by nrn_mech_depend. - */ + if (nrn_have_gaps == 1) { + nrn_partrans::gap_thread_setup(nt); + } - /* temporary array for dependencies */ - int* mech_deps = (int*)ecalloc(n_memb_func, sizeof(int)); + // Some pdata may index into data which has been reordered from AoS to + // SoA. The three possibilities are if semantics is -1 (area), -5 (pointer), + // or 0-999 (ion variables). Note that pdata has a layout and the + // type block in nt.data into which it indexes, has a layout. 
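The rebasing loop that follows leans on nrn_i_layout() to translate "instance icnt, field isz" into a flat offset: AoS keeps the fields of one instance adjacent (icnt * sz + isz), while SoA stores one padded column per field (icnt + isz * padded_cnt). A tiny sketch of the two formulas; the real nrn_i_layout() takes the raw count and derives the padded count itself, here it is passed in directly for brevity:

    #include <cassert>

    // Same index formulas as nrn_i_layout(): layout 1 = AoS, layout 0 = SoA.
    int i_layout(int icnt, int padded_cnt, int isz, int sz, int layout) {
        if (layout == 1)
            return icnt * sz + isz;          // AoS: fields of one instance adjacent
        return icnt + isz * padded_cnt;      // SoA: one padded column per field
    }

    int main() {
        const int padded = 8, sz = 3;        // 6 instances padded to 8, 3 fields each
        // Field 2 of instance 4:
        assert(i_layout(4, padded, 2, sz, 1) == 14);  // AoS: 4*3 + 2
        assert(i_layout(4, padded, 2, sz, 0) == 20);  // SoA: 4 + 2*8
        return 0;
    }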
+ for (tml = nt.tml; tml; tml = tml->next) { + int type = tml->index; + int layout = nrn_mech_data_layout_[type]; + int* pdata = tml->ml->pdata; + int cnt = tml->ml->nodecount; + int szdp = nrn_prop_dparam_size_[type]; + int* semantics = memb_func[type].dparam_semantics; + + // ignore ARTIFICIAL_CELL (has useless area pointer with semantics=-1) + if (nrn_is_artificial_[type]) { + continue; + } - for (tml = nt.tml; tml; tml = tml->next) { + if (szdp) { + if (!semantics) + continue; // temporary for HDFReport, Binreport which will be skipped in + // bbcore_write of HBPNeuron + nrn_assert(semantics); + } - /* initialize to null */ - tml->dependencies = NULL; - tml->ndependencies = 0; + for (int i = 0; i < szdp; ++i) { + int s = semantics[i]; + if (s == -1) { // area + int area0 = nt._actual_area - nt._data; + for (int iml = 0; iml < cnt; ++iml) { + int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout); + int ix = *pd - (5 * nt.end); // unpadded area is 6th vector from beginning + nrn_assert((ix >= 0) && (ix < nt.end)); + *pd = area0 + ix; + } + } else if (s == -5) { // pointer assumes a pointer to membrane voltage + int v0 = nt._actual_v - nt._data; + for (int iml = 0; iml < cnt; ++iml) { + int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout); + int ix = *pd - (4 * nt.end); // unpadded voltage is 5th vector from beginning + nrn_assert((ix >= 0) && (ix < nt.end)); + *pd = v0 + ix; + } + } else if (s >= 0 && s < 1000) { // ion + int etype = s; + int elayout = nrn_mech_data_layout_[etype]; + if (elayout == 1) { + continue; + } /* ion is AoS so nothing to do */ + assert(elayout == 0); + /* ion is SoA so must recalculate pdata values */ + Memb_list* eml = mlmap[etype]; + int edata0 = eml->data - nt._data; + int unpadded_edata0 = unpadded_ml_list[etype].data - (double*)0; + int ecnt = eml->nodecount; + int esz = nrn_prop_param_size_[etype]; + for (int iml = 0; iml < cnt; ++iml) { + int* pd = pdata + nrn_i_layout(iml, cnt, i, szdp, layout); + int ix = *pd - unpadded_edata0; + nrn_assert((ix >= 0) && (ix < ecnt * esz)); + /* Original pd order assumed ecnt groups of esz */ + *pd = edata0 + nrn_param_layout(ix, etype, eml); + } + } + } + } + // unpadded_ml_list no longer needed + free(unpadded_ml_list); + + /* if desired, apply the node permutation. This involves permuting + at least the node parameter arrays for a, b, and area and all + integer vector values that index into nodes. This could have been done + when originally filling the arrays with AoS ordered data, but can also + be done now, after the SoA transformation. The latter has the advantage + that the present order is consistent with all the layout values. Note + that after this portion of the permutation, a number of other node index + vectors will be read and will need to be permuted as well in subsequent + sections of this function. 
+ */ + if (use_interleave_permute) { + nt._permute = interleave_order(nt.id, nt.ncell, nt.end, nt._v_parent_index); + } + if (nt._permute) { + int* p = nt._permute; + permute_data(nt._actual_a, nt.end, p); + permute_data(nt._actual_b, nt.end, p); + permute_data(nt._actual_area, nt.end, p); - /* get dependencies from the models */ - int deps_cnt = nrn_mech_depend(tml->index, mech_deps); + // index values change as well as ordering + permute_ptr(nt._v_parent_index, nt.end, p); + node_permute(nt._v_parent_index, nt.end, p); - /* if dependencies, setup dependency array */ - if(deps_cnt) { +#if 0 +for (int i=0; i < nt.end; ++i) { + printf("parent[%d] = %d\n", i, nt._v_parent_index[i]); +} +#endif - /* store "real" dependencies in the vector */ - std::vector actual_mech_deps; + // specify the ml->_permute and sort the nodeindices + for (tml = nt.tml; tml; tml = tml->next) { + if (tml->ml->nodeindices) { // not artificial + permute_nodeindices(tml->ml, p); + } + } - Memb_list *ml = tml->ml; - int* nodeindices = ml->nodeindices; + // permute mechanism data, pdata (and values) + for (tml = nt.tml; tml; tml = tml->next) { + if (tml->ml->nodeindices) { // not artificial + permute_ml(tml->ml, tml->index, nt); + } + } - /* iterate over dependencies */ - for(int j=0; j_permute) { + pp._i_instance = ml->_permute[pp._i_instance]; + } + } + } - /* memb_list of dependency mechanism */ - Memb_list *dml = nt._ml_list[mech_deps[j]]; + if (nrn_have_gaps == 1 && use_interleave_permute) { + nrn_partrans::gap_indices_permute(nt); + } - /* dependency mechanism may not exist in the model */ - if(!dml) - continue; + /* here we setup the mechanism dependencies. if there is a mechanism dependency + * then we allocate an array for tml->dependencies otherwise set it to NULL. + * In order to find out the "real" dependencies i.e. dependent mechanism + * exist at the same compartment, we compare the nodeindices of mechanisms + * returned by nrn_mech_depend. 
+ */ - /* take nodeindices for comparison */ - int* dnodeindices = dml->nodeindices; + /* temporary array for dependencies */ + int* mech_deps = (int*)ecalloc(n_memb_func, sizeof(int)); - /* set_intersection function needs temp vector to push the common values */ - std::vector node_intersection; + for (tml = nt.tml; tml; tml = tml->next) { + /* initialize to null */ + tml->dependencies = NULL; + tml->ndependencies = 0; + + /* get dependencies from the models */ + int deps_cnt = nrn_mech_depend(tml->index, mech_deps); + + /* if dependencies, setup dependency array */ + if (deps_cnt) { + /* store "real" dependencies in the vector */ + std::vector actual_mech_deps; + + Memb_list* ml = tml->ml; + int* nodeindices = ml->nodeindices; + + /* iterate over dependencies */ + for (int j = 0; j < deps_cnt; j++) { + /* memb_list of dependency mechanism */ + Memb_list* dml = nt._ml_list[mech_deps[j]]; + + /* dependency mechanism may not exist in the model */ + if (!dml) + continue; + + /* take nodeindices for comparison */ + int* dnodeindices = dml->nodeindices; + + /* set_intersection function needs temp vector to push the common values */ + std::vector node_intersection; + + /* make sure they have non-zero nodes and find their intersection */ + if ((ml->nodecount > 0) && (dml->nodecount > 0)) { + std::set_intersection(nodeindices, nodeindices + ml->nodecount, dnodeindices, + dnodeindices + dml->nodecount, + std::back_inserter(node_intersection)); + } + + /* if they intersect in the nodeindices, it's real dependency */ + if (!node_intersection.empty()) { + actual_mech_deps.push_back(mech_deps[j]); + } + } - /* make sure they have non-zero nodes and find their intersection */ - if( (ml->nodecount > 0) && (dml->nodecount > 0)) { - std::set_intersection(nodeindices, nodeindices + ml->nodecount, - dnodeindices, dnodeindices + dml->nodecount, - std::back_inserter(node_intersection)); + /* copy actual_mech_deps to dependencies */ + if (!actual_mech_deps.empty()) { + tml->ndependencies = actual_mech_deps.size(); + tml->dependencies = (int*)ecalloc(actual_mech_deps.size(), sizeof(int)); + memcpy(tml->dependencies, &actual_mech_deps[0], + sizeof(int) * actual_mech_deps.size()); } + } + } + + /* free temp dependency array */ + free(mech_deps); - /* if they intersect in the nodeindices, it's real dependency */ - if(!node_intersection.empty()) { - actual_mech_deps.push_back(mech_deps[j]); + /// Fill the BA lists + BAMech** bamap = new BAMech*[n_memb_func]; + for (int i = 0; i < BEFORE_AFTER_SIZE; ++i) { + BAMech* bam; + NrnThreadBAList *tbl, **ptbl; + for (int ii = 0; ii < n_memb_func; ++ii) { + bamap[ii] = (BAMech*)0; + } + for (bam = bamech_[i]; bam; bam = bam->next) { + bamap[bam->type] = bam; + } + /* unnecessary but keep in order anyway */ + ptbl = nt.tbl + i; + for (tml = nt.tml; tml; tml = tml->next) { + if (bamap[tml->index]) { + Memb_list* ml = tml->ml; + tbl = (NrnThreadBAList*)emalloc(sizeof(NrnThreadBAList)); + tbl->next = (NrnThreadBAList*)0; + tbl->bam = bamap[tml->index]; + tbl->ml = ml; + *ptbl = tbl; + ptbl = &(tbl->next); } } + } + delete[] bamap; - /* copy actual_mech_deps to dependencies */ - if(!actual_mech_deps.empty()) { - tml->ndependencies = actual_mech_deps.size(); - tml->dependencies = (int*)ecalloc(actual_mech_deps.size(), sizeof(int)); - memcpy(tml->dependencies, &actual_mech_deps[0], sizeof(int)*actual_mech_deps.size()); + // from nrn_has_net_event create pnttype2presyn. 
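Aside for readers (illustrative, not from the patch): the dependency filter above amounts to intersecting two sorted node-index lists; a dependency is kept only when the two mechanisms share at least one compartment. A self-contained sketch of that test, with invented node lists (both inputs must be sorted, as std::set_intersection requires):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

// true if the two mechanisms coexist on at least one compartment,
// given their sorted node index lists
static bool coexist_on_some_node(const std::vector<int>& a, const std::vector<int>& b) {
    std::vector<int> common;
    std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(common));
    return !common.empty();
}

int main() {
    std::vector<int> hh_nodes  = {0, 1, 2, 7};
    std::vector<int> ion_nodes = {2, 3, 4};
    std::vector<int> other     = {5, 6};
    assert(coexist_on_some_node(hh_nodes, ion_nodes));  // share node 2 -> real dependency
    assert(!coexist_on_some_node(hh_nodes, other));     // disjoint -> ignored
    return 0;
}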
+ pnttype2presyn = (int*)ecalloc(n_memb_func, sizeof(int)); + for (int i = 0; i < n_memb_func; ++i) { + pnttype2presyn[i] = -1; + } + for (int i = 0; i < nrn_has_net_event_cnt_; ++i) { + pnttype2presyn[nrn_has_net_event_[i]] = i; + } + // create the nt.pnt2presyn_ix array of arrays. + nt.pnt2presyn_ix = (int**)ecalloc(nrn_has_net_event_cnt_, sizeof(int*)); + for (int i = 0; i < nrn_has_net_event_cnt_; ++i) { + Memb_list* ml = nt._ml_list[nrn_has_net_event_[i]]; + if (ml && ml->nodecount > 0) { + nt.pnt2presyn_ix[i] = (int*)ecalloc(ml->nodecount, sizeof(int)); } } - } - /* free temp dependency array */ - free(mech_deps); + // Real cells are at the beginning of the nt.presyns followed by + // acells (with and without gids mixed together) + // Here we associate the real cells with voltage pointers and + // acell PreSyn with the Point_process. + // nt.presyns order same as output_vindex order + int* output_vindex = F.read_array(nt.n_presyn); + if (nt._permute) { + // only indices >= 0 (i.e. _actual_v indices) will be changed. + node_permute(output_vindex, nt.n_presyn, nt._permute); + } + double* output_threshold = F.read_array(nt.ncell); + for (int i = 0; i < nt.n_presyn; ++i) { // real cells + PreSyn* ps = nt.presyns + i; + + int ix = output_vindex[i]; + if (ix < 0) { + ix = -ix; + int index = ix / 1000; + int type = ix - index * 1000; + Point_process* pnt = nt.pntprocs + (pnt_offset[type] + index); + ps->pntsrc_ = pnt; + // pnt->_presyn = ps; + int ip2ps = pnttype2presyn[pnt->_type]; + if (ip2ps >= 0) { + nt.pnt2presyn_ix[ip2ps][pnt->_i_instance] = i; + } + if (ps->gid_ < 0) { + ps->gid_ = -1; + } + } else { + assert(ps->gid_ > -1); + ps->thvar_index_ = ix; // index into _actual_v + assert(ix < nt.end); + ps->threshold_ = output_threshold[i]; + } + } + delete[] output_vindex; + delete[] output_threshold; + + // initial net_send_buffer size about 1% of number of presyns + // nt._net_send_buffer_size = nt.ncell/100 + 1; + // but, to avoid reallocation complexity on GPU ... + nt._net_send_buffer_size = nt.ncell; + nt._net_send_buffer = (int*)ecalloc(nt._net_send_buffer_size, sizeof(int)); + + // do extracon later as the target and weight info + // is not directly in the file + int nnetcon = nt.n_netcon - nrn_setup_extracon; + + int nweight = nt.n_weight; + // printf("nnetcon=%d nweight=%d\n", nnetcon, nweight); + // it may happen that Point_process structures will be made unnecessary + // by factoring into NetCon. + + // Make NetCon.target_ point to proper Point_process. Only the NetCon + // with pnttype[i] > 0 have a target. + int* pnttype = F.read_array(nnetcon); + int* pntindex = F.read_array(nnetcon); + for (int i = 0; i < nnetcon; ++i) { + int type = pnttype[i]; + if (type > 0) { + int index = pnt_offset[type] + pntindex[i]; /// Potentially uninitialized pnt_offset[], + /// check for previous assignments + Point_process* pnt = nt.pntprocs + index; + NetCon& nc = nt.netcons[i]; + nc.target_ = pnt; + nc.active_ = true; + } + } - /// Fill the BA lists - BAMech** bamap = new BAMech*[n_memb_func]; - for (int i=0; i < BEFORE_AFTER_SIZE; ++i) { - BAMech* bam; - NrnThreadBAList* tbl, **ptbl; - for (int ii=0; ii < n_memb_func; ++ii) { - bamap[ii] = (BAMech*)0; + int extracon_target_type = -1; + int extracon_target_nweight = 0; + if (nrn_setup_extracon > 0) { + // Fill in the extracon target_ and active_. + // Simplistic. + // Rotate through the pntindex and use only pnttype for ProbAMPANMDA_EMS + // (which happens to have a weight vector length of 5.) 
+ // Edge case: if there is no such synapse, let the target_ be NULL + // and the netcon be inactive. + // Same pattern as algorithm for extracon netcon_srcgid above in phase1. + extracon_target_type = nrn_get_mechtype("ProbAMPANMDA_EMS"); + assert(extracon_target_type > 0); + extracon_target_nweight = pnt_receive_size[extracon_target_type]; + int j = 0; + for (int i = 0; i < nrn_setup_extracon; ++i) { + int active = 0; + for (int k = 0; k < nnetcon; ++k) { + if (pnttype[j] == extracon_target_type) { + active = 1; + break; + } + j = (j + 1) % nnetcon; + } + NetCon& nc = nt.netcons[i + nnetcon]; + nc.active_ = active; + if (active) { + nc.target_ = nt.pntprocs + (pnt_offset[extracon_target_type] + pntindex[j]); + } else { + nc.target_ = NULL; + } + } } - for (bam = bamech_[i]; bam; bam = bam->next) { - bamap[bam->type] = bam; + + delete[] pntindex; + + // weights in netcons order in groups defined by Point_process target type. + nt.n_weight += nrn_setup_extracon * extracon_target_nweight; + nt.weights = new double[nt.n_weight]; + F.read_array(nt.weights, nweight); + + int iw = 0; + for (int i = 0; i < nnetcon; ++i) { + NetCon& nc = nt.netcons[i]; + nc.u.weight_index_ = iw; + iw += pnt_receive_size[pnttype[i]]; } - /* unnecessary but keep in order anyway */ - ptbl = nt.tbl + i; - for (tml = nt.tml; tml; tml = tml->next) { - if (bamap[tml->index]) { - Memb_list* ml = tml->ml; - tbl = (NrnThreadBAList*)emalloc(sizeof(NrnThreadBAList)); - tbl->next = (NrnThreadBAList*)0; - tbl->bam = bamap[tml->index]; - tbl->ml = ml; - *ptbl = tbl; - ptbl = &(tbl->next); - } + assert(iw == nweight); + delete[] pnttype; + + // delays in netcons order + double* delay = F.read_array(nnetcon); + for (int i = 0; i < nnetcon; ++i) { + NetCon& nc = nt.netcons[i]; + nc.delay_ = delay[i]; } - } - delete [] bamap; - - // Real cells are at the beginning of the nt.presyns followed by - // acells (with and without gids mixed together) - // Here we associate the real cells with voltage pointers and - // acell PreSyn with the Point_process. - //nt.presyns order same as output_vindex order - int* output_vindex = F.read_array(nt.n_presyn); - double* output_threshold = F.read_array(nt.ncell); - for (int i=0; i < nt.n_presyn; ++i) { // real cells - PreSyn* ps = nt.presyns + i; - int ix = output_vindex[i]; - if (ix < 0) { - ix = -ix; - int index = ix/1000; - int type = ix - index*1000; - Point_process* pnt = nt.pntprocs + (pnt_offset[type] + index); - ps->pntsrc_ = pnt; - pnt->_presyn = ps; - if (ps->gid_ < 0) { - ps->gid_ = -1; - } - }else{ - assert(ps->gid_ > -1); - ps->thvar_ = nt._actual_v + ix; - assert (ix < nt.end); - ps->threshold_ = output_threshold[i]; + delete[] delay; + + if (nrn_setup_extracon > 0) { + // simplistic. delay is 1 and weight is 0.001 + for (int i = 0; i < nrn_setup_extracon; ++i) { + NetCon& nc = nt.netcons[nnetcon + i]; + nc.delay_ = 1.0; + nc.u.weight_index_ = nweight + i * extracon_target_nweight; + nt.weights[nc.u.weight_index_] = 2.0; // this value 2.0 is extracted from .dat files + } } - } - delete [] output_vindex; - delete [] output_threshold; - - int nnetcon = nt.n_netcon; - int nweight = nt.n_weight; -//printf("nnetcon=%d nweight=%d\n", nnetcon, nweight); - // it may happen that Point_process structures will be made unnecessary - // by factoring into NetCon. - - // Make NetCon.target_ point to proper Point_process. Only the NetCon - // with pnttype[i] > 0 have a target. 
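Aside for readers (illustrative, not from the patch): the weight handling above replaces per-NetCon weight pointers by an index into one flat weights array, with each connection reserving pnt_receive_size[target type] consecutive slots. A sketch of that layout with invented receive sizes:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
    // invented receive-argument counts per target type
    // (e.g. a 1-weight synapse type and a 5-weight synapse type)
    int pnt_receive_size[3] = {0, 1, 5};
    std::vector<int> pnttype = {1, 2, 1, 2};   // target type of each NetCon

    std::vector<int> weight_index(pnttype.size());
    int iw = 0;
    for (std::size_t i = 0; i < pnttype.size(); ++i) {
        weight_index[i] = iw;                  // first slot of NetCon i
        iw += pnt_receive_size[pnttype[i]];    // reserve its group of slots
    }
    std::vector<double> weights(iw, 0.0);      // the single flat array

    assert(weight_index[0] == 0);
    assert(weight_index[1] == 1);
    assert(weight_index[2] == 6);
    assert(weight_index[3] == 7);
    assert(weights.size() == 12);
    return 0;
}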
- int* pnttype = F.read_array(nnetcon); - int* pntindex = F.read_array(nnetcon); - for (int i=0; i < nnetcon; ++i) { - int type = pnttype[i]; - if (type > 0) { - int index = pnt_offset[type] + pntindex[i]; /// Potentially uninitialized pnt_offset[], check for previous assignments - Point_process* pnt = nt.pntprocs + index; - NetCon& nc = nt.netcons[i]; - nc.target_ = pnt; - nc.active_ = true; + + // BBCOREPOINTER information + npnt = F.read_int(); + for (int i = 0; i < npnt; ++i) { + int type = F.read_int(); + assert(nrn_bbcore_read_[type]); + int icnt = F.read_int(); + int dcnt = F.read_int(); + int* iArray = NULL; + double* dArray = NULL; + if (icnt) { + iArray = F.read_array(icnt); + } + if (dcnt) { + dArray = F.read_array(dcnt); + } + int ik = 0; + int dk = 0; + Memb_list* ml = mlmap[type]; + int dsz = nrn_prop_param_size_[type]; + int pdsz = nrn_prop_dparam_size_[type]; + int cntml = ml->nodecount; + int layout = nrn_mech_data_layout_[type]; + for (int j = 0; j < cntml; ++j) { + int jp = j; + if (ml->_permute) { + jp = ml->_permute[j]; + } + double* d = ml->data; + Datum* pd = ml->pdata; + d += nrn_i_layout(jp, cntml, 0, dsz, layout); + pd += nrn_i_layout(jp, cntml, 0, pdsz, layout); + int aln_cntml = nrn_soa_padded_size(cntml, layout); + (*nrn_bbcore_read_[type])(dArray, iArray, &dk, &ik, 0, aln_cntml, d, pd, ml->_thread, + &nt, 0.0); + } + assert(dk == dcnt); + assert(ik == icnt); + if (ik) { + delete[] iArray; + } + if (dk) { + delete[] dArray; + } } - } - delete [] pntindex; - delete [] pnt_offset; - - // weights in netcons order in groups defined by Point_process target type. - nt.weights = F.read_array(nweight); - int iw = 0; - for (int i=0; i < nnetcon; ++i) { - NetCon& nc = nt.netcons[i]; - nc.u.weight_ = nt.weights + iw; - iw += pnt_receive_size[pnttype[i]]; - } - assert(iw == nweight); - delete [] pnttype; - - // delays in netcons order - double* delay = F.read_array(nnetcon); - for (int i=0; i < nnetcon; ++i) { - NetCon& nc = nt.netcons[i]; - nc.delay_ = delay[i]; - } - delete [] delay; - - // BBCOREPOINTER information - npnt = F.read_int(); - for (int i=0; i < npnt; ++i) { - int type = F.read_int(); - assert(nrn_bbcore_read_[type]); - int icnt = F.read_int(); - int dcnt = F.read_int(); - int* iArray = NULL; - double* dArray = NULL; - if (icnt) - { - iArray = F.read_array(icnt); + delete[] mlmap; + + // VecPlayContinuous instances + // No attempt at memory efficiency + int n = F.read_int(); + nt.n_vecplay = n; + if (n) { + nt._vecplay = new void*[n]; + } else { + nt._vecplay = NULL; } - if (dcnt) - { - dArray = F.read_array(dcnt); + for (int i = 0; i < n; ++i) { + int vtype = F.read_int(); + nrn_assert(vtype == VecPlayContinuousType); + int mtype = F.read_int(); + Memb_list* ml = nt._ml_list[mtype]; + int ix = F.read_int(); + int sz = F.read_int(); + IvocVect* yvec = vector_new1(sz); + F.read_array(vector_vec(yvec), sz); + IvocVect* tvec = vector_new1(sz); + F.read_array(vector_vec(tvec), sz); + ix = nrn_param_layout(ix, mtype, ml); + if (ml->_permute) { + ix = nrn_index_permute(ix, mtype, ml); + } + nt._vecplay[i] = new VecPlayContinuous(ml->data + ix, yvec, tvec, NULL, nt.id); } - int ik = 0; - int dk = 0; - Memb_list* ml = mlmap[type]; - int dsz = nrn_prop_param_size_[type]; - int pdsz = nrn_prop_dparam_size_[type]; - int cntml = ml->nodecount; - int layout = nrn_mech_data_layout_[type]; - for (int j=0; j < cntml; ++j) { - double* d = ml->data; - Datum* pd = ml->pdata; - d += nrn_i_layout(j, cntml, 0, dsz, layout); - pd += nrn_i_layout(j, cntml, 0, pdsz, layout); - int 
aln_cntml = nrn_soa_padded_size(cntml, layout); - (*nrn_bbcore_read_[type])(dArray, iArray, &dk, &ik, 0, aln_cntml, d, pd, ml->_thread, &nt, 0.0); + + // NetReceiveBuffering + for (int i = 0; i < net_buf_receive_cnt_; ++i) { + int type = net_buf_receive_type_[i]; + // Does this thread have this type. + Memb_list* ml = nt._ml_list[type]; + if (ml) { // needs a NetReceiveBuffer + NetReceiveBuffer_t* nrb = (NetReceiveBuffer_t*)ecalloc(1, sizeof(NetReceiveBuffer_t)); + ml->_net_receive_buffer = nrb; + nrb->_pnt_offset = pnt_offset[type]; + + // begin with a size of 5% of the number of instances + nrb->_size = ml->nodecount; + // or at least 8 + if (nrb->_size < 8) { + nrb->_size = 8; + } + // but not more than nodecount + if (nrb->_size > ml->nodecount) { + nrb->_size = ml->nodecount; + } + + nrb->_pnt_index = (int*)ecalloc(nrb->_size, sizeof(int)); + nrb->_displ = (int*)ecalloc(nrb->_size + 1, sizeof(int)); + nrb->_nrb_index = (int*)ecalloc(nrb->_size, sizeof(int)); + nrb->_weight_index = (int*)ecalloc(nrb->_size, sizeof(int)); + nrb->_nrb_t = (double*)ecalloc(nrb->_size, sizeof(double)); + nrb->_nrb_flag = (double*)ecalloc(nrb->_size, sizeof(double)); + } } - assert(dk == dcnt); - assert(ik == icnt); - if (ik) - { - delete [] iArray; + + // NetSendBuffering + for (int i = 0; i < net_buf_send_cnt_; ++i) { + int type = net_buf_send_type_[i]; + // Does this thread have this type. + Memb_list* ml = nt._ml_list[type]; + if (ml) { // needs a NetSendBuffer + NetSendBuffer_t* nsb = (NetSendBuffer_t*)ecalloc(1, sizeof(NetSendBuffer_t)); + ml->_net_send_buffer = nsb; + + // begin with a size equal to twice number of instances + // at present there is no provision for dynamically increasing this. + nsb->_size = ml->nodecount * 2; + nsb->_cnt = 0; + + nsb->_sendtype = (int*)ecalloc(nsb->_size, sizeof(int)); + nsb->_vdata_index = (int*)ecalloc(nsb->_size, sizeof(int)); + nsb->_pnt_index = (int*)ecalloc(nsb->_size, sizeof(int)); + nsb->_weight_index = (int*)ecalloc(nsb->_size, sizeof(int)); + // when == 1, NetReceiveBuffer_t is newly allocated (i.e. 
we need to free previous copy + // and recopy new data + nsb->reallocated = 1; + nsb->_nsb_t = (double*)ecalloc(nsb->_size, sizeof(double)); + nsb->_nsb_flag = (double*)ecalloc(nsb->_size, sizeof(double)); + } } - if (dk) - { - delete [] dArray; + + delete[] pnt_offset; +} + +/** read mapping information for neurons */ +void read_phase3(data_reader& F, int imult, NrnThread& nt) { + (void)imult; + + /** mapping information for all neurons in single NrnThread */ + NeuronGroupMappingInfo* nrngroup_map = new NeuronGroupMappingInfo(); + + /** total compartments in this NrnThread */ + int total_compartment = 0; + + /** for every neuron */ + for (int i = 0; i < nt.ncell; i++) { + int gid, seg, soma, axon, dend, apical, compartment; + + // read counts + F.read_mapping_count(&gid, &seg, &soma, &axon, &dend, &apical, &compartment); + + NeuronMappingInfo nmap(gid, seg, soma, axon, dend, apical, compartment); + + // read all section-segment mapping + F.read_mapping_info(&nmap, seg); + + // add mapping info for current gid + nrngroup_map->add_neuron_mapping_info(nmap); + total_compartment += compartment; } - } - delete [] mlmap; - - // VecPlayContinuous instances - // No attempt at memory efficiency - int n = F.read_int(); - nt.n_vecplay = n; - if (n) { - nt._vecplay = new void*[n]; - }else{ - nt._vecplay = NULL; - } - for (int i=0; i < n; ++i) { - int vtype = F.read_int(); - nrn_assert(vtype == VecPlayContinuousType); - int mtype = F.read_int(); - Memb_list* ml = nt._ml_list[mtype]; - int ix = F.read_int(); - int sz = F.read_int(); - IvocVect* yvec = vector_new(sz); - F.read_array(vector_vec(yvec), sz); - IvocVect* tvec = vector_new(sz); - F.read_array(vector_vec(tvec), sz); - ix = nrn_param_layout(ix, mtype, ml); - nt._vecplay[i] = new VecPlayContinuous(ml->data + ix, yvec, tvec, NULL, nt.id); - } + + // sum of compartments for all neurons in mapping file should be + // equal to no of compartments from dataset + nrn_assert(total_compartment == nt.end); + + // no of cells should match + nrn_assert(nrngroup_map->count() == nt.ncell); + + // set point in NrnThread + nt.mapping = (void*)nrngroup_map; } static size_t memb_list_size(NrnThreadMembList* tml) { - size_t sz_ntml = sizeof(NrnThreadMembList); - size_t sz_ml = sizeof(Memb_list); - size_t szi = sizeof(int); - size_t nbyte = sz_ntml + sz_ml; - nbyte += tml->ml->nodecount*szi; - nbyte += nrn_prop_dparam_size_[tml->index]*tml->ml->nodecount*sizeof(Datum); + size_t sz_ntml = sizeof(NrnThreadMembList); + size_t sz_ml = sizeof(Memb_list); + size_t szi = sizeof(int); + size_t nbyte = sz_ntml + sz_ml; + nbyte += tml->ml->nodecount * szi; + nbyte += nrn_prop_dparam_size_[tml->index] * tml->ml->nodecount * sizeof(Datum); #ifdef DEBUG - int i = tml->index; - printf("%s %d psize=%d ppsize=%d cnt=%d nbyte=%ld\n", memb_func[i].sym, i, nrn_prop_param_size_[i], nrn_prop_dparam_size_[i], tml->ml->nodecount, nbyte); + int i = tml->index; + printf("%s %d psize=%d ppsize=%d cnt=%d nbyte=%ld\n", memb_func[i].sym, i, + nrn_prop_param_size_[i], nrn_prop_dparam_size_[i], tml->ml->nodecount, nbyte); #endif - return nbyte; + return nbyte; } -size_t model_size(void) { - size_t nbyte = 0; - size_t szd = sizeof(double); - size_t szi = sizeof(int); - size_t szv = sizeof(void*); - size_t sz_th = sizeof(NrnThread); - size_t sz_ps = sizeof(PreSyn); - size_t sz_pp = sizeof(Point_process); - size_t sz_nc = sizeof(NetCon); - size_t sz_psi = sizeof(InputPreSyn); - NrnThreadMembList* tml; - size_t nccnt = 0; - - for (int i=0; i < nrn_nthread; ++i) { - NrnThread& nt = nrn_threads[i]; - 
size_t nb_nt = 0; // per thread - nccnt += nt.n_netcon; - - // Memb_list size - int nmech = 0; - for (tml=nt.tml; tml; tml = tml->next) { - nb_nt += memb_list_size(tml); - ++nmech; +/// Approximate count of number of bytes for the gid2out map +size_t output_presyn_size(void) { + if (gid2out.empty()) { + return 0; + } + size_t nbyte = + sizeof(gid2out) + sizeof(int) * gid2out.size() + sizeof(PreSyn*) * gid2out.size(); +#ifdef DEBUG + printf(" gid2out table bytes=~%ld size=%d\n", nbyte, gid2out.size()); +#endif + return nbyte; +} + +size_t input_presyn_size(void) { + if (gid2in.empty()) { + return 0; } + size_t nbyte = + sizeof(gid2in) + sizeof(int) * gid2in.size() + sizeof(InputPreSyn*) * gid2in.size(); +#ifdef DEBUG + printf(" gid2in table bytes=~%ld size=%d\n", nbyte, gid2in->size()); +#endif + return nbyte; +} - // basic thread size includes mechanism data and G*V=I matrix - nb_nt += sz_th; - nb_nt += nt._ndata*szd + nt._nidata*szi + nt._nvdata*szv; - nb_nt += nt.end*szi; // _v_parent_index +size_t model_size(void) { + size_t nbyte = 0; + size_t szd = sizeof(double); + size_t szi = sizeof(int); + size_t szv = sizeof(void*); + size_t sz_th = sizeof(NrnThread); + size_t sz_ps = sizeof(PreSyn); + size_t sz_psi = sizeof(InputPreSyn); + size_t sz_nc = sizeof(NetCon); + size_t sz_pp = sizeof(Point_process); + NrnThreadMembList* tml; + size_t nccnt = 0; + + for (int i = 0; i < nrn_nthread; ++i) { + NrnThread& nt = nrn_threads[i]; + size_t nb_nt = 0; // per thread + nccnt += nt.n_netcon; + + // Memb_list size + int nmech = 0; + for (tml = nt.tml; tml; tml = tml->next) { + nb_nt += memb_list_size(tml); + ++nmech; + } + + // basic thread size includes mechanism data and G*V=I matrix + nb_nt += sz_th; + nb_nt += nt._ndata * szd + nt._nidata * szi + nt._nvdata * szv; + nb_nt += nt.end * szi; // _v_parent_index #ifdef DEBUG - printf("ncell=%d end=%d nmech=%d\n", nt.ncell, nt.end, nmech); - printf("ndata=%ld nidata=%ld nvdata=%ld\n", nt._ndata, nt._nidata, nt._nvdata); - printf("nbyte so far %ld\n", nb_nt); - printf("n_presyn = %d sz=%ld nbyte=%ld\n", nt.n_presyn, sz_ps, nt.n_presyn*sz_ps); - printf("n_pntproc=%d sz=%ld nbyte=%ld\n", nt.n_pntproc, sz_pp, nt.n_pntproc*sz_pp); - printf("n_netcon=%d sz=%ld nbyte=%ld\n", nt.n_netcon, sz_nc, nt.n_netcon*sz_nc); - printf("n_weight = %d\n", nt.n_weight); + printf("ncell=%d end=%d nmech=%d\n", nt.ncell, nt.end, nmech); + printf("ndata=%ld nidata=%ld nvdata=%ld\n", nt._ndata, nt._nidata, nt._nvdata); + printf("nbyte so far %ld\n", nb_nt); + printf("n_presyn = %d sz=%ld nbyte=%ld\n", nt.n_presyn, sz_ps, nt.n_presyn * sz_ps); + printf("n_input_presyn = %d sz=%ld nbyte=%ld\n", nt.n_input_presyn, sz_psi, + nt.n_input_presyn * sz_psi); + printf("n_pntproc=%d sz=%ld nbyte=%ld\n", nt.n_pntproc, sz_pp, nt.n_pntproc * sz_pp); + printf("n_netcon=%d sz=%ld nbyte=%ld\n", nt.n_netcon, sz_nc, nt.n_netcon * sz_nc); + printf("n_weight = %d\n", nt.n_weight); #endif - // spike handling - nb_nt += nt.n_pntproc*sz_pp + nt.n_netcon*sz_nc + nt.n_presyn*sz_ps - + nt.n_weight*szd; - nbyte += nb_nt; + // spike handling + nb_nt += nt.n_pntproc * sz_pp + nt.n_netcon * sz_nc + nt.n_presyn * sz_ps + + nt.n_input_presyn * sz_psi + nt.n_weight * szd; + nbyte += nb_nt; #ifdef DEBUG - printf("%d thread %d total bytes %ld\n", nrnmpi_myid, i, nb_nt); + printf("%d thread %d total bytes %ld\n", nrnmpi_myid, i, nb_nt); #endif - } + } #ifdef DEBUG - printf("%d n_inputpresyn=%d sz=%ld nbyte=%ld\n", nrnmpi_myid, n_inputpresyn_, sz_psi, n_inputpresyn_*sz_psi); - printf("%d netcon pointers %ld 
nbyte=%ld\n", nrnmpi_myid, nccnt, nccnt*sizeof(NetCon*)); + printf("%d netcon pointers %ld nbyte=%ld\n", nrnmpi_myid, nccnt, nccnt * sizeof(NetCon*)); #endif - nbyte += n_inputpresyn_*sz_psi + nccnt*sizeof(NetCon*); - nbyte += output_presyn_size(); - nbyte += input_presyn_size(); + nbyte += nccnt * sizeof(NetCon*); + nbyte += output_presyn_size(); + nbyte += input_presyn_size(); #ifdef DEBUG - printf("nrnran123 size=%ld cnt=%ld nbyte=%ld\n", nrnran123_state_size(), nrnran123_instance_count(), nrnran123_instance_count()*nrnran123_state_size()); + printf("nrnran123 size=%ld cnt=%ld nbyte=%ld\n", nrnran123_state_size(), + nrnran123_instance_count(), nrnran123_instance_count() * nrnran123_state_size()); #endif - nbyte += nrnran123_instance_count() * nrnran123_state_size(); + nbyte += nrnran123_instance_count() * nrnran123_state_size(); #ifdef DEBUG - printf("%d total bytes %ld\n", nrnmpi_myid, nbyte); + printf("%d total bytes %ld\n", nrnmpi_myid, nbyte); #endif - return nbyte; + return nbyte; } diff --git a/coreneuron/nrniv/nrn_setup.h b/coreneuron/nrniv/nrn_setup.h index 7db61ac92..c41a1ba59 100644 --- a/coreneuron/nrniv/nrn_setup.h +++ b/coreneuron/nrniv/nrn_setup.h @@ -31,77 +31,102 @@ THE POSSIBILITY OF SUCH DAMAGE. #include #include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrniv/nrn_datareader.h" static int ngroup_w; static int* gidgroups_w; +static int* imult_w; static const char* path_w; static data_reader* file_reader_w; static bool byte_swap_w; -static void read_phase1(data_reader &F,NrnThread& nt); -static void read_phase2(data_reader &F, NrnThread& nt); +static void read_phase1(data_reader& F, int imult, NrnThread& nt); +static void read_phase2(data_reader& F, int imult, NrnThread& nt); +static void read_phase3(data_reader& F, int imult, NrnThread& nt); +static void read_phasegap(data_reader& F, int imult, NrnThread& nt); static void setup_ThreadData(NrnThread& nt); +// Functions to load and clean data; +extern void nrn_init_and_load_data(int argc, char** argv, cn_input_params& input_params); +extern void nrn_cleanup(); + namespace coreneuron { /// Reading phase number. - enum phase {one=1, two}; + enum phase { one = 1, two, three, gap }; /// Get the phase number in form of the string. - template + template inline std::string getPhaseName(); - template<> - inline std::string getPhaseName(){ + template <> + inline std::string getPhaseName() { return "1"; } - template<> - inline std::string getPhaseName(){ + template <> + inline std::string getPhaseName() { return "2"; } + template <> + inline std::string getPhaseName() { + return "3"; + } + + template <> + inline std::string getPhaseName() { + return "gap"; + } /// Reading phase selector. 
-    template<phase P> -    inline void read_phase_aux(data_reader &F, NrnThread& nt); +    template <phase P> +    inline void read_phase_aux(data_reader& F, int imult, NrnThread& nt); + +    template <> +    inline void read_phase_aux<one>(data_reader& F, int imult, NrnThread& nt) { +        read_phase1(F, imult, nt); +    } -    template<> -    inline void read_phase_aux<one>(data_reader &F, NrnThread& nt){ -        read_phase1(F, nt); +    template <> +    inline void read_phase_aux<two>(data_reader& F, int imult, NrnThread& nt) { +        read_phase2(F, imult, nt); }
-    template<> -    inline void read_phase_aux<two>(data_reader &F, NrnThread& nt){ -        read_phase2(F, nt); +    template <> +    inline void read_phase_aux<three>(data_reader& F, int imult, NrnThread& nt) { +        read_phase3(F, imult, nt); }
+    template <> +    inline void read_phase_aux<gap>(data_reader& F, int imult, NrnThread& nt) { +        read_phasegap(F, imult, nt); +    } /// Reading phase wrapper for each neuron group.
-    template<phase P> -    inline void* phase_wrapper_w(NrnThread* nt){ +    template <phase P> +    inline void* phase_wrapper_w(NrnThread* nt) { int i = nt->id; char fnamebuf[1000]; if (i < ngroup_w) {
-            sd_ptr fname = sdprintf(fnamebuf, sizeof(fnamebuf), std::string("%s/%d_"+getPhaseName<P>()+".dat").c_str(), path_w, gidgroups_w[i]); -            file_reader_w[i].open(fname, byte_swap_w); -            read_phase_aux<P>(file_reader_w[i], *nt); -            file_reader_w[i].close(); -            if (P == 2) { -                setup_ThreadData(*nt); -            }
+            sd_ptr fname = sdprintf(fnamebuf, sizeof(fnamebuf), +                                    std::string("%s/%d_" + getPhaseName<P>() + ".dat").c_str(), +                                    path_w, gidgroups_w[i]); +            file_reader_w[i].open(fname, byte_swap_w); +            read_phase_aux<P>(file_reader_w[i], imult_w[i], *nt); +            file_reader_w[i].close(); +            if (P == 2) { +                setup_ThreadData(*nt); +            } } return NULL; }
-    /// Specific phase reading executed by threads. -    template<phase P> -    inline static void phase_wrapper(){ +    template <phase P> +    inline static void phase_wrapper() { nrn_multithread_job(phase_wrapper_w<P>
); } - - } #endif diff --git a/coreneuron/nrniv/nrn_stats.cpp b/coreneuron/nrniv/nrn_stats.cpp index bbd1c5371..526340113 100644 --- a/coreneuron/nrniv/nrn_stats.cpp +++ b/coreneuron/nrniv/nrn_stats.cpp @@ -34,34 +34,181 @@ THE POSSIBILITY OF SUCH DAMAGE. */ #include +#include #include "nrn_stats.h" #include "coreneuron/nrnmpi/nrnmpi.h" #include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrniv/netcvode.h" +#include "coreneuron/nrniv/partrans.h" extern int spikevec_size; +extern int* spikevec_gid; +extern NetCvode* net_cvode_instance; -const int NUM_STATS = 3; - -void report_cell_stats( void ) -{ - long stat_array[NUM_STATS] = {0,0,0}, gstat_array[NUM_STATS]; - - for (int ith=0; ith < nrn_nthread; ++ith) - { - stat_array[0] += (long)nrn_threads[ith].ncell; // number of cells - stat_array[1] += (long)nrn_threads[ith].n_netcon; // number of netcons, synapses - } - stat_array[2] = (long)spikevec_size; - - nrnmpi_long_allreduce_vec( stat_array, gstat_array, NUM_STATS, 1 ); - - if ( nrnmpi_myid == 0 ) - { - printf("\n"); - printf(" Number of cells in the simulation: %ld\n", gstat_array[0]); - printf(" Number of synapses in the simulation: %ld\n", gstat_array[1]); - printf(" Number of spikes: %ld\n", gstat_array[2]); - printf("\n"); - } -} +const int NUM_STATS = 12; +#if COLLECT_TQueue_STATISTICS +const int NUM_EVENT_TYPES = 3; +#endif +enum event_type { enq = 0, spike, ite }; + +void report_cell_stats(void) { + long stat_array[NUM_STATS] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, gstat_array[NUM_STATS]; + + for (int ith = 0; ith < nrn_nthread; ++ith) { + stat_array[0] += (long)nrn_threads[ith].ncell; // number of cells + stat_array[10] += (long)nrn_threads[ith].end; // number of compartments + stat_array[1] += (long)nrn_threads[ith].n_presyn; // number of presyns + stat_array[2] += (long)nrn_threads[ith].n_input_presyn; // number of input presyns + stat_array[3] += (long)nrn_threads[ith].n_netcon; // number of netcons, synapses + stat_array[4] += (long)nrn_threads[ith].n_pntproc; // number of point processes + if (nrn_partrans::transfer_thread_data_) { + int ntar = nrn_partrans::transfer_thread_data_[ith].ntar; + stat_array[11] += (long)ntar; // number of transfer (gap) targets + } + } + stat_array[5] = (long)spikevec_size; // number of spikes + + int spikevec_positive_gid_size = 0; + for (int i = 0; i < spikevec_size; ++i) + if (spikevec_gid[i] > -1) + spikevec_positive_gid_size++; + + stat_array[6] = (long)spikevec_positive_gid_size; // number of non-negative gid spikes + +/// Event queuing statistics +#if COLLECT_TQueue_STATISTICS + // long que_stat[3] = {0, 0, 0}, gmax_que_stat[3]; + /// Number of events for each thread, enqueued and spike enqueued + std::vector + thread_vec_events[NUM_EVENT_TYPES]; // Number of events throughout the simulation + std::vector > + thread_vec_max_num_events[NUM_EVENT_TYPES]; // Time and the maximum number of events + std::vector thread_vec_event_times[NUM_EVENT_TYPES]; // Number of time intervals for + // events in the simulation + for (int type = 0; type < NUM_EVENT_TYPES; ++type) { + thread_vec_events[type].resize(nrn_nthread); + thread_vec_max_num_events[type].resize(nrn_nthread); + thread_vec_event_times[type].resize(nrn_nthread); + } + + std::map::const_iterator mapit; + /// Get the total number of enqueued events and enqueued with spike events + /// time_map_events - maps from TQueue class in sptbinq.h, - a collector of events statistics + for (int ith = 0; ith < nrn_nthread; ++ith) { + for (int type = 0; type < NUM_EVENT_TYPES; ++type) { + 
thread_vec_event_times[type][ith] += + (long)net_cvode_instance->p[ith].tqe_->time_map_events[type].size(); + thread_vec_max_num_events[type][ith].second = 0; + mapit = net_cvode_instance->p[ith].tqe_->time_map_events[type].begin(); + for (; mapit != net_cvode_instance->p[ith].tqe_->time_map_events[type].end(); ++mapit) { + thread_vec_events[type][ith] += mapit->second; + if (mapit->second > thread_vec_max_num_events[type][ith].second) { + thread_vec_max_num_events[type][ith].second = mapit->second; + thread_vec_max_num_events[type][ith].first = mapit->first; + } + } + stat_array[7 + type] += + thread_vec_events[type][ith]; // number of enqueued events and number of spike + // triggered events (enqueued after spike exchange) + } + } + + /// Maximum number of events and correspondent time + long max_num_events[NUM_EVENT_TYPES] = {0, 0, 0}, gmax_num_events[NUM_EVENT_TYPES]; + /// Get the maximum number of events one between threads first + for (int type = 0; type < NUM_EVENT_TYPES; ++type) { + for (int ith = 0; ith < nrn_nthread; ++ith) { + if (thread_vec_max_num_events[type][ith].second > max_num_events[type]) { + max_num_events[type] = thread_vec_max_num_events[type][ith].second; + } + } + } + nrnmpi_long_allreduce_vec(max_num_events, gmax_num_events, NUM_EVENT_TYPES, 2); + + long qmin[NUM_EVENT_TYPES] = {LONG_MAX, LONG_MAX, LONG_MAX}, qmax[NUM_EVENT_TYPES] = {0, 0, 0}, + qsum[NUM_EVENT_TYPES] = {0, 0, 0}, qdiff[NUM_EVENT_TYPES]; + long gqmax[NUM_EVENT_TYPES], gqmin[NUM_EVENT_TYPES], gqsum[NUM_EVENT_TYPES], + gqdiff_max[NUM_EVENT_TYPES], gqdiff_min[NUM_EVENT_TYPES]; + /// Max and min number of time intervals for the events and difference between threads + for (int type = 0; type < NUM_EVENT_TYPES; ++type) { + for (int ith = 0; ith < nrn_nthread; ++ith) { + qsum[type] += thread_vec_event_times[type][ith]; + if (thread_vec_event_times[type][ith] > qmax[type]) + qmax[type] = thread_vec_event_times[type][ith]; + if (thread_vec_event_times[type][ith] < qmin[type]) + qmin[type] = thread_vec_event_times[type][ith]; + } + qdiff[type] = qmax[type] - qmin[type]; + } + nrnmpi_long_allreduce_vec(qsum, gqsum, NUM_EVENT_TYPES, 1); + nrnmpi_long_allreduce_vec(qmax, gqmax, NUM_EVENT_TYPES, 2); + nrnmpi_long_allreduce_vec(qmin, gqmin, NUM_EVENT_TYPES, 0); + nrnmpi_long_allreduce_vec(qdiff, gqdiff_max, NUM_EVENT_TYPES, 2); + nrnmpi_long_allreduce_vec(qdiff, gqdiff_min, NUM_EVENT_TYPES, 0); +#endif + +#if NRNMPI + nrnmpi_long_allreduce_vec(stat_array, gstat_array, NUM_STATS, 1); +#else + assert(sizeof(stat_array) == sizeof(gstat_array)); + memcpy(gstat_array, stat_array, sizeof(stat_array)); +#endif + + if (nrnmpi_myid == 0) { + printf("\n\n Simulation Statistics\n"); + printf(" Number of cells: %ld\n", gstat_array[0]); + printf(" Number of compartments: %ld\n", gstat_array[10]); + printf(" Number of presyns: %ld\n", gstat_array[1]); + printf(" Number of input presyns: %ld\n", gstat_array[2]); + printf(" Number of synapses: %ld\n", gstat_array[3]); + printf(" Number of point processes: %ld\n", gstat_array[4]); + printf(" Number of transfer (gap) targets: %ld\n", gstat_array[11]); + printf(" Number of spikes: %ld\n", gstat_array[5]); + printf(" Number of spikes with non negative gid-s: %ld\n", gstat_array[6]); +#if COLLECT_TQueue_STATISTICS + printf(" Number of enqueued events: %ld\n", gstat_array[7]); + printf(" Maximum number of time intervals for the events: %ld\n", gqmax[enq]); + printf(" Number of after-spike enqueued events: %ld\n", gstat_array[8]); + printf(" Number of inter-thread enqueued events: 
%ld\n", gstat_array[9]); + // printf(" Maximum difference of time interval enqueued events between threads on a + // single MPI: %ld\n", gqdiff_max[enq]); + // printf(" Maximum difference of time interval spike enqueued events between threads + // on a single MPI: %ld\n", gqdiff_max[spike]); + // printf(" Minimum difference of time interval enqueued events between threads on a + // single MPI: %ld\n", gqdiff_min[enq]); + // printf(" Minimum difference of time interval spike enqueued events between threads + // on a single MPI: %ld\n", gqdiff_min[spike]); + printf(" Maximum number of enqueued events during specific time by one thread: %ld\n", + gmax_num_events[enq]); + printf(" Maximum number of spike enqueued events during specific time by one thread: %ld\n", + gmax_num_events[spike]); +#endif + } + +#if COLLECT_TQueue_STATISTICS + int q_detailed_stats = 0; + if (q_detailed_stats) { + nrnmpi_barrier(); + if (nrnmpi_myid == 0) + printf("\n Times for maximum number of enqueued events: "); + nrnmpi_barrier(); + for (int ith = 0; ith < nrn_nthread; ++ith) { + if (thread_vec_max_num_events[enq][ith].second == gmax_num_events[enq]) + printf("%lf\n", thread_vec_max_num_events[enq][ith].first); + } + nrnmpi_barrier(); + + if (nrnmpi_myid == 0) + printf("\n\n Times for maximum number of spike enqueued events: "); + nrnmpi_barrier(); + for (int ith = 0; ith < nrn_nthread; ++ith) { + if (thread_vec_max_num_events[spike][ith].second == gmax_num_events[spike]) + printf("%lf\n", thread_vec_max_num_events[spike][ith].first); + } + nrnmpi_barrier(); + } +#endif + if (nrnmpi_myid == 0) + printf("\n\n"); +} diff --git a/coreneuron/nrniv/nrn_stats.h b/coreneuron/nrniv/nrn_stats.h index 4481c3257..1466638d2 100644 --- a/coreneuron/nrniv/nrn_stats.h +++ b/coreneuron/nrniv/nrn_stats.h @@ -42,7 +42,6 @@ THE POSSIBILITY OF SUCH DAMAGE. * @param void * @return void */ -void report_cell_stats( void ); - +void report_cell_stats(void); #endif /* ifndef _H_NRN_STATS_ */ diff --git a/coreneuron/nrniv/nrniv_decl.h b/coreneuron/nrniv/nrniv_decl.h index 35c23ce40..75bc24405 100644 --- a/coreneuron/nrniv/nrniv_decl.h +++ b/coreneuron/nrniv/nrniv_decl.h @@ -29,20 +29,31 @@ THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef nrniv_dec_h #define nrniv_dec_h +#include +#include #include "coreneuron/nrniv/netcon.h" #include "coreneuron/utils/endianness.h" +#include "coreneuron/nrniv/nrnoptarg.h" +/// Vector of maps for negative presyns +extern std::vector > neg_gid2out; +/// Maps for ouput and input presyns +extern std::map gid2out; +extern std::map gid2in; -#if defined(__cplusplus) -extern "C" { -#endif +/// InputPreSyn.nc_index_ to + InputPreSyn.nc_cnt_ give the NetCon* +extern std::vector netcon_in_presyn_order_; +/// Only for setup vector of netcon source gids +extern std::vector netcon_srcgid; -extern void mk_mech(const char* fname); +extern void mk_mech(const char* path); +extern void set_globals(const char* path); extern void mk_netcvode(void); extern void nrn_p_construct(void); -extern void nrn_setup(const char *path, const char *filesdat, int byte_swap, int threading); +extern void nrn_setup(cn_input_params& input_params, const char* filesdat, int byte_swap); +extern int nrn_setup_multiple; +extern int nrn_setup_extracon; extern void nrn_cleanup(); -extern double BBS_netpar_mindelay(double maxdelay); extern void BBS_netpar_solve(double); extern void nrn_mkPatternStim(const char* filename); extern int nrn_extra_thread0_vdata; @@ -50,35 +61,19 @@ extern void nrn_set_extra_thread0_vdata(void); extern Point_process* nrn_artcell_instantiate(const char* mechname); extern int nrn_need_byteswap; -extern void nrn_reset_gid2out(void); -extern void nrn_reset_gid2in(void); -extern int input_gid_register(int gid); -extern int input_gid_associate(int gid, InputPreSyn* psi); - -// only used in nrn_setup.cpp but implemented in netpar.cpp since that -// is where int<->PreSyn* and int<->InputPreSyn* maps are defined. -extern void netpar_tid_gid2ps_alloc(int nth); -extern void netpar_tid_gid2ps_free(); -extern void netpar_tid_gid2ps(int tid, int gid, PreSyn** ps, InputPreSyn** psi); -extern void netpar_tid_set_gid2node(int tid, int gid, int nid, PreSyn* ps); - -extern void nrn_cleanup_presyn(DiscreteEvent*); extern void nrn_outputevent(unsigned char, double); extern void ncs2nrn_integrate(double tstop); -extern size_t output_presyn_size(void); -extern size_t input_presyn_size(void); extern void handle_forward_skip(double forwardskip, int prcellgid); -extern NetCon** netcon_in_presyn_order_; - extern int nrn_set_timeout(int); -extern void nrnmpi_gid_clear(void); + +extern void netpar_tid_gid2ps(int tid, int gid, PreSyn** ps, InputPreSyn** psi); +extern double set_mindelay(double maxdelay); extern int nrn_soa_padded_size(int cnt, int layout); -#if defined(__cplusplus) -} -#endif +extern int use_interleave_permute; +extern int cellorder_nwarp; #endif diff --git a/coreneuron/nrniv/nrniv_mf.h b/coreneuron/nrniv/nrniv_mf.h index 2242132a6..c28825465 100644 --- a/coreneuron/nrniv/nrniv_mf.h +++ b/coreneuron/nrniv/nrniv_mf.h @@ -33,20 +33,29 @@ THE POSSIBILITY OF SUCH DAMAGE. 
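Aside for readers (illustrative, not from the patch): the comment on netcon_in_presyn_order_ above describes the usual offset/count idiom, where each source stores only an index and a count into one shared vector instead of owning its own list of targets. A generic sketch of that idiom with placeholder types:

#include <cassert>
#include <vector>

struct NetConRef { int id; };   // stand-in for NetCon*

struct SourceRef {              // stand-in for the InputPreSyn bookkeeping
    int nc_index;               // first entry in the shared vector
    int nc_cnt;                 // number of consecutive entries owned
};

int main() {
    // all connections of all sources, grouped contiguously by source
    std::vector<NetConRef> order = {{10}, {11}, {12}, {20}, {21}};
    SourceRef a = {0, 3};       // owns order[0..2]
    SourceRef b = {3, 2};       // owns order[3..4]

    int delivered = 0;
    for (int i = 0; i < b.nc_cnt; ++i)
        delivered += order[b.nc_index + i].id;  // visit each target of source b
    assert(delivered == 41);
    (void)a;
    return 0;
}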
#include "membfunc.h" struct NrnThread; -typedef void(*Pvmi)(NrnThread*, Memb_list*, int); +typedef void (*Pvmi)(NrnThread*, Memb_list*, int); #if defined(__cplusplus) extern "C" { #endif -extern void register_mech(char**, void(*)(Prop*), Pvmi, Pvmi, Pvmi, Pvmi, int, int); -extern int point_register_mech(char**, void(*)(Prop*), Pvmi, Pvmi, Pvmi, Pvmi, int, - void*(*)(Object*), void(*)(void*), Member_func*, int); -extern void hoc_register_cvode(int, int(*)(int), - int(*)(int, double**, double**, double*, Datum*, double*, int), - int(*)(NrnThread*, Memb_list*, int), - int(*)(NrnThread*, Memb_list*, int) -); +extern void register_mech(char**, void (*)(Prop*), Pvmi, Pvmi, Pvmi, Pvmi, int, int); +extern int point_register_mech(char**, + void (*)(Prop*), + Pvmi, + Pvmi, + Pvmi, + Pvmi, + int, + void* (*)(Object*), + void (*)(void*), + Member_func*, + int); +extern void hoc_register_cvode(int, + int (*)(int), + int (*)(int, double**, double**, double*, Datum*, double*, int), + int (*)(NrnThread*, Memb_list*, int), + int (*)(NrnThread*, Memb_list*, int)); extern int nrn_get_mechtype(const char*); extern int v_structure_change; diff --git a/coreneuron/nrniv/nrnmutdec.h b/coreneuron/nrniv/nrnmutdec.h index e25b8e2c1..a32bb9b7f 100644 --- a/coreneuron/nrniv/nrnmutdec.h +++ b/coreneuron/nrniv/nrnmutdec.h @@ -37,14 +37,54 @@ THE POSSIBILITY OF SUCH DAMAGE. #define MUTDEC omp_lock_t* mut_; #define MUTCONSTRUCTED (mut_ != (omp_lock_t*)0) #if defined(__cplusplus) -#define MUTCONSTRUCT(mkmut) {if (mkmut) {mut_= new omp_lock_t; omp_init_lock(mut_);}else{mut_ = 0;}} -#define MUTDESTRUCT {if (mut_){omp_destroy_lock(mut_); delete mut_; mut_ = (omp_lock_t*)0;}} +#define MUTCONSTRUCT(mkmut) \ + { \ + if (mkmut) { \ + mut_ = new omp_lock_t; \ + omp_init_lock(mut_); \ + } else { \ + mut_ = 0; \ + } \ + } +#define MUTDESTRUCT \ + { \ + if (mut_) { \ + omp_destroy_lock(mut_); \ + delete mut_; \ + mut_ = (omp_lock_t*)0; \ + } \ + } #else -#define MUTCONSTRUCT(mkmut) {if (mkmut) {mut_=(omp_lock_t*)malloc(sizeof(omp_lock_t)); omp_init_lock(mut_);}else{mut_ = 0;}} -#define MUTDESTRUCT {if (mut_){omp_destroy_lock(mut_); free((char*)mut_); mut_ = (omp_lock_t*)0;}} +#define MUTCONSTRUCT(mkmut) \ + { \ + if (mkmut) { \ + mut_ = (omp_lock_t*)malloc(sizeof(omp_lock_t)); \ + omp_init_lock(mut_); \ + } else { \ + mut_ = 0; \ + } \ + } +#define MUTDESTRUCT \ + { \ + if (mut_) { \ + omp_destroy_lock(mut_); \ + free((char*)mut_); \ + mut_ = (omp_lock_t*)0; \ + } \ + } #endif -#define MUTLOCK {if (mut_) {omp_set_lock(mut_);}} -#define MUTUNLOCK {if (mut_) {omp_unset_lock(mut_);}} +#define MUTLOCK \ + { \ + if (mut_) { \ + omp_set_lock(mut_); \ + } \ + } +#define MUTUNLOCK \ + { \ + if (mut_) { \ + omp_unset_lock(mut_); \ + } \ + } #else /* _OPENMP */ @@ -61,25 +101,65 @@ THE POSSIBILITY OF SUCH DAMAGE. 
#define MUTDEC pthread_mutex_t* mut_; #define MUTCONSTRUCTED (mut_ != (pthread_mutex_t*)0) #if defined(__cplusplus) -#define MUTCONSTRUCT(mkmut) {if (mkmut) {mut_ = new pthread_mutex_t; pthread_mutex_init(mut_, 0);}else{mut_ = 0;}} -#define MUTDESTRUCT {if (mut_){pthread_mutex_destroy(mut_); delete mut_; mut_ = (pthread_mutex_t*)0;}} +#define MUTCONSTRUCT(mkmut) \ + { \ + if (mkmut) { \ + mut_ = new pthread_mutex_t; \ + pthread_mutex_init(mut_, 0); \ + } else { \ + mut_ = 0; \ + } \ + } +#define MUTDESTRUCT \ + { \ + if (mut_) { \ + pthread_mutex_destroy(mut_); \ + delete mut_; \ + mut_ = (pthread_mutex_t*)0; \ + } \ + } #else -#define MUTCONSTRUCT(mkmut) {if (mkmut) {mut_ = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t)); pthread_mutex_init(mut_, 0);}else{mut_ = 0;}} -#define MUTDESTRUCT {if (mut_){pthread_mutex_destroy(mut_); free((char*)mut_); mut_ = (pthread_mutex_t*)0;}} +#define MUTCONSTRUCT(mkmut) \ + { \ + if (mkmut) { \ + mut_ = (pthread_mutex_t*)malloc(sizeof(pthread_mutex_t)); \ + pthread_mutex_init(mut_, 0); \ + } else { \ + mut_ = 0; \ + } \ + } +#define MUTDESTRUCT \ + { \ + if (mut_) { \ + pthread_mutex_destroy(mut_); \ + free((char*)mut_); \ + mut_ = (pthread_mutex_t*)0; \ + } \ + } #endif -#define MUTLOCK {if (mut_) {pthread_mutex_lock(mut_);}} -#define MUTUNLOCK {if (mut_) {pthread_mutex_unlock(mut_);}} +#define MUTLOCK \ + { \ + if (mut_) { \ + pthread_mutex_lock(mut_); \ + } \ + } +#define MUTUNLOCK \ + { \ + if (mut_) { \ + pthread_mutex_unlock(mut_); \ + } \ + } /*#define MUTLOCK {if (mut_) {printf("lock %lx\n", mut_); pthread_mutex_lock(mut_);}}*/ /*#define MUTUNLOCK {if (mut_) {printf("unlock %lx\n", mut_); pthread_mutex_unlock(mut_);}}*/ #else #define MUTDEC /**/ #define MUTCONSTRUCTED (0) #define MUTCONSTRUCT(mkmut) /**/ -#define MUTDESTRUCT /**/ -#define MUTLOCK /**/ -#define MUTUNLOCK /**/ +#define MUTDESTRUCT /**/ +#define MUTLOCK /**/ +#define MUTUNLOCK /**/ #endif #endif /* USE_PTHREAD */ -#endif +#endif diff --git a/coreneuron/nrniv/nrnoptarg.cpp b/coreneuron/nrniv/nrnoptarg.cpp index 72a1db53c..2a5e882dc 100644 --- a/coreneuron/nrniv/nrnoptarg.cpp +++ b/coreneuron/nrniv/nrnoptarg.cpp @@ -32,18 +32,18 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrniv/nrnoptarg.h" #include "coreneuron/utils/sdprintf.h" -extern "C" void nrn_exit( int ); +extern "C" void nrn_exit(int); extern int nrnmpi_myid; -cb_parameters::cb_parameters() -{ +cn_parameters::cn_parameters() { tstart = 0.0; tstop = 100.0; - dt = 0.025; + dt = -1000.; dt_io = 0.1; + dt_report = 0.1; - celsius = 34.0; + celsius = -1000.0; // precedence: set by user, globals.dat, 34.0 voltage = -65.0; maxdelay = 10.0; @@ -53,45 +53,54 @@ cb_parameters::cb_parameters() prcellgid = -1; threading = 0; + compute_gpu = 0; + cell_interleave_permute = 0; + nwarp = 0; /* 0 means not specified */ + report = 0; patternstim = NULL; datpath = "."; outpath = "."; filesdat = "files.dat"; + + multiple = 1; + extracon = 0; } -sd_ptr cb_parameters::get_filesdat_path( char *path_buf, size_t bufsz ) -{ +sd_ptr cn_parameters::get_filesdat_path(char* path_buf, size_t bufsz) { // shouldn't we check if filesdat is absolute or relative? 
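Aside for readers (illustrative, not from the patch): every MUT* macro above implements the same pattern, an optionally constructed lock that degrades to a no-op when it was never created. The pthread variant written out as plain code, for orientation only; real code should keep using the macros so the OpenMP, pthread, and serial builds stay interchangeable.

#include <pthread.h>

struct SpikeBuffer {
    pthread_mutex_t* mut_;             // MUTDEC

    explicit SpikeBuffer(bool threaded) {
        if (threaded) {                // MUTCONSTRUCT(1)
            mut_ = new pthread_mutex_t;
            pthread_mutex_init(mut_, 0);
        } else {                       // MUTCONSTRUCT(0)
            mut_ = 0;
        }
    }
    ~SpikeBuffer() {
        if (mut_) {                    // MUTDESTRUCT
            pthread_mutex_destroy(mut_);
            delete mut_;
        }
    }
    void lock()   { if (mut_) pthread_mutex_lock(mut_); }    // MUTLOCK
    void unlock() { if (mut_) pthread_mutex_unlock(mut_); }  // MUTUNLOCK
};

int main() {
    SpikeBuffer serial(false), parallel(true);
    serial.lock();   serial.unlock();    // no-ops: lock never constructed
    parallel.lock(); parallel.unlock();  // real mutex round trip
    return 0;
}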
-- sgy 20150119 - return sdprintf( path_buf, bufsz, "%s/%s", datpath, filesdat ); + return sdprintf(path_buf, bufsz, "%s/%s", datpath, filesdat); } -void cb_parameters::show_cb_opts() -{ - if ( nrnmpi_myid == 0 ) { - printf( "\n Configuration Parameters" ); +void cn_parameters::show_cb_opts() { + if (nrnmpi_myid == 0) { + printf("\n Configuration Parameters"); + + printf("\n tstart: %g, tstop: %g, dt: %g, dt_io: %g", tstart, tstop, dt, dt_io); + printf(" celsius: %g, voltage: %g, maxdelay: %g", celsius, voltage, maxdelay); + + printf("\n forwardskip: %g, spikebuf: %d, prcellgid: %d, multiple: %d, extracon: %d", + forwardskip, spikebuf, prcellgid, multiple, extracon); + printf("\n threading : %d, mindelay : %g, cell_permute: %d, nwarp: %d", threading, mindelay, + cell_interleave_permute, nwarp); - printf( "\n tstart: %g, tstop: %g, dt: %g, dt_io: %g", tstart, tstop, dt, dt_io ); - printf( " celsius: %g, voltage: %g, maxdelay: %g", celsius, voltage, maxdelay ); + printf("\n patternstim: %s, datpath: %s, filesdat: %s, outpath: %s", patternstim, datpath, + filesdat, outpath); - printf( "\n forwardskip: %g, spikebuf: %d, prcellgid: %d, threading : %d, mindelay : %g", \ - forwardskip, spikebuf, prcellgid, threading, mindelay); + printf("\n report: %d, report dt: %lf ", report, dt_report); - printf( "\n patternstim: %s, datpath: %s, filesdat: %s, outpath: %s", \ - patternstim, datpath, filesdat, outpath ); - if ( prcellgid >= 0 ) { - printf( "\n prcellstate will be called for gid %d", prcellgid ); + if (prcellgid >= 0) { + printf("\n prcellstate will be called for gid %d", prcellgid); } - printf( "\n\n" ); + printf("\n\n"); } } - -void cb_parameters::show_cb_opts_help() -{ - printf( "\nWelcome to CoreNeuron!\n\nOPTIONS\n\ +void cn_parameters::show_cb_opts_help() { + printf( + "\nWelcome to CoreNeuron!\n\nOPTIONS\n\ -h, -?, --help Print a usage message briefly summarizing these command-line options \ and the bug-reporting address, then exit.\n\n\ -s TIME, --tstart=TIME\n\ @@ -99,11 +108,13 @@ void cb_parameters::show_cb_opts_help() -e TIME, --tstop=TIME\n\ Set the stop time to TIME (double). The default value is '100.'\n\n\ -t TIME, --dt=TIME\n\ - Set the dt time to TIME (double). The default value is '0.025'.\n\n\ + Set the dt time to TIME (double). The default value is set by defaults.dat, otherwise '0.025'.\n\n\ -i TIME, --dt_io=TIME\n\ Set the dt of I/O to TIME (double). The default value is '0.1'.\n\n\ + -v FLOAT, --voltage=v_init\n\ + Value used for nrn_finitialize(1, v_init). If 1000, then nrn_finitialize(0,...)\n\ -l NUMBER, --celsius=NUMBER\n\ - Set the celsius temperature to NUMBER (double). The default value is '34.'.\n\n\ + Set the celsius temperature to NUMBER (double). The default value set by defaults.dat, othewise '34.0'.\n\n\ -p FILE, --pattern=FILE\n\ Apply patternstim with the spike file FILE (char*). The default value is 'NULL'.\n\n\ -b SIZE, --spikebuf=SIZE\n\ @@ -111,7 +122,13 @@ void cb_parameters::show_cb_opts_help() -g NUMBER, --prcellgid=NUMBER\n\ Output prcellstate information for the gid NUMBER (int). The default value is '-1'.\n\n\ -c, --threading\n\ - Optiong to enable threading. The default implies no threading.\n\n\ + Option to enable threading. The default implies no threading.\n\n\ + -a, --gpu\n\ + Option to enable use of GPUs. 
The default implies cpu only run.\n\n\ + -R NUMBER, --cell_permute=NUMBER\n\ + Cell permutation and interleaving for efficiency\n\n\ + -W NUMBER, --nwarp=NUMBER\n\ + number of warps to balance\n\n\ -d PATH, --datpath=PATH\n\ Set the path with required CoreNeuron data to PATH (char*). The default value is '.'.\n\n\ -f FILE, --filesdat=FILE\n\ @@ -120,100 +137,134 @@ void cb_parameters::show_cb_opts_help() Set the path for the output data to PATH (char*). The default value is '.'.\n\ -k TIME, --forwardskip=TIME\n\ Set forwardskip to TIME (double). The default value is '0.'.\n\ + -r TYPE --report=TYPE\n\ + Enable voltage report with specificied type (0 for disable, 1 for soma, 2 for full compartment).\n\ + -w, --dt_report=TIME\n\ + Set the dt for soma reports (using ReportingLib) to TIME (double). The default value is '0.1'.\n\n\ + -z MULTIPLE, --multiple=MULTIPLE\n\ + Model duplication factor. Model size is normal size * MULTIPLE (int). The default value is '1'.\n\ + -x EXTRACON, --extracon=EXTRACON\n\ + Number of extra random connections in each thread to other duplicate models (int). The default value is '0'.\n\ -mpi\n\ - Enable MPI. In order to initialize MPI environment this argument must be specified.\n" ); + Enable MPI. In order to initialize MPI environment this argument must be specified.\n"); } -void cb_parameters::read_cb_opts( int argc, char **argv ) -{ +void cn_parameters::read_cb_opts(int argc, char** argv) { optind = 1; int c; - while ( 1 ) { + while (1) { static struct option long_options[] = { /* These options don't set a flag. * we distinguish them by their indices. */ - {"tstart", required_argument, 0, 's'}, - {"tstop", required_argument, 0, 'e'}, - {"dt", required_argument, 0, 't'}, - {"dt_io", required_argument, 0, 'i'}, - {"celsius", required_argument, 0, 'l'}, - {"pattern", required_argument, 0, 'p'}, - {"spikebuf", required_argument, 0, 'b'}, + {"tstart", required_argument, 0, 's'}, + {"tstop", required_argument, 0, 'e'}, + {"dt", required_argument, 0, 't'}, + {"dt_io", required_argument, 0, 'i'}, + {"celsius", required_argument, 0, 'l'}, + {"voltage", required_argument, 0, 'v'}, + {"pattern", required_argument, 0, 'p'}, + {"spikebuf", required_argument, 0, 'b'}, {"prcellgid", required_argument, 0, 'g'}, - {"threading", no_argument, 0, 'c'}, - {"datpath", required_argument, 0, 'd'}, - {"filesdat", required_argument, 0, 'f'}, - {"outpath", required_argument, 0, 'o'}, + {"threading", no_argument, 0, 'c'}, + {"gpu", no_argument, 0, 'a'}, + {"cell_permute", optional_argument, 0, 'R'}, + {"nwarp", required_argument, 0, 'W'}, + {"datpath", required_argument, 0, 'd'}, + {"filesdat", required_argument, 0, 'f'}, + {"outpath", required_argument, 0, 'o'}, {"forwardskip", required_argument, 0, 'k'}, - {"mpi", optional_argument, 0, 'm'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; + {"multiple", required_argument, 0, 'z'}, + {"extracon", required_argument, 0, 'x'}, + {"mpi", optional_argument, 0, 'm'}, + {"report", required_argument, 0, 'r'}, + {"dt_report", required_argument, 0, 'w'}, + {"help", no_argument, 0, 'h'}, + {0, 0, 0, 0}}; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long( argc, argv, "s:e:t:i:l:p:b:g:c:d:f:o:k:m:h", - long_options, &option_index ); + c = getopt_long(argc, argv, "s:e:t:i:l:p:b:g:c:d:f:o:k:z:x:m:h:r:w:a:v:R:W", long_options, + &option_index); /* Detect the end of the options. 
*/ - if ( c == -1 ) { + if (c == -1) { break; } - switch ( c ) { + switch (c) { case 0: /* If this option set a flag, do nothing else now. */ - if ( long_options[option_index].flag != 0 ) { + if (long_options[option_index].flag != 0) { break; } - printf( "option %s", long_options[option_index].name ); + printf("option %s", long_options[option_index].name); - if ( optarg ) { - printf( " with arg %s", optarg ); + if (optarg) { + printf(" with arg %s", optarg); } - printf( "\n" ); + printf("\n"); break; case 's': - tstart = atof( optarg ); + tstart = atof(optarg); break; case 'e': - tstop = atof( optarg ); + tstop = atof(optarg); break; case 't': - dt = atof( optarg ); + dt = atof(optarg); break; case 'i': - dt_io = atof(optarg ); + dt_io = atof(optarg); break; case 'l': celsius = atof(optarg); break; + case 'v': + voltage = atof(optarg); + break; + case 'p': patternstim = optarg; break; case 'b': - spikebuf = atoi( optarg ); + spikebuf = atoi(optarg); break; case 'g': - prcellgid = atoi( optarg ); + prcellgid = atoi(optarg); break; case 'c': threading = 1; break; + case 'a': + compute_gpu = 1; + break; + + case 'R': + if (optarg == NULL) { + cell_interleave_permute = 1; + } else { + cell_interleave_permute = atoi(optarg); + } + break; + + case 'W': + nwarp = atoi(optarg); + break; + case 'd': datpath = optarg; break; @@ -227,41 +278,57 @@ void cb_parameters::read_cb_opts( int argc, char **argv ) break; case 'k': - forwardskip = atof( optarg ); + forwardskip = atof(optarg); + break; + + case 'z': + multiple = atoi(optarg); + break; + + case 'x': + extracon = atoi(optarg); break; case 'm': /// Reserved for "--mpi", which by this time should be taken care of break; + case 'r': + report = atoi(optarg); + break; + + case 'w': + dt_report = atof(optarg); + break; + case 'h': case '?': - if ( nrnmpi_myid == 0 ) { + if (nrnmpi_myid == 0) { show_cb_opts_help(); } - nrn_exit( 0 ); + nrn_exit(0); default: - printf( "Option %s", long_options[option_index].name ); + printf("Option %s", long_options[option_index].name); - if ( optarg ) { - printf( " with arg %s", optarg ); + if (optarg) { + printf(" with arg %s", optarg); } - printf( "is not recognized. Ignoring...\n" ); + printf("is not recognized. Ignoring...\n"); break; } } /* Print any remaining command line arguments (not options). */ - if ( optind < argc ) { - printf( "non-option ARGV-elements: " ); + if (optind < argc) { + printf("non-option ARGV-elements: "); - while ( optind < argc ) { - printf( "%s ", argv[optind++] ); + while (optind < argc) { + printf("%s ", argv[optind++]); } - putchar( '\n' ); + putchar('\n'); } } diff --git a/coreneuron/nrniv/nrnoptarg.h b/coreneuron/nrniv/nrnoptarg.h index e90430ab9..5a3ba2d4a 100644 --- a/coreneuron/nrniv/nrnoptarg.h +++ b/coreneuron/nrniv/nrnoptarg.h @@ -39,12 +39,12 @@ THE POSSIBILITY OF SUCH DAMAGE. 
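Aside for readers (illustrative, not from the patch): read_cb_opts is plain getopt_long with a long_options table mirrored by the short-option string. A stripped-down, stand-alone version of the same pattern with just two invented options:

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>

int main(int argc, char** argv) {
    double tstop = 100.0;   // same spirit as the defaults in cn_parameters
    int gpu = 0;

    static struct option long_options[] = {
        {"tstop", required_argument, 0, 'e'},
        {"gpu",   no_argument,       0, 'a'},
        {0, 0, 0, 0}};

    int c;
    while ((c = getopt_long(argc, argv, "e:a", long_options, 0)) != -1) {
        switch (c) {
            case 'e':
                tstop = atof(optarg);   // accepts "--tstop=200" or "-e 200"
                break;
            case 'a':
                gpu = 1;                // flag-style option, no argument
                break;
            default:
                break;                  // unrecognized options are ignored here
        }
    }
    printf("tstop=%g gpu=%d\n", tstop, gpu);
    return 0;
}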
#include #include "coreneuron/utils/sdprintf.h" -typedef struct cb_parameters { - - double tstart; /**< start time of simulation in msec*/ - double tstop; /**< stop time of simulation in msec*/ - double dt; /**< timestep to use in msec*/ - double dt_io; /**< i/o timestep to use in msec*/ +typedef struct cn_parameters { + double tstart; /**< start time of simulation in msec*/ + double tstop; /**< stop time of simulation in msec*/ + double dt; /**< timestep to use in msec*/ + double dt_io; /**< i/o timestep to use in msec*/ + double dt_report; /**< i/o timestep to use in msec for reports*/ double celsius; double voltage; @@ -52,34 +52,43 @@ typedef struct cb_parameters { double forwardskip; - int spikebuf; /**< internal buffer used on evry rank for spikes */ - int prcellgid; /**< gid of cell for prcellstate */ + int spikebuf; /**< internal buffer used on evry rank for spikes */ + int prcellgid; /**< gid of cell for prcellstate */ + + int threading; /**< enable pthread/openmp */ + int report; /**< enable soma reports */ + + int compute_gpu; /**< run computations on gpu */ - int threading; /**< enable pthread/openmp */ + int cell_interleave_permute; /**< cell interleaving permutation */ + int nwarp; /* number of warps to balance for cell_interleave_permute == 2 */ + + const char* patternstim; + const char* datpath; /**< directory path where .dat files */ + const char* outpath; /**< directory where spikes will be written */ + const char* filesdat; /**< name of file containing list of gids dat files read in */ - const char *patternstim; - const char *datpath; /**< directory path where .dat files */ - const char *outpath; /**< directory where spikes will be written */ - const char *filesdat; /**< name of file containing list of gids dat files read in */ - double mindelay; - /** default constructor */ - cb_parameters(); + int multiple; + int extracon; + + /** default constructor */ + cn_parameters(); - /** show help message for command line args */ + /** show help message for command line args */ void show_cb_opts_help(); - /** show all parameter values */ + /** show all parameter values */ void show_cb_opts(); - /** read options from command line */ - void read_cb_opts( int argc, char **argv ); + /** read options from command line */ + void read_cb_opts(int argc, char** argv); - /** return full path of files.dat file */ - sd_ptr get_filesdat_path( char *path_buf, size_t bufsz ); + /** return full path of files.dat file */ + sd_ptr get_filesdat_path(char* path_buf, size_t bufsz); - /** store/set computed mindelay argument */ + /** store/set computed mindelay argument */ void set_mindelay(double mdelay) { mindelay = mdelay; } @@ -87,4 +96,3 @@ typedef struct cb_parameters { } cn_input_params; #endif - diff --git a/coreneuron/nrniv/output_spikes.cpp b/coreneuron/nrniv/output_spikes.cpp index ff1e56c92..154530c67 100644 --- a/coreneuron/nrniv/output_spikes.cpp +++ b/coreneuron/nrniv/output_spikes.cpp @@ -41,38 +41,40 @@ int* spikevec_gid; static MUTDEC -void mk_spikevec_buffer(int sz) { - spikevec_buffer_size = sz; - spikevec_size = 0; - spikevec_time = new double[sz]; - spikevec_gid = new int[sz]; - MUTCONSTRUCT(1); + void + mk_spikevec_buffer(int sz) { + spikevec_buffer_size = sz; + spikevec_size = 0; + spikevec_time = new double[sz]; + spikevec_gid = new int[sz]; + MUTCONSTRUCT(1); } -void spikevec_lock() { MUTLOCK } -void spikevec_unlock() { MUTUNLOCK } +void spikevec_lock() { + MUTLOCK +} +void spikevec_unlock() { + MUTUNLOCK +} -void output_spikes(const char *outpath) { - char fnamebuf[100]; - 
sd_ptr fname=sdprintf(fnamebuf, sizeof(fnamebuf), "%s/out%d.dat", outpath, nrnmpi_myid); - FILE* f = fopen(fname, "w"); - if (!f && nrnmpi_myid == 0){ - std::cout << "WARNING: Could not open file for writing spikes." << std::endl; - return; - } +void output_spikes(const char* outpath) { + char fnamebuf[100]; + sd_ptr fname = sdprintf(fnamebuf, sizeof(fnamebuf), "%s/out%d.dat", outpath, nrnmpi_myid); + FILE* f = fopen(fname, "w"); + if (!f && nrnmpi_myid == 0) { + std::cout << "WARNING: Could not open file for writing spikes." << std::endl; + return; + } - for (int i=0; i < spikevec_size; ++i) - { - if (spikevec_gid[i] > -1) - fprintf(f, "%.8g\t%d\n", spikevec_time[i], spikevec_gid[i]); - } - fclose(f); -} + for (int i = 0; i < spikevec_size; ++i) + if (spikevec_gid[i] > -1) + fprintf(f, "%.8g\t%d\n", spikevec_time[i], spikevec_gid[i]); + fclose(f); +} -void validation(std::vector >& res) -{ - for (int i=0; i < spikevec_size; ++i) - if (spikevec_gid[i] > -1) - res.push_back(std::make_pair(spikevec_time[i], spikevec_gid[i])); +void validation(std::vector >& res) { + for (int i = 0; i < spikevec_size; ++i) + if (spikevec_gid[i] > -1) + res.push_back(std::make_pair(spikevec_time[i], spikevec_gid[i])); } diff --git a/coreneuron/nrniv/output_spikes.h b/coreneuron/nrniv/output_spikes.h index 34588dd31..fd07bb45a 100644 --- a/coreneuron/nrniv/output_spikes.h +++ b/coreneuron/nrniv/output_spikes.h @@ -30,17 +30,17 @@ THE POSSIBILITY OF SUCH DAMAGE. #define output_spikes_h #include -#include +#include -void output_spikes(const char *outpath); +void output_spikes(const char* outpath); void mk_spikevec_buffer(int); extern int spikevec_buffer_size; extern int spikevec_size; -extern double* spikevec_time; +extern double* spikevec_time; extern int* spikevec_gid; -void validation(std::vector >& res); +void validation(std::vector >& res); void spikevec_lock(); void spikevec_unlock(); diff --git a/coreneuron/nrniv/partrans.cpp b/coreneuron/nrniv/partrans.cpp new file mode 100644 index 000000000..817231785 --- /dev/null +++ b/coreneuron/nrniv/partrans.cpp @@ -0,0 +1,159 @@ +#include "coreneuron/nrnconf.h" +#include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrnoc/nrnoc_decl.h" +#include "coreneuron/nrnmpi/nrnmpi.h" +#include "coreneuron/nrniv/partrans.h" + +// This is the computational code for gap junction simulation. +// The setup code is in partrans_setup.cpp +// assert that all HalfGaps are of the same type +// assert that every HalfGap instance in the thread have been a +// ParallelContext.target(&HalfGap.vpre, sid) + +int nrn_have_gaps; + +using namespace nrn_partrans; + +HalfGap_Info* nrn_partrans::halfgap_info; +TransferThreadData* nrn_partrans::transfer_thread_data_; + +// MPI_Alltoallv buffer info +double* nrn_partrans::insrc_buf_; // Receive buffer for gap voltages +double* nrn_partrans::outsrc_buf_; // Send buffer for gap voltages +int* nrn_partrans::insrccnt_; +int* nrn_partrans::insrcdspl_; +int* nrn_partrans::outsrccnt_; +int* nrn_partrans::outsrcdspl_; + +void nrnmpi_v_transfer() { + // copy HalfGap source voltages to outsrc_buf_ + // note that same voltage may get copied to several locations in outsrc_buf + + // gather the source values. 
can be done in parallel + for (int tid = 0; tid < nrn_nthread; ++tid) { + TransferThreadData& ttd = transfer_thread_data_[tid]; + NrnThread& nt = nrn_threads[tid]; + int n = ttd.nsrc; + if (n == 0) { + continue; + } + double* vdata = nt._actual_v; + int* v_indices = ttd.v_indices; + +#undef METHOD +#define METHOD 2 + +#if METHOD == 1 + +// copy voltages to cpu and cpu gathers/scatters to outsrc_buf + #pragma acc update host(vdata[0 : nt.end]) if (nt.compute_gpu) + int* outbuf_indices = ttd.outbuf_indices; + for (int i = 0; i < n; ++i) { + outsrc_buf_[outbuf_indices[i]] = vdata[v_indices[i]]; + } + } + +#elif METHOD == 2 + + // gather voltages on gpu and copy to cpu, cpu scatters to outsrc_buf + double* vg = ttd.v_gather; + #pragma acc parallel loop present( \ + v_indices[0 : n], \ + vdata[0 : nt.end], \ + vg[0 : n]) /*copyout(vg[0:n])*/ if (nt.compute_gpu) async(nt.stream_id) + for (int i = 0; i < n; ++i) { + vg[i] = vdata[v_indices[i]]; + } + // do not know why the copyout above did not work and the following update is needed + #pragma acc update host(vg[0 : n]) if (nrn_threads[0].compute_gpu) async(nt.stream_id) + } + + // copy source values to outsrc_buf_ + for (int tid = 0; tid < nrn_nthread; ++tid) { + #pragma acc wait(nrn_threads[tid].stream_id) + TransferThreadData& ttd = transfer_thread_data_[tid]; + int n = ttd.nsrc; + if (n == 0) { + continue; + } + int* outbuf_indices = ttd.outbuf_indices; + double* vg = ttd.v_gather; + for (int i = 0; i < n; ++i) { + outsrc_buf_[outbuf_indices[i]] = vg[i]; + } + } + +#endif /* METHOD == 2 */ + +// transfer +#if NRNMPI + if (nrnmpi_numprocs > 1) { // otherwise insrc_buf_ == outsrc_buf_ + nrnmpi_barrier(); + nrnmpi_dbl_alltoallv(outsrc_buf_, outsrccnt_, outsrcdspl_, insrc_buf_, insrccnt_, + insrcdspl_); + } else +#endif + { // actually use the multiprocess code even for one process to aid debugging + for (int i = 0; i < outsrcdspl_[1]; ++i) { + insrc_buf_[i] = outsrc_buf_[i]; + } + } + + // insrc_buf_ will get copied to targets via nrnthread_v_transfer + #pragma acc update device( \ + insrc_buf_[0 : insrcdspl_[nrnmpi_numprocs]]) if (nrn_threads[0].compute_gpu) +} + +void nrnthread_v_transfer(NrnThread* _nt) { + TransferThreadData& ttd = transfer_thread_data_[_nt->id]; + if (!ttd.halfgap_ml) { + return; + } + int _cntml_actual = ttd.halfgap_ml->nodecount; + double* vpre = ttd.halfgap_ml->data; + int* insrc_indices = ttd.insrc_indices; + + if (halfgap_info->layout == 1) { /* AoS */ + int ix_vpre = halfgap_info->ix_vpre; + int sz = halfgap_info->sz; + vpre += ix_vpre; + for (int _iml = 0; _iml < _cntml_actual; ++_iml) { + vpre[_iml * sz] = insrc_buf_[insrc_indices[_iml]]; + } + } else { /* SoA */ + int _cntml_padded = ttd.halfgap_ml->_nodecount_padded; + int ix_vpre = halfgap_info->ix_vpre * _cntml_padded; + vpre += ix_vpre; + #pragma acc parallel loop present( \ + insrc_indices[0 : _cntml_actual], \ + vpre[0 : _cntml_actual], \ + insrc_buf_[0 : insrcdspl_[nrnmpi_numprocs]]) if (_nt->compute_gpu) \ + async(_nt->stream_id) + for (int _iml = 0; _iml < _cntml_actual; ++_iml) { + vpre[_iml] = insrc_buf_[insrc_indices[_iml]]; + } + } +} + +void nrn_partrans::gap_update_indices() { + printf("gap_update_indices\n"); + if (insrcdspl_) { + #pragma acc enter data create( \ + insrc_buf_[0 : insrcdspl_[nrnmpi_numprocs]]) if (nrn_threads[0].compute_gpu) + } + for (int tid = 0; tid < nrn_nthread; ++tid) { + TransferThreadData& ttd = transfer_thread_data_[tid]; + +#if METHOD == 2 + int n = ttd.nsrc; + if (n) { + #pragma acc enter data copyin(ttd.v_indices[0 : n]) 
if (nrn_threads[0].compute_gpu)
+            #pragma acc enter data create(ttd.v_gather[0 : n]) if (nrn_threads[0].compute_gpu)
+        }
+#endif /* METHOD == 2 */
+
+        if (ttd.halfgap_ml) {
+            #pragma acc enter data copyin(ttd.insrc_indices[0 : ttd.ntar]) if (nrn_threads[0].compute_gpu)
+        }
+    }
+}
diff --git a/coreneuron/nrniv/partrans.h b/coreneuron/nrniv/partrans.h
new file mode 100644
index 000000000..b1c894360
--- /dev/null
+++ b/coreneuron/nrniv/partrans.h
@@ -0,0 +1,61 @@
+#ifndef partrans_h
+#define partrans_h
+
+struct Memb_list;
+
+namespace nrn_partrans {
+
+#ifndef NRNLONGSGID
+#define NRNLONGSGID 0
+#endif
+
+#if NRNLONGSGID
+    typedef int64_t sgid_t;
+#else
+    typedef int sgid_t;
+#endif
+
+    struct HalfGap_Info {
+        int layout;
+        int type;
+        int ix_vpre; /* AoS index for vpre from beginning of a HalfGap instance */
+        int sz;      /* size of a HalfGap instance */
+    };
+    extern HalfGap_Info* halfgap_info;
+
+    class TransferThreadData {
+      public:
+        TransferThreadData();
+        ~TransferThreadData();
+        Memb_list* halfgap_ml;
+        int nsrc;             // number of places in outsrc_buf_ voltages get copied to.
+        int ntar;             // insrc_indices size (halfgap_ml->nodecount);
+        int* insrc_indices;   // halfgap_ml->nodecount indices into insrc_buf_
+        int* v_indices;       // indices into NrnThread._actual_v (may have duplications).
+        int* outbuf_indices;  // indices into outsrc_buf_
+        double* v_gather;     // _actual_v[v_indices]
+    };
+    extern TransferThreadData* transfer_thread_data_; /* array for threads */
+
+    struct SetupInfo {
+        int nsrc;  // number of sources in this thread
+        int ntar;  // equal to memb_list nodecount
+        int type;
+        int ix_vpre;
+        sgid_t* sid_src;
+        int* v_indices;      // increasing order
+        sgid_t* sid_target;  // already in memb_list order
+    };
+    extern SetupInfo* setup_info_; /* array for threads exists only during setup*/
+
+    extern void gap_mpi_setup(int ngroup);
+    extern void gap_thread_setup(NrnThread& nt);
+    extern void gap_indices_permute(NrnThread& nt);
+    extern void gap_update_indices();
+
+    extern double* insrc_buf_;   // Receive buffer for gap voltages
+    extern double* outsrc_buf_;  // Send buffer for gap voltages
+    extern int *insrccnt_, *insrcdspl_, *outsrccnt_, *outsrcdspl_;
+}
+
+#endif /*partrans_h*/
diff --git a/coreneuron/nrniv/partrans_setup.cpp b/coreneuron/nrniv/partrans_setup.cpp
new file mode 100644
index 000000000..1f2c0e99c
--- /dev/null
+++ b/coreneuron/nrniv/partrans_setup.cpp
@@ -0,0 +1,289 @@
+#include "coreneuron/nrnconf.h"
+#include "coreneuron/nrnoc/multicore.h"
+#include "coreneuron/nrnoc/nrnoc_decl.h"
+#include "coreneuron/nrnmpi/nrnmpi.h"
+#include "coreneuron/nrniv/partrans.h"
+#include <map>
+#include <vector>
+
+using namespace ::nrn_partrans;
+
+nrn_partrans::SetupInfo* nrn_partrans::setup_info_;
+
+class SidData {
+  public:
+    std::vector<int> tids_;
+    std::vector<int> indices_;
+};
+
+#if NRNLONGSGID
+#define sgid_alltoallv nrnmpi_long_alltoallv
+#else
+#define sgid_alltoallv nrnmpi_int_alltoallv
+#endif
+
+#define HAVEWANT_t sgid_t
+#define HAVEWANT_alltoallv sgid_alltoallv
+#define HAVEWANT2Int std::map<sgid_t, int>
+#include "coreneuron/nrniv/have2want.h"
+
+nrn_partrans::TransferThreadData::TransferThreadData() {
+    halfgap_ml = NULL;
+    nsrc = 0;
+    ntar = 0;
+    insrc_indices = NULL;
+    v_indices = NULL;
+    outbuf_indices = NULL;
+    v_gather = NULL;
+}
+
+nrn_partrans::TransferThreadData::~TransferThreadData() {
+    if (insrc_indices) {
+        delete[] insrc_indices;
+    }
+    if (v_indices) {
+        delete[] v_indices;
+    }
+    if (outbuf_indices) {
+        delete[] outbuf_indices;
+    }
+    if (v_gather) {
+        delete[] v_gather;
+    }
+}
+
+void
nrn_partrans::gap_mpi_setup(int ngroup) { + // printf("%d gap_mpi_setup ngroup=%d\n", nrnmpi_myid, ngroup); + + // This can happen until bug is fixed. ie. if one process has more than + // one thread then all processes must have more than one thread + if (ngroup < nrn_nthread) { + transfer_thread_data_[ngroup].nsrc = 0; + transfer_thread_data_[ngroup].halfgap_ml = NULL; + } + + // create and fill halfgap_info using first available... + halfgap_info = new HalfGap_Info; + HalfGap_Info& hgi = *halfgap_info; + for (int tid = 0; tid < ngroup; ++tid) { + nrn_partrans::SetupInfo& si = setup_info_[tid]; + if (si.ntar) { + hgi.ix_vpre = si.ix_vpre; + hgi.type = si.type; + hgi.sz = nrn_prop_param_size_[hgi.type]; + hgi.layout = nrn_mech_data_layout_[hgi.type]; + } + } + + // count total_nsrc, total_ntar and allocate (total_ntar too large but...) + int total_nsrc = 0, total_ntar = 0; + for (int tid = 0; tid < ngroup; ++tid) { + nrn_partrans::SetupInfo& si = setup_info_[tid]; + total_nsrc += si.nsrc; + total_ntar += si.ntar; + } + + // have and want arrays + sgid_t* have = new sgid_t[total_nsrc]; + sgid_t* want = new sgid_t[total_ntar]; // more than needed + + // map from sid_src to (tid, index) into v_indices + // and sid_target to lists of (tid, index) for memb_list + // also count the map sizes and fill have and want arrays + std::map src2data; + std::map tar2data; + int src2data_size = 0, tar2data_size = 0; // number of unique sids + for (int tid = 0; tid < ngroup; ++tid) { + SetupInfo& si = setup_info_[tid]; + for (int i = 0; i < si.nsrc; ++i) { + sgid_t sid = si.sid_src[i]; + SidData sd; + sd.tids_.push_back(tid); + sd.indices_.push_back(i); + src2data[sid] = sd; + have[src2data_size] = sid; + src2data_size++; + } + for (int i = 0; i < si.ntar; ++i) { + sgid_t sid = si.sid_target[i]; + if (tar2data.find(sid) == tar2data.end()) { + SidData sd; + tar2data[sid] = sd; + want[tar2data_size] = sid; + tar2data_size++; + } + SidData& sd = tar2data[sid]; + sd.tids_.push_back(tid); + sd.indices_.push_back(i); + } + } + + // 2) Call the have_to_want function. + sgid_t* send_to_want; + sgid_t* recv_from_have; + + have_to_want(have, src2data_size, want, tar2data_size, send_to_want, outsrccnt_, outsrcdspl_, + recv_from_have, insrccnt_, insrcdspl_, default_rendezvous); + + int nhost = nrnmpi_numprocs; + + // sanity check. all the sgids we are asked to send, we actually have + for (int i = 0; i < outsrcdspl_[nhost]; ++i) { + sgid_t sgid = send_to_want[i]; + assert(src2data.find(sgid) != src2data.end()); + } + + // sanity check. all the sgids we receive, we actually need. + for (int i = 0; i < insrcdspl_[nhost]; ++i) { + sgid_t sgid = recv_from_have[i]; + assert(tar2data.find(sgid) != tar2data.end()); + } + +#if 0 + printf("%d mpi outsrccnt_, outsrcdspl_, insrccnt, insrcdspl_\n", nrnmpi_myid); + for (int i = 0; i < nrnmpi_numprocs; ++i) { + printf("%d : %d %d %d %d\n", nrnmpi_myid, outsrccnt_[i], outsrcdspl_[i], + insrccnt_[i], insrcdspl_[i]); + } +#endif + + // clean up a little + delete[] have; + delete[] want; + + insrc_buf_ = new double[insrcdspl_[nhost]]; + outsrc_buf_ = new double[outsrcdspl_[nhost]]; + + // count and allocate transfer_thread_data arrays. + for (int tid = 0; tid < ngroup; ++tid) { + transfer_thread_data_[tid].nsrc = 0; + } + for (int i = 0; i < outsrcdspl_[nhost]; ++i) { + sgid_t sgid = send_to_want[i]; + SidData& sd = src2data[sgid]; + // only one item in the lists. 
+ int tid = sd.tids_[0]; + transfer_thread_data_[tid].nsrc += 1; + } + for (int tid = 0; tid < ngroup; ++tid) { + nrn_partrans::SetupInfo& si = setup_info_[tid]; + nrn_partrans::TransferThreadData& ttd = transfer_thread_data_[tid]; + ttd.v_indices = new int[ttd.nsrc]; + ttd.v_gather = new double[ttd.nsrc]; + ttd.outbuf_indices = new int[ttd.nsrc]; + ttd.nsrc = 0; // recount below as filled + ttd.ntar = si.ntar; + ttd.insrc_indices = new int[si.ntar]; + } + + // fill thread actual_v to send arrays. (offsets and layout later). + for (int i = 0; i < outsrcdspl_[nhost]; ++i) { + sgid_t sgid = send_to_want[i]; + SidData& sd = src2data[sgid]; + // only one item in the lists. + int tid = sd.tids_[0]; + int index = sd.indices_[0]; + + nrn_partrans::SetupInfo& si = setup_info_[tid]; + nrn_partrans::TransferThreadData& ttd = transfer_thread_data_[tid]; + + ttd.v_indices[ttd.nsrc] = si.v_indices[index]; + ttd.outbuf_indices[ttd.nsrc] = i; + ttd.nsrc += 1; + } + + // fill thread receive to vpre arrays. (offsets and layout later). + for (int i = 0; i < insrcdspl_[nhost]; ++i) { + sgid_t sgid = recv_from_have[i]; + SidData& sd = tar2data[sgid]; + // there may be several items in the lists. + for (unsigned j = 0; j < sd.tids_.size(); ++j) { + int tid = sd.tids_[j]; + int index = sd.indices_[j]; + + transfer_thread_data_[tid].insrc_indices[index] = i; + } + } + +#if 0 + // things look ok so far? + for (int tid=0; tid < ngroup; ++tid) { + nrn_partrans::SetupInfo& si = setup_info_[tid]; + nrn_partrans::TransferThreadData& ttd = transfer_thread_data_[tid]; + for (int i=0; i < si.nsrc; ++i) { + printf("%d %d src sid=%d v_index=%d\n", nrnmpi_myid, tid, si.sid_src[i], si.v_indices[i]); + } + for (int i=0; i < si.ntar; ++i) { + printf("%d %d tar sid=%d i=%d\n", nrnmpi_myid, tid, si.sid_target[i], i); + } + for (int i=0; i < ttd.nsrc; ++i) { + printf("%d %d src i=%d v_index=%d\n", nrnmpi_myid, tid, i, ttd.v_indices[i]); + } + for (int i=0; i < ttd.ntar; ++i) { + printf("%d %d tar i=%d insrc_index=%d\n", nrnmpi_myid, tid, i, ttd.insrc_indices[i]); + } + } +#endif + + // cleanup + for (int tid = 0; tid < ngroup; ++tid) { + SetupInfo& si = setup_info_[tid]; + delete[] si.sid_src; + delete[] si.v_indices; + delete[] si.sid_target; + } + delete[] send_to_want; + delete[] recv_from_have; + delete[] setup_info_; +} + +void nrn_partrans::gap_thread_setup(NrnThread& nt) { + // printf("%d gap_thread_setup tid=%d\n", nrnmpi_myid, nt.id); + nrn_partrans::TransferThreadData& ttd = transfer_thread_data_[nt.id]; + + ttd.halfgap_ml = nt._ml_list[halfgap_info->type]; +#if 0 + int ntar = ttd.halfgap_ml->nodecount; + assert(ntar == ttd.ntar); + int sz =halfgap_info->sz; + + for (int i=0; i < ntar; ++i) { + ttd.insrc_indices[i] += sz; + } +#endif +} + +void nrn_partrans::gap_indices_permute(NrnThread& nt) { + printf("nrn_partrans::gap_indices_permute\n"); + nrn_partrans::TransferThreadData& ttd = transfer_thread_data_[nt.id]; + // sources + if (ttd.nsrc > 0 && nt._permute) { + int n = ttd.nsrc; + int* iv = ttd.v_indices; + int* ip = nt._permute; + // iv starts out as indices into unpermuted node array. That node + // was permuted to index ip + for (int i = 0; i < n; ++i) { + iv[i] = ip[iv[i]]; + } + } + // now the outsrc_buf_ is invariant under any node permutation, + // and, consequently, so is the insrc_buf_. + + // targets + if (ttd.halfgap_ml && ttd.halfgap_ml->_permute) { + int n = ttd.halfgap_ml->nodecount; + int* ip = ttd.halfgap_ml->_permute; + int* isi = ttd.insrc_indices; + // halfgap has been permuted according to ip. 
+ // so old index value needs to be put into the new location. + int* oldisi = new int[n]; + for (int i = 0; i < n; ++i) { + oldisi[i] = isi[i]; + } + for (int i = 0; i < n; ++i) { + isi[ip[i]] = oldisi[i]; + } + delete[] oldisi; + } +} diff --git a/coreneuron/nrniv/patternstim.cpp b/coreneuron/nrniv/patternstim.cpp index a342b0c2e..5490e7df4 100644 --- a/coreneuron/nrniv/patternstim.cpp +++ b/coreneuron/nrniv/patternstim.cpp @@ -44,8 +44,16 @@ THE POSSIBILITY OF SUCH DAMAGE. extern "C" { void _pattern_reg(void); -extern void pattern_stim_setup_helper(int size, double* tvec, int* gidvec, - int icnt, int cnt, double* _p, Datum* _ppvar, ThreadDatum* _thread, NrnThread* _nt, double v); +extern void pattern_stim_setup_helper(int size, + double* tvec, + int* gidvec, + int icnt, + int cnt, + double* _p, + Datum* _ppvar, + ThreadDatum* _thread, + NrnThread* _nt, + double v); } static int read_raster_file(const char* fname, double** tvec, int** gidvec); @@ -53,81 +61,81 @@ static int read_raster_file(const char* fname, double** tvec, int** gidvec); int nrn_extra_thread0_vdata; void nrn_set_extra_thread0_vdata() { - // limited to PatternStim for now. - // if called, must be called before nrn_setup and after mk_mech. - int type = nrn_get_mechtype("PatternStim"); - if (!memb_func[type].sym) { - // the NEURON mod file version is not vectorized so the param size - // differs by 1 from the coreneuron version. - nrn_prop_param_size_[type] += 1; - _pattern_reg(); - } - nrn_extra_thread0_vdata = nrn_prop_dparam_size_[type]; + // limited to PatternStim for now. + // if called, must be called before nrn_setup and after mk_mech. + int type = nrn_get_mechtype("PatternStim"); + if (!memb_func[type].sym) { + // the NEURON mod file version is not vectorized so the param size + // differs by 1 from the coreneuron version. + nrn_prop_param_size_[type] += 1; + _pattern_reg(); + } + nrn_extra_thread0_vdata = nrn_prop_dparam_size_[type]; } // fname is the filename of an output_spikes.h format raster file. 
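// For illustration only (not from the patch): a minimal raster file of the kind
// read_raster_file() below accepts. The first line is read with fgets() and
// discarded as a header, and each following line is parsed as a "<time> <gid>"
// pair with fscanf("%lf %d"). The header text and the values shown here are
// made up.
//
//     spike raster            (header line, contents ignored)
//     10.125 3
//     10.15 7
//     25.625 3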
void nrn_mkPatternStim(const char* fname) { - int type = nrn_get_mechtype("PatternStim"); - if (!memb_func[type].sym) { - printf("nrn_set_extra_thread_vdata must be called (after mk_mech, and before nrn_setup\n"); - assert(0); - } - - double* tvec; - int* gidvec; - int size = read_raster_file(fname, &tvec, &gidvec); - printf("raster size = %d\n", size); + int type = nrn_get_mechtype("PatternStim"); + if (!memb_func[type].sym) { + printf("nrn_set_extra_thread_vdata must be called (after mk_mech, and before nrn_setup\n"); + assert(0); + } + + double* tvec; + int* gidvec; + int size = read_raster_file(fname, &tvec, &gidvec); + printf("raster size = %d\n", size); #if 0 for (int i=0; i < size; ++i) { printf("%g %d\n", tvec[i], gidvec[i]);} #endif - Point_process* pnt = nrn_artcell_instantiate("PatternStim"); - NrnThread* nt = nrn_threads + pnt->_tid; - Memb_list* ml = nt->_ml_list[type]; - int layout = nrn_mech_data_layout_[type]; - int sz = nrn_prop_param_size_[type]; - int psz = nrn_prop_dparam_size_[type]; - int _cntml = ml->nodecount; - int _iml = pnt->_i_instance; - double* _p = ml->data; - Datum* _ppvar = ml->pdata; - if (layout == 1) { - _p += _iml*sz; _ppvar += _iml*psz; - }else if (layout == 0) { - ; - }else{ - assert(0); - } - pattern_stim_setup_helper(size, tvec, gidvec, _iml, _cntml, _p, _ppvar, NULL, nt, 0.0); + Point_process* pnt = nrn_artcell_instantiate("PatternStim"); + NrnThread* nt = nrn_threads + pnt->_tid; + Memb_list* ml = nt->_ml_list[type]; + int layout = nrn_mech_data_layout_[type]; + int sz = nrn_prop_param_size_[type]; + int psz = nrn_prop_dparam_size_[type]; + int _cntml = ml->nodecount; + int _iml = pnt->_i_instance; + double* _p = ml->data; + Datum* _ppvar = ml->pdata; + if (layout == 1) { + _p += _iml * sz; + _ppvar += _iml * psz; + } else if (layout == 0) { + ; + } else { + assert(0); + } + pattern_stim_setup_helper(size, tvec, gidvec, _iml, _cntml, _p, _ppvar, NULL, nt, 0.0); } int read_raster_file(const char* fname, double** tvec, int** gidvec) { - FILE* f = fopen(fname, "r"); - assert(f); - int size = 0; - int bufsize = 10000; - *tvec = (double*)emalloc(bufsize*sizeof(double)); - *gidvec = (int*)emalloc(bufsize*sizeof(int)); - - double st; - int gid; - char dummy[100]; - nrn_assert(fgets(dummy, 100, f)); - while (fscanf(f, "%lf %d\n", &st, &gid) == 2) { - if (size >= bufsize) { - bufsize *= 2; - *tvec = (double*)erealloc(*tvec, bufsize*sizeof(double)); - *gidvec = (int*)erealloc(*gidvec, bufsize*sizeof(int)); + FILE* f = fopen(fname, "r"); + assert(f); + int size = 0; + int bufsize = 10000; + *tvec = (double*)emalloc(bufsize * sizeof(double)); + *gidvec = (int*)emalloc(bufsize * sizeof(int)); + + double st; + int gid; + char dummy[100]; + nrn_assert(fgets(dummy, 100, f)); + while (fscanf(f, "%lf %d\n", &st, &gid) == 2) { + if (size >= bufsize) { + bufsize *= 2; + *tvec = (double*)erealloc(*tvec, bufsize * sizeof(double)); + *gidvec = (int*)erealloc(*gidvec, bufsize * sizeof(int)); + } + (*tvec)[size] = st; + (*gidvec)[size] = gid; + ++size; } - (*tvec)[size] = st; - (*gidvec)[size] = gid; - ++size; - } - fclose(f); - return size; + fclose(f); + return size; } - // Opportunistically implemented to create a single PatternStim. // So only does enough to get that functionally incorporated into the model // and other types may require additional work. In particular, we @@ -136,82 +144,85 @@ int read_raster_file(const char* fname, double** tvec, int** gidvec) { // we do not modify any of the other thread 0 data arrays or counts. 
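// Before the single-instance nrn_artcell_instantiate() below, a hedged sketch
// (not part of this patch) of the call order that the comments in this file
// require: mechanisms registered first, the extra thread-0 _vdata slot reserved
// before nrn_setup, and the stimulus attached afterwards. The mk_mech/nrn_setup
// argument lists are elided, and "spikes.dat" is a hypothetical raster file name.
#if 0 /* illustrative only */
    mk_mech(/* ... */);               // register mechanism types (makes PatternStim known)
    nrn_set_extra_thread0_vdata();    // must run after mk_mech and before nrn_setup
    nrn_setup(/* ... */);             // builds the model, allocating _vdata with the extra slots
    nrn_mkPatternStim("spikes.dat");  // reads the raster file and wires up the stimulus
#endif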
Point_process* nrn_artcell_instantiate(const char* mechname) { - int type = nrn_get_mechtype(mechname); - printf("nrn_artcell_instantiate %s type=%d\n", mechname, type); - NrnThread* nt = nrn_threads + 0; - - // see nrn_setup.cpp:read_phase2 for how it creates NrnThreadMembList instances. - // create and append to nt.tml - assert(nt->_ml_list[type] == NULL); //FIXME - NrnThreadMembList* tml = (NrnThreadMembList*)emalloc(sizeof(NrnThreadMembList)); - tml->ml = (Memb_list*)emalloc(sizeof(Memb_list)); - tml->dependencies = NULL; - tml->ndependencies = 0; - nt->_ml_list[type] = tml->ml; - tml->index = type; - tml->next = NULL; - if (!nt->tml) { - nt->tml = tml; - }else{ - for (NrnThreadMembList* i = nt->tml; i; i = i->next) { - if (!i->next) { - i->next = tml; - break; - } + int type = nrn_get_mechtype(mechname); + printf("nrn_artcell_instantiate %s type=%d\n", mechname, type); + NrnThread* nt = nrn_threads + 0; + + // see nrn_setup.cpp:read_phase2 for how it creates NrnThreadMembList instances. + // create and append to nt.tml + assert(nt->_ml_list[type] == NULL); // FIXME + NrnThreadMembList* tml = (NrnThreadMembList*)emalloc(sizeof(NrnThreadMembList)); + tml->ml = (Memb_list*)emalloc(sizeof(Memb_list)); + tml->dependencies = NULL; + tml->ndependencies = 0; + nt->_ml_list[type] = tml->ml; + tml->index = type; + tml->next = NULL; + if (!nt->tml) { + nt->tml = tml; + } else { + for (NrnThreadMembList* i = nt->tml; i; i = i->next) { + if (!i->next) { + i->next = tml; + break; + } + } } - } - - // fill in tml->ml info. The data is not in the cache efficient - // NrnThread arrays but there should not be many of these instances. - int psize = nrn_prop_param_size_[type]; - int dsize = nrn_prop_dparam_size_[type]; - // int layout = nrn_mech_data_layout_[type]; // not needed because singleton - Memb_list* ml = tml->ml; - ml->nodecount = 1; - ml->_nodecount_padded = ml->nodecount; - ml->nodeindices = NULL; - ml->data = (double*)ecalloc(ml->nodecount*psize, sizeof(double)); - ml->pdata = (Datum*)ecalloc(ml->nodecount*dsize, sizeof(Datum)); - ml->_thread = NULL; - - // Here we have a problem with no easy general solution. ml->pdata are - // integer indexes into the nt->_data nt->_idata and nt->_vdata array - // depending on context, - // but nrn_setup.cpp allocated these to exactly have the size needed by - // the file defined model (at least for _vdata) and so there are no slots - // for pdata to index into for this new instance. - // So nrn_setup.cpp:phase2 needs to - // be notified that some extra space will be required. For now, defer - // the general situation of several instances for several types and - // demand that this method is never called more than once. 
We introduce - // a int nrn_extra_thread0_vdata (only that is needed by PatternStim) - // which will be used by - // nrn_setup.cpp:phase2 to allocate the appropriately larger - // _vdata arrays for thread 0 (without changing _nvdata so - // that we can fill in the indices here) - static int cnt = 0; - if (++cnt > 1) { - printf("nrn_artcell_instantiate cannot be called more than once\n"); - assert(0); - } - // note that PatternStim internal usage for the 4 ppvar values is: - // #define _nd_area _nt->_data[_ppvar[0]] (not used since ARTIFICIAL_CELL) - // #define _p_ptr _nt->_vdata[_ppvar[2]] (the BBCORE_POINTER) - // #define _tqitem &(_nt->_vdata[_ppvar[3]]) (for net_send) - // and general external usage is: - // _nt->_vdata[_ppvar[1]] = Point_process* - // - - Point_process* pnt = new Point_process; - pnt->_type = type; - pnt->_tid = nt->id; - pnt->_i_instance = 0; - // as though all dparam index into _vdata - assert(dsize <= nrn_extra_thread0_vdata); - for (int i=0; i < dsize; ++i) { - ml->pdata[i] = nt->_nvdata + i; - } - nt->_vdata[nt->_nvdata + 1] = (void*)pnt; - - return pnt; + + // fill in tml->ml info. The data is not in the cache efficient + // NrnThread arrays but there should not be many of these instances. + int psize = nrn_prop_param_size_[type]; + int dsize = nrn_prop_dparam_size_[type]; + // int layout = nrn_mech_data_layout_[type]; // not needed because singleton + Memb_list* ml = tml->ml; + ml->nodecount = 1; + ml->_nodecount_padded = ml->nodecount; + ml->nodeindices = NULL; + ml->data = (double*)ecalloc(ml->nodecount * psize, sizeof(double)); + ml->pdata = (Datum*)ecalloc(ml->nodecount * dsize, sizeof(Datum)); + ml->_thread = NULL; + ml->_net_receive_buffer = NULL; + ml->_net_send_buffer = NULL; + ml->_permute = NULL; + + // Here we have a problem with no easy general solution. ml->pdata are + // integer indexes into the nt->_data nt->_idata and nt->_vdata array + // depending on context, + // but nrn_setup.cpp allocated these to exactly have the size needed by + // the file defined model (at least for _vdata) and so there are no slots + // for pdata to index into for this new instance. + // So nrn_setup.cpp:phase2 needs to + // be notified that some extra space will be required. For now, defer + // the general situation of several instances for several types and + // demand that this method is never called more than once. 
We introduce + // a int nrn_extra_thread0_vdata (only that is needed by PatternStim) + // which will be used by + // nrn_setup.cpp:phase2 to allocate the appropriately larger + // _vdata arrays for thread 0 (without changing _nvdata so + // that we can fill in the indices here) + static int cnt = 0; + if (++cnt > 1) { + printf("nrn_artcell_instantiate cannot be called more than once\n"); + assert(0); + } + // note that PatternStim internal usage for the 4 ppvar values is: + // #define _nd_area _nt->_data[_ppvar[0]] (not used since ARTIFICIAL_CELL) + // #define _p_ptr _nt->_vdata[_ppvar[2]] (the BBCORE_POINTER) + // #define _tqitem &(_nt->_vdata[_ppvar[3]]) (for net_send) + // and general external usage is: + // _nt->_vdata[_ppvar[1]] = Point_process* + // + + Point_process* pnt = new Point_process; + pnt->_type = type; + pnt->_tid = nt->id; + pnt->_i_instance = 0; + // as though all dparam index into _vdata + assert(dsize <= nrn_extra_thread0_vdata); + for (int i = 0; i < dsize; ++i) { + ml->pdata[i] = nt->_nvdata + i; + } + nt->_vdata[nt->_nvdata + 1] = (void*)pnt; + + return pnt; } diff --git a/coreneuron/nrniv/prcellstate.cpp b/coreneuron/nrniv/prcellstate.cpp index 47ca363cc..b4b53f5b0 100644 --- a/coreneuron/nrniv/prcellstate.cpp +++ b/coreneuron/nrniv/prcellstate.cpp @@ -34,175 +34,264 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrniv/netcon.h" #include "coreneuron/nrnoc/nrnoc_decl.h" #include "coreneuron/utils/sdprintf.h" +#include "coreneuron/nrniv/nrniv_decl.h" +#include "coreneuron/nrniv/nrn_assert.h" -std::map pnt2index; // for deciding if NetCon is to be printed -static int pntindex; // running count of printed point processes. +#define precision 15 + +static std::map pnt2index; // for deciding if NetCon is to be printed +static int pntindex; // running count of printed point processes. +static std::map map_nc2src; +static std::vector* inv_permute_; + +static int permute(int i, NrnThread& nt) { + return nt._permute ? nt._permute[i] : i; +} + +static int inv_permute(int i, NrnThread& nt) { + nrn_assert(i >= 0 && i < nt.end); + if (!nt._permute) { + return i; + } + if (!inv_permute_) { + inv_permute_ = new std::vector(nt.end); + for (int i = 0; i < nt.end; ++i) { + (*inv_permute_)[nt._permute[i]] = i; + } + } + return (*inv_permute_)[i]; +} + +static int ml_permute(int i, Memb_list* ml) { + return ml->_permute ? ml->_permute[i] : i; +} + +// Note: cellnodes array is in unpermuted order. static void pr_memb(int type, Memb_list* ml, int* cellnodes, NrnThread& nt, FILE* f) { - int is_art = nrn_is_artificial_[type]; - if (is_art) - return; - - int header_printed = 0; - int size = nrn_prop_param_size_[type]; - int psize = nrn_prop_dparam_size_[type]; - int receives_events = pnt_receive[type] ? 
1 : 0; - int layout = nrn_mech_data_layout_[type]; - int cnt = ml->nodecount; - for (int i = 0; i < ml->nodecount; ++i) { - int inode = ml->nodeindices[i]; - if (cellnodes[inode] >= 0) { - if (!header_printed) { - header_printed = 1; - fprintf(f, "type=%d %s size=%d\n", type, memb_func[type].sym, size); - } - if (receives_events) { - fprintf(f, "%d nri %d\n", cellnodes[inode], pntindex); - int k = nrn_i_layout(i, cnt, 1, psize, layout); - Point_process* pp = (Point_process*)nt._vdata[ml->pdata[k]]; - pnt2index[pp] = pntindex; - ++pntindex; - } - for (int j=0; j < size; ++j) { - int k = nrn_i_layout(i, cnt, j, size, layout); - fprintf(f, " %d %d %.15g\n", cellnodes[inode], j, ml->data[k]); - } + int is_art = nrn_is_artificial_[type]; + if (is_art) + return; + + int header_printed = 0; + int size = nrn_prop_param_size_[type]; + int psize = nrn_prop_dparam_size_[type]; + int receives_events = pnt_receive[type] ? 1 : 0; + int layout = nrn_mech_data_layout_[type]; + int cnt = ml->nodecount; + for (int iorig = 0; iorig < ml->nodecount; ++iorig) { // original index + int i = ml_permute(iorig, ml); // present index + int inode = ml->nodeindices[i]; // inode is the permuted node + int cix = cellnodes[inv_permute(inode, nt)]; // original index relative to this cell + if (cix >= 0) { + if (!header_printed) { + header_printed = 1; + fprintf(f, "type=%d %s size=%d\n", type, memb_func[type].sym, size); + } + if (receives_events) { + fprintf(f, "%d nri %d\n", cix, pntindex); + int k = nrn_i_layout(i, cnt, 1, psize, layout); + Point_process* pp = (Point_process*)nt._vdata[ml->pdata[k]]; + pnt2index[pp] = pntindex; + ++pntindex; + } + for (int j = 0; j < size; ++j) { + int k = nrn_i_layout(i, cnt, j, size, layout); + fprintf(f, " %d %d %.*g\n", cix, j, precision, ml->data[k]); + } + } } - } } static void pr_netcon(NrnThread& nt, FILE* f) { - if (pntindex == 0) { return; } - // pnt2index table has been filled - - // List of NetCon for each of the NET_RECEIVE point process instances - std::vector< std::vector > nclist; - nclist.resize(pntindex); - int nc_cnt = 0; - for (int i=0; i < nt.n_netcon; ++i) { - NetCon* nc = nt.netcons + i; - Point_process* pp = nc->target_; - std::map::iterator it = pnt2index.find(pp); - if (it != pnt2index.end()) { - nclist[it->second].push_back(nc); - ++nc_cnt; + if (pntindex == 0) { + return; + } + // pnt2index table has been filled + + // List of NetCon for each of the NET_RECEIVE point process instances + // Also create the initial map of NetCon <-> DiscreteEvent (PreSyn) + std::vector > nclist; + nclist.resize(pntindex); + map_nc2src.clear(); + int nc_cnt = 0; + for (int i = 0; i < nt.n_netcon; ++i) { + NetCon* nc = nt.netcons + i; + Point_process* pp = nc->target_; + std::map::iterator it = pnt2index.find(pp); + if (it != pnt2index.end()) { + nclist[it->second].push_back(nc); + map_nc2src[nc] = NULL; + ++nc_cnt; + } } - } - fprintf(f, "netcons %d\n", nc_cnt); - fprintf(f, " pntindex srcgid active delay weights\n"); - for (int i=0; i < pntindex; ++i) { - for (int j=0; j < (int)(nclist[i].size()); ++j) { - NetCon* nc = nclist[i][j]; - int srcgid = -3; - if (nc->src_) { - if (nc->src_->type() == PreSynType) { - PreSyn* ps = (PreSyn*)nc->src_; - srcgid = ps->gid_; - if (srcgid < 0 && ps->pntsrc_) { - int type = ps->pntsrc_->_type; - fprintf(f, "%d %s %d %.15g", i, memb_func[type].sym, nc->active_?1:0, nc->delay_); - }else if (srcgid < 0 && ps->thvar_) { - fprintf(f, "%d %s %d %.15g", i, "v", nc->active_?1:0, nc->delay_); - }else{ - fprintf(f, "%d %d %d %.15g", i, srcgid, 
nc->active_?1:0, nc->delay_); - } - }else{ - srcgid = ((InputPreSyn*)nc->src_)->gid_; - fprintf(f, "%d %d %d %.15g", i, srcgid, nc->active_?1:0, nc->delay_); + fprintf(f, "netcons %d\n", nc_cnt); + fprintf(f, " pntindex srcgid active delay weights\n"); + + /// Fill the NetCon <-> DiscreteEvent map with PreSyn-s + DiscreteEvent* de; + std::map::iterator it_nc2src; + // presyns can come from any thread + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& ntps = nrn_threads[ith]; + for (int i = 0; i < ntps.n_presyn; ++i) { + PreSyn* ps = ntps.presyns + i; + for (int j = 0; j < ps->nc_cnt_; ++j) { + NetCon* nc = netcon_in_presyn_order_[ps->nc_index_ + j]; + it_nc2src = map_nc2src.find(nc); + if (it_nc2src != map_nc2src.end()) { + it_nc2src->second = ps; + } + } + } + } + + /// Fill the NetCon <-> DiscreteEvent map with InputPreSyn-s + /// Traverse gid <-> InputPreSyn map and loop over NetCon-s of the + /// correspondent InputPreSyn. If NetCon is in the nc2src map, + /// remember its ips and the gid + std::map map_nc2gid; + std::map::iterator it_gid2in = gid2in.begin(); + for (; it_gid2in != gid2in.end(); ++it_gid2in) { + InputPreSyn* ips = it_gid2in->second; /// input presyn + for (int i = 0; i < ips->nc_cnt_; ++i) { + NetCon* nc = netcon_in_presyn_order_[ips->nc_index_ + i]; + it_nc2src = map_nc2src.find(nc); + if (it_nc2src != map_nc2src.end()) { + it_nc2src->second = ips; + map_nc2gid[nc] = it_gid2in->first; /// src gid of the input presyn + } } - }else{ - fprintf(f, "%d %d %d %.15g", i, srcgid, nc->active_?1:0, nc->delay_); - } - int wcnt = pnt_receive_size[nc->target_->_type]; - for (int k=0; k < wcnt; ++k) { - fprintf(f, " %.15g", nc->u.weight_[k]); - } - fprintf(f, "\n"); } - } - // cleanup - nclist.clear(); + + for (int i = 0; i < pntindex; ++i) { + for (int j = 0; j < (int)(nclist[i].size()); ++j) { + NetCon* nc = nclist[i][j]; + int srcgid = -3; + it_nc2src = map_nc2src.find(nc); + if (it_nc2src != + map_nc2src.end()) { // seems like there should be no NetCon which is not in the map + de = it_nc2src->second; + if (de && de->type() == PreSynType) { + PreSyn* ps = (PreSyn*)de; + srcgid = ps->gid_; + Point_process* pnt = ps->pntsrc_; + if (srcgid < 0 && pnt) { + int type = pnt->_type; + fprintf(f, "%d %s %d %.*g", i, memb_func[type].sym, nc->active_ ? 1 : 0, + precision, nc->delay_); + } else if (srcgid < 0 && ps->thvar_index_ > 0) { + fprintf(f, "%d %s %d %.*g", i, "v", nc->active_ ? 1 : 0, precision, + nc->delay_); + } else { + fprintf(f, "%d %d %d %.*g", i, srcgid, nc->active_ ? 1 : 0, precision, + nc->delay_); + } + } else { + fprintf(f, "%d %d %d %.*g", i, map_nc2gid[nc], nc->active_ ? 1 : 0, precision, + nc->delay_); + } + } else { + fprintf(f, "%d %d %d %.*g", i, srcgid, nc->active_ ? 1 : 0, precision, nc->delay_); + } + int wcnt = pnt_receive_size[nc->target_->_type]; + for (int k = 0; k < wcnt; ++k) { + fprintf(f, " %.*g", precision, nt.weights[nc->u.weight_index_ + k]); + } + fprintf(f, "\n"); + } + } + // cleanup + nclist.clear(); } static void pr_realcell(PreSyn& ps, NrnThread& nt, FILE* f) { - //for associating NetCons with Point_process identifiers - - pntindex = 0; - - // threshold variable is a voltage -printf("thvar=%p actual_v=%p end=%p\n", ps.thvar_, nt._actual_v, -nt._actual_v + nt.end); - if (ps.thvar_ < nt._actual_v || ps.thvar_ >= (nt._actual_v + nt.end)) { - hoc_execerror("gid not associated with a voltage", 0); - } - int inode = ps.thvar_ - nt._actual_v; - - // and the root node is ... 
- int rnode = inode; - while (rnode >= nt.ncell) { - rnode = nt._v_parent_index[rnode]; - } - - // count the number of nodes in the cell - // do not assume all cell nodes except the root are contiguous - int* cellnodes = new int[nt.end]; - for (int i=0; i < nt.end; ++i) { cellnodes[i] = -1; } - int cnt = 0; - cellnodes[rnode] = cnt++; - for (int i=nt.ncell; i < nt.end; ++i) { - if (cellnodes[nt._v_parent_index[i]] >= 0) { - cellnodes[i] = cnt++; + // for associating NetCons with Point_process identifiers + + pntindex = 0; + + // threshold variable is a voltage + printf("thvar_index_=%d end=%d\n", inv_permute(ps.thvar_index_, nt), nt.end); + if (ps.thvar_index_ < 0 || ps.thvar_index_ >= nt.end) { + hoc_execerror("gid not associated with a voltage", 0); + } + int inode = ps.thvar_index_; + + // and the root node is ... + int rnode = inode; + while (rnode >= nt.ncell) { + rnode = nt._v_parent_index[rnode]; + } + + // count the number of nodes in the cell + // do not assume all cell nodes except the root are contiguous + // cellnodes is an unpermuted vector + int* cellnodes = new int[nt.end]; + for (int i = 0; i < nt.end; ++i) { + cellnodes[i] = -1; + } + int cnt = 0; + cellnodes[inv_permute(rnode, nt)] = cnt++; + for (int i = nt.ncell; i < nt.end; ++i) { // think of it as unpermuted order + if (cellnodes[inv_permute(nt._v_parent_index[permute(i, nt)], nt)] >= 0) { + cellnodes[i] = cnt++; + } + } + fprintf(f, "%d nodes %d is the threshold node\n", cnt, cellnodes[inv_permute(inode, nt)] - 1); + fprintf(f, " threshold %.*g\n", precision, ps.threshold_); + fprintf(f, "inode parent area a b\n"); + for (int iorig = 0; iorig < nt.end; ++iorig) + if (cellnodes[iorig] >= 0) { + int i = permute(iorig, nt); + int ip = nt._v_parent_index[i]; + fprintf(f, "%d %d %.*g %.*g %.*g\n", cellnodes[iorig], + ip >= 0 ? 
cellnodes[inv_permute(ip, nt)] : -1, precision, nt._actual_area[i], + precision, nt._actual_a[i], precision, nt._actual_b[i]); + } + fprintf(f, "inode v\n"); + for (int i = 0; i < nt.end; ++i) + if (cellnodes[i] >= 0) { + fprintf(f, "%d %.*g\n", cellnodes[i], precision, nt._actual_v[permute(i, nt)]); + } + + // each mechanism + for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) { + pr_memb(tml->index, tml->ml, cellnodes, nt, f); + } + + // the NetCon info (uses pnt2index) + pr_netcon(nt, f); + + delete[] cellnodes; + pnt2index.clear(); + if (inv_permute_) { + delete inv_permute_; + inv_permute_ = NULL; } - } - fprintf(f, "%d nodes %d is the threshold node\n", cnt, cellnodes[inode]-1); - fprintf(f, " threshold %.15g\n", ps.threshold_); - fprintf(f, "inode parent area a b\n"); - for (int i=0; i < nt.end; ++i) if (cellnodes[i] >= 0) { - fprintf(f, "%d %d %.15g %.15g %.15g\n", - cellnodes[i], cellnodes[nt._v_parent_index[i]], - nt._actual_area[i], nt._actual_a[i], nt._actual_b[i]); - } - fprintf(f, "inode v\n"); - for (int i=0; i < nt.end; ++i) if (cellnodes[i] >= 0) { - fprintf(f, "%d %.15g\n", - cellnodes[i], nt._actual_v[i]); - } - - // each mechanism - for (NrnThreadMembList* tml = nt.tml; tml; tml = tml->next) { - pr_memb(tml->index, tml->ml, cellnodes, nt, f); - } - - // the NetCon info (uses pnt2index) - pr_netcon(nt, f); - - delete [] cellnodes; - pnt2index.clear(); } int prcellstate(int gid, const char* suffix) { - //search the NrnThread.presyns for the gid - for (int ith=0; ith < nrn_nthread; ++ith) { - NrnThread& nt = nrn_threads[ith]; - for (int ip = 0; ip < nt.n_presyn; ++ip) { - PreSyn& ps = nt.presyns[ip]; - if (ps.output_index_ == gid) { - // found it so create a _.corenrn file - char buf[200]; - sd_ptr filename=sdprintf(buf, sizeof(buf), "%d_%s.corenrn", gid, suffix); - FILE* f = fopen(filename, "w"); - assert(f); - fprintf(f, "gid = %d\n", gid); - fprintf(f, "t = %.15g\n", nt._t); - fprintf(f, "celsius = %.15g\n", celsius); - if (ps.thvar_) { - pr_realcell(ps, nt, f); + // search the NrnThread.presyns for the gid + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& nt = nrn_threads[ith]; + for (int ip = 0; ip < nt.n_presyn; ++ip) { + PreSyn& ps = nt.presyns[ip]; + if (ps.output_index_ == gid) { + // found it so create a _.corenrn file + char buf[200]; + sd_ptr filename = sdprintf(buf, sizeof(buf), "%d_%s.corenrn", gid, suffix); + FILE* f = fopen(filename, "w"); + assert(f); + fprintf(f, "gid = %d\n", gid); + fprintf(f, "t = %.*g\n", precision, nt._t); + fprintf(f, "celsius = %.*g\n", precision, celsius); + if (ps.thvar_index_ >= 0) { + pr_realcell(ps, nt, f); + } + fclose(f); + return 1; + } } - fclose(f); - return 1; - } } - } - return 0; + return 0; } - diff --git a/coreneuron/nrniv/profiler_interface.cpp b/coreneuron/nrniv/profiler_interface.cpp new file mode 100644 index 000000000..411f0ab75 --- /dev/null +++ b/coreneuron/nrniv/profiler_interface.cpp @@ -0,0 +1,53 @@ +#include + +#ifdef CUDA_PROFILING +#include "coreneuron/nrniv/cuda_profile.h" +#endif + +#ifdef CRAYPAT +#include +#endif + +#if defined(_OPENACC) +#include + +static int cray_acc_debug_orig = 0; +static int cray_acc_debug_zero = 0; +#endif + +extern int nrnmpi_myid; + +void start_profile() { + if (nrnmpi_myid == 0) + printf("\n ----- GPU PROFILING STARTED -----\n"); + +#ifdef CRAYPAT + PAT_record(PAT_STATE_ON); +#endif + +#if defined(_OPENACC) && defined(_CRAYC) + cray_acc_set_debug_global_level(cray_acc_debug_orig); +#endif + +#ifdef CUDA_PROFILING + start_cuda_profile(); +#endif +} + 
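// A usage sketch under assumptions (not part of this patch): the caller is
// expected to bracket only the region of interest, e.g. the main time-stepping
// loop, so that CUDA/CrayPAT recording covers just that span. run_region() is a
// hypothetical placeholder for the code being measured; stop_profile() is
// defined just below.
#if 0 /* illustrative only */
#include "coreneuron/nrniv/profiler_interface.h"

void profiled_run() {
    start_profile();  // begin CUDA/CrayPAT recording (if compiled in)
    run_region();     // hypothetical: the hot region to be measured
    stop_profile();   // end recording
}
#endif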
+void stop_profile() { + if (nrnmpi_myid == 0) + printf("\n ----- GPU PROFILING STOPPED -----\n"); + +#ifdef CRAYPAT + PAT_record(PAT_STATE_OFF); +#endif + +#if defined(_OPENACC) && defined(_CRAYC) + cray_acc_debug_orig = cray_acc_get_debug_global_level(); + cray_acc_set_debug_global_level(cray_acc_debug_zero); +#endif + +#ifdef CUDA_PROFILING + stop_cuda_profile(); +#endif +} diff --git a/coreneuron/nrniv/profiler_interface.h b/coreneuron/nrniv/profiler_interface.h new file mode 100644 index 000000000..ca851a288 --- /dev/null +++ b/coreneuron/nrniv/profiler_interface.h @@ -0,0 +1,7 @@ +#ifndef _profiler_interface_h_ +#define _profiler_interface_h_ + +void start_profile(); +void stop_profile(); + +#endif diff --git a/coreneuron/nrniv/tnode.h b/coreneuron/nrniv/tnode.h new file mode 100644 index 000000000..b7f1bbe84 --- /dev/null +++ b/coreneuron/nrniv/tnode.h @@ -0,0 +1,40 @@ +#ifndef tnode_h +#define tnode_h + +#include + +// experiment with ordering strategies for Tree Nodes + +class TNode; + +typedef std::vector VecTNode; + +class TNode { + public: + TNode(int ix); + virtual ~TNode(); + TNode* parent; + VecTNode children; + size_t mkhash(); + size_t hash; + size_t treesize; + size_t nodevec_index; + size_t treenode_order; + size_t level; + size_t cellindex; + size_t groupindex; + int nodeindex; +}; + +size_t level_from_leaf(VecTNode&); +size_t level_from_root(VecTNode&); + +void group_order2(VecTNode&, size_t groupsize, size_t ncell); +size_t dist2child(TNode* nd); + +// see balance.cpp +size_t warp_balance(size_t ncell, VecTNode& nodevec); + +#define warpsize 32 + +#endif diff --git a/coreneuron/nrniv/tqueue.cpp b/coreneuron/nrniv/tqueue.cpp index ae8c51edb..d78351b9b 100644 --- a/coreneuron/nrniv/tqueue.cpp +++ b/coreneuron/nrniv/tqueue.cpp @@ -50,199 +50,87 @@ Douglas Jones. 
of struct _spblk, we are really using TQItem */ - TQItem::TQItem() { left_ = 0; right_ = 0; parent_ = 0; } -TQItem::~TQItem() { -} - -TQueue::TQueue() { - MUTCONSTRUCT(0) - nshift_ = 0; - sptree_ = new SPTREE; - spinit(sptree_); - binq_ = new BinQ; - least_ = 0; - -#if COLLECT_TQueue_STATISTICS - nmove = ninsert = nrem = nleast = nbal = ncmplxrem = 0; - nfastmove = ncompare = nleastsrch = nfind = nfindsrch = 0; -#endif -} - -TQueue::~TQueue() { - SPBLK* q, *q2; - while((q = spdeq(&sptree_->root)) != NULL) { - delete q; - } - delete sptree_; - for (q = binq_->first(); q; q = q2) { - q2 = binq_->next(q); - remove(q); /// Potentially dereferences freed pointer this->sptree_ - } - delete binq_; - MUTDESTRUCT -} - -void TQueue::move_least_nolock(double tnew) { - TQItem* b = least(); - if (b) { - b->t_ = tnew; - TQItem* nl = sphead(sptree_); - if (nl) { - if (tnew > nl->t_) { - least_ = spdeq(&sptree_->root); - spenq(b, sptree_); - } - } - } -} - -void TQueue::move(TQItem* i, double tnew) { - MUTLOCK - STAT(nmove) - if (i == least_) { - move_least_nolock(tnew); - }else if (tnew < least_->t_) { - spdelete(i, sptree_); - i->t_ = tnew; - spenq(least_, sptree_); - least_ = i; - }else{ - spdelete(i, sptree_); - i->t_ = tnew; - spenq(i, sptree_); - } - MUTUNLOCK -} - -void TQueue::statistics() { -#if COLLECT_TQueue_STATISTICS - printf("insertions=%lu moves=%lu removals=%lu calls to least=%lu\n", - ninsert, nmove, nrem, nleast); - printf("calls to find=%lu\n", - nfind); - printf("comparisons=%d\n", - sptree_->enqcmps); -#else - printf("Turn on COLLECT_TQueue_STATISTICS_ in tqueue.h\n"); -#endif -} - -TQItem* TQueue::insert(double tt, void* d) { - MUTLOCK - STAT(ninsert); - TQItem* i = new TQItem; - i->data_ = d; - i->t_ = tt; - i->cnt_ = -1; - if (tt < least_t_nolock()) { - if (least()) { - spenq(least(), sptree_); - } - least_ = i; - }else{ - spenq(i, sptree_); - } - MUTUNLOCK - return i; -} - -void TQueue::remove(TQItem* q) { - MUTLOCK - STAT(nrem); - if (q) { - if (q == least_) { - if (sptree_->root) { - least_ = spdeq(&sptree_->root); - }else{ - least_ = NULL; - } - }else if (q->cnt_ >= 0) { - binq_->remove(q); - }else{ - spdelete(q, sptree_); - } - delete q; - } - MUTUNLOCK -} - -TQItem* TQueue::atomic_dq(double tt) { - TQItem* q = 0; - MUTLOCK - if (least_ && least_->t_ <= tt) { - q = least_; - STAT(nrem); - if (sptree_->root) { - least_ = spdeq(&sptree_->root); - }else{ - least_ = NULL; - } - } - MUTUNLOCK - return q; -} - BinQ::BinQ() { nbin_ = 1000; bins_ = new TQItem*[nbin_]; - for (int i=0; i < nbin_; ++i) { bins_[i] = 0; } + for (int i = 0; i < nbin_; ++i) { + bins_[i] = 0; + } qpt_ = 0; tt_ = 0.; #if COLLECT_TQueue_STATISTICS - nfenq = 0; + nfenq = nfdeq = 0; #endif } BinQ::~BinQ() { - for (int i=0; i < nbin_; ++i) { + for (int i = 0; i < nbin_; ++i) { assert(!bins_[i]); } - delete [] bins_; + delete[] bins_; + vec_bins.clear(); } void BinQ::resize(int size) { - //printf("BinQ::resize from %d to %d\n", nbin_, size); + // printf("BinQ::resize from %d to %d\n", nbin_, size); int i, j; TQItem* q; assert(size >= nbin_); TQItem** bins = new TQItem*[size]; - for (i=nbin_; i < size; ++i) { bins[i] = 0; } - for (i=0, j=qpt_; i < nbin_; ++i, ++j) { - if (j >= nbin_) { j = 0; } + for (i = nbin_; i < size; ++i) { + bins[i] = 0; + } + for (i = 0, j = qpt_; i < nbin_; ++i, ++j) { + if (j >= nbin_) { + j = 0; + } bins[i] = bins_[j]; for (q = bins[i]; q; q = q->left_) { q->cnt_ = i; } } - delete [] bins_; + delete[] bins_; bins_ = bins; nbin_ = size; qpt_ = 0; } void BinQ::enqueue(double td, TQItem* q) { 
- int idt = (int)((td - tt_)/nrn_threads->_dt + 1.e-10); + int idt = (int)((td - tt_) * rev_dt + 1.e-10); assert(idt >= 0); if (idt >= nbin_) { - resize(idt + 100); + resize(idt + 1000); } - //assert (idt < nbin_); + // assert (idt < nbin_); idt += qpt_; - if (idt >= nbin_) { idt -= nbin_; } -//printf("enqueue idt=%d qpt=%d\n", idt, qpt_); - assert (idt < nbin_); - q->cnt_ = idt; // only for iteration + if (idt >= nbin_) { + idt -= nbin_; + } + // printf("enqueue: idt=%d qpt=%d nbin_=%d\n", idt, qpt_, nbin_); + assert(idt < nbin_); + q->cnt_ = idt; // only for iteration q->left_ = bins_[idt]; bins_[idt] = q; #if COLLECT_TQueue_STATISTICS ++nfenq; #endif } +TQItem* BinQ::dequeue() { + TQItem* q = NULL; + q = bins_[qpt_]; + if (q) { + bins_[qpt_] = q->left_; +#if COLLECT_TQueue_STATISTICS + ++nfdeq; +#endif + } + return q; +} TQItem* BinQ::first() { for (int i = 0; i < nbin_; ++i) { @@ -253,7 +141,9 @@ TQItem* BinQ::first() { return 0; } TQItem* BinQ::next(TQItem* q) { - if (q->left_) { return q->left_; } + if (q->left_) { + return q->left_; + } for (int i = q->cnt_ + 1; i < nbin_; ++i) { if (bins_[i]) { return bins_[i]; @@ -263,7 +153,7 @@ TQItem* BinQ::next(TQItem* q) { } void BinQ::remove(TQItem* q) { - TQItem* q1, *q2; + TQItem *q1, *q2; q1 = bins_[q->cnt_]; if (q1 == q) { bins_[q->cnt_] = q->left_; @@ -277,7 +167,6 @@ void BinQ::remove(TQItem* q) { } } - //#include "coreneuron/nrniv/sptree.h" /* @@ -331,14 +220,11 @@ Hines changed to void spinit(SPTREE**) for use with TQueue. * spinit() -- initialize an empty splay tree * */ -void -spinit(SPTREE* q) -{ +void spinit(SPTREE* q) { q->enqcmps = 0; q->root = NULL; } - /*---------------- * * spenq() -- insert item in a tree. @@ -351,28 +237,25 @@ spinit(SPTREE* q) * performed along the way to shorten the left branch of the right subtree * and the right branch of the left subtree */ -SPBLK * -spenq( SPBLK* n, SPTREE* q ) -{ - SPBLK * left; /* the rightmost node in the left tree */ - SPBLK * right; /* the leftmost node in the right tree */ - SPBLK * next; /* the root of the unsplit part */ - SPBLK * temp; +SPBLK* spenq(SPBLK* n, SPTREE* q) { + SPBLK* left; /* the rightmost node in the left tree */ + SPBLK* right; /* the leftmost node in the right tree */ + SPBLK* next; /* the root of the unsplit part */ + SPBLK* temp; double key; #if STRCMP_DEF - int Sct; /* Strcmp value */ + int Sct; /* Strcmp value */ #endif n->uplink = NULL; next = q->root; q->root = n; - if( next == NULL ) /* trivial enq */ + if (next == NULL) /* trivial enq */ { n->leftlink = NULL; n->rightlink = NULL; - } - else /* difficult enq */ + } else /* difficult enq */ { key = n->key; left = n; @@ -382,107 +265,100 @@ spenq( SPBLK* n, SPTREE* q ) splayed trees resulting from splitting on n->key; note that the children will be reversed! 
*/ - q->enqcmps++; - if ( STRCMP( next->key, key ) > 0 ) - goto two; + q->enqcmps++; + if (STRCMP(next->key, key) > 0) + goto two; - one: /* assert next->key <= key */ + one: /* assert next->key <= key */ - do /* walk to the right in the left tree */ - { - temp = next->rightlink; - if( temp == NULL ) + do /* walk to the right in the left tree */ { + temp = next->rightlink; + if (temp == NULL) { left->rightlink = next; next->uplink = left; right->leftlink = NULL; - goto done; /* job done, entire tree split */ + goto done; /* job done, entire tree split */ } - q->enqcmps++; - if( STRCMP( temp->key, key ) > 0 ) - { + q->enqcmps++; + if (STRCMP(temp->key, key) > 0) { left->rightlink = next; next->uplink = left; left = next; next = temp; - goto two; /* change sides */ + goto two; /* change sides */ } next->rightlink = temp->leftlink; - if( temp->leftlink != NULL ) - temp->leftlink->uplink = next; + if (temp->leftlink != NULL) + temp->leftlink->uplink = next; left->rightlink = temp; temp->uplink = left; temp->leftlink = next; next->uplink = temp; left = temp; next = temp->rightlink; - if( next == NULL ) - { + if (next == NULL) { right->leftlink = NULL; - goto done; /* job done, entire tree split */ + goto done; /* job done, entire tree split */ } - q->enqcmps++; + q->enqcmps++; - } while( STRCMP( next->key, key ) <= 0 ); /* change sides */ + } while (STRCMP(next->key, key) <= 0); /* change sides */ - two: /* assert next->key > key */ + two: /* assert next->key > key */ - do /* walk to the left in the right tree */ - { - temp = next->leftlink; - if( temp == NULL ) + do /* walk to the left in the right tree */ { + temp = next->leftlink; + if (temp == NULL) { right->leftlink = next; next->uplink = right; left->rightlink = NULL; - goto done; /* job done, entire tree split */ + goto done; /* job done, entire tree split */ } - q->enqcmps++; - if( STRCMP( temp->key, key ) <= 0 ) - { + q->enqcmps++; + if (STRCMP(temp->key, key) <= 0) { right->leftlink = next; next->uplink = right; right = next; next = temp; - goto one; /* change sides */ + goto one; /* change sides */ } next->leftlink = temp->rightlink; - if( temp->rightlink != NULL ) - temp->rightlink->uplink = next; + if (temp->rightlink != NULL) + temp->rightlink->uplink = next; right->leftlink = temp; temp->uplink = right; temp->rightlink = next; next->uplink = temp; right = temp; next = temp->leftlink; - if( next == NULL ) - { + if (next == NULL) { left->rightlink = NULL; - goto done; /* job done, entire tree split */ + goto done; /* job done, entire tree split */ } - q->enqcmps++; + q->enqcmps++; - } while( STRCMP( next->key, key ) > 0 ); /* change sides */ + } while (STRCMP(next->key, key) > 0); /* change sides */ goto one; - done: /* split is done, branches of n need reversal */ + done: /* split is done, branches of n need reversal */ temp = n->leftlink; n->leftlink = n->rightlink; n->rightlink = temp; } - return( n ); + return (n); } /* spenq */ - /*---------------- * * spdeq() -- return and remove head node from a subtree. 
@@ -492,75 +368,67 @@ spenq( SPBLK* n, SPTREE* q ) * subtree (if there is one); on the way to the leftmost node, rotations * are performed to shorten the left branch of the tree */ -SPBLK * -spdeq( SPBLK** np ) /* pointer to a node pointer */ +SPBLK* spdeq(SPBLK** np) /* pointer to a node pointer */ { - SPBLK * deq; /* one to return */ - SPBLK * next; /* the next thing to deal with */ - SPBLK * left; /* the left child of next */ - SPBLK * farleft; /* the left child of left */ - SPBLK * farfarleft; /* the left child of farleft */ + SPBLK* deq; /* one to return */ + SPBLK* next; /* the next thing to deal with */ + SPBLK* left; /* the left child of next */ + SPBLK* farleft; /* the left child of left */ + SPBLK* farfarleft; /* the left child of farleft */ - if( np == NULL || *np == NULL ) - { + if (np == NULL || *np == NULL) { deq = NULL; - } - else - { + } else { next = *np; left = next->leftlink; - if( left == NULL ) - { + if (left == NULL) { deq = next; *np = next->rightlink; - if( *np != NULL ) - (*np)->uplink = NULL; - - } - else for(;;) /* left is not null */ - { - /* next is not it, left is not NULL, might be it */ - farleft = left->leftlink; - if( farleft == NULL ) - { - deq = left; - next->leftlink = left->rightlink; - if( left->rightlink != NULL ) - left->rightlink->uplink = next; - break; - } - - /* next, left are not it, farleft is not NULL, might be it */ - farfarleft = farleft->leftlink; - if( farfarleft == NULL ) - { - deq = farleft; + if (*np != NULL) + (*np)->uplink = NULL; + + } else + for (;;) /* left is not null */ + { + /* next is not it, left is not NULL, might be it */ + farleft = left->leftlink; + if (farleft == NULL) { + deq = left; + next->leftlink = left->rightlink; + if (left->rightlink != NULL) + left->rightlink->uplink = next; + break; + } + + /* next, left are not it, farleft is not NULL, might be it */ + farfarleft = farleft->leftlink; + if (farfarleft == NULL) { + deq = farleft; + left->leftlink = farleft->rightlink; + if (farleft->rightlink != NULL) + farleft->rightlink->uplink = left; + break; + } + + /* next, left, farleft are not it, rotate */ + next->leftlink = farleft; + farleft->uplink = next; left->leftlink = farleft->rightlink; - if( farleft->rightlink != NULL ) - farleft->rightlink->uplink = left; - break; + if (farleft->rightlink != NULL) + farleft->rightlink->uplink = left; + farleft->rightlink = left; + left->uplink = farleft; + next = farleft; + left = farfarleft; } - - /* next, left, farleft are not it, rotate */ - next->leftlink = farleft; - farleft->uplink = next; - left->leftlink = farleft->rightlink; - if( farleft->rightlink != NULL ) - farleft->rightlink->uplink = left; - farleft->rightlink = left; - left->uplink = farleft; - next = farleft; - left = farfarleft; - } } - return( deq ); + return (deq); } /* spdeq */ - /*---------------- * * splay() -- reorganize the tree. 
@@ -577,100 +445,94 @@ spdeq( SPBLK** np ) /* pointer to a node pointer */ * detect n not in q and complain */ -void -splay( SPBLK* n, SPTREE* q ) -{ - SPBLK * up; /* points to the node being dealt with */ - SPBLK * prev; /* a descendent of up, already dealt with */ - SPBLK * upup; /* the parent of up */ - SPBLK * upupup; /* the grandparent of up */ - SPBLK * left; /* the top of left subtree being built */ - SPBLK * right; /* the top of right subtree being built */ +void splay(SPBLK* n, SPTREE* q) { + SPBLK* up; /* points to the node being dealt with */ + SPBLK* prev; /* a descendent of up, already dealt with */ + SPBLK* upup; /* the parent of up */ + SPBLK* upupup; /* the grandparent of up */ + SPBLK* left; /* the top of left subtree being built */ + SPBLK* right; /* the top of right subtree being built */ left = n->leftlink; right = n->rightlink; prev = n; up = prev->uplink; - while( up != NULL ) - { + while (up != NULL) { /* walk up the tree towards the root, splaying all to the left of n into the left subtree, all to right into the right subtree */ upup = up->uplink; - if( up->leftlink == prev ) /* up is to the right of n */ - { - if( upup != NULL && upup->leftlink == up ) /* rotate */ + if (up->leftlink == prev) /* up is to the right of n */ { + if (upup != NULL && upup->leftlink == up) /* rotate */ + { upupup = upup->uplink; upup->leftlink = up->rightlink; - if( upup->leftlink != NULL ) - upup->leftlink->uplink = upup; + if (upup->leftlink != NULL) + upup->leftlink->uplink = upup; up->rightlink = upup; upup->uplink = up; - if( upupup == NULL ) - q->root = up; - else if( upupup->leftlink == upup ) - upupup->leftlink = up; - else - upupup->rightlink = up; + if (upupup == NULL) + q->root = up; + else if (upupup->leftlink == upup) + upupup->leftlink = up; + else + upupup->rightlink = up; up->uplink = upupup; upup = upupup; } up->leftlink = right; - if( right != NULL ) - right->uplink = up; + if (right != NULL) + right->uplink = up; right = up; - } - else /* up is to the left of n */ - { - if( upup != NULL && upup->rightlink == up ) /* rotate */ + } else /* up is to the left of n */ { + if (upup != NULL && upup->rightlink == up) /* rotate */ + { upupup = upup->uplink; upup->rightlink = up->leftlink; - if( upup->rightlink != NULL ) - upup->rightlink->uplink = upup; + if (upup->rightlink != NULL) + upup->rightlink->uplink = upup; up->leftlink = upup; upup->uplink = up; - if( upupup == NULL ) - q->root = up; - else if( upupup->rightlink == upup ) - upupup->rightlink = up; - else - upupup->leftlink = up; + if (upupup == NULL) + q->root = up; + else if (upupup->rightlink == upup) + upupup->rightlink = up; + else + upupup->leftlink = up; up->uplink = upupup; upup = upupup; } up->rightlink = left; - if( left != NULL ) - left->uplink = up; + if (left != NULL) + left->uplink = up; left = up; } prev = up; up = upup; } -# ifdef DEBUG - if( q->root != prev ) - { -/* fprintf(stderr, " *** bug in splay: n not in q *** " ); */ - abort(); +#ifdef DEBUG + if (q->root != prev) { + /* fprintf(stderr, " *** bug in splay: n not in q *** " ); */ + abort(); } -# endif +#endif n->leftlink = left; n->rightlink = right; - if( left != NULL ) - left->uplink = n; - if( right != NULL ) - right->uplink = n; + if (left != NULL) + left->uplink = n; + if (right != NULL) + right->uplink = n; q->root = n; n->uplink = NULL; } /* splay */ - /*---------------- * * sphead() -- return the "lowest" element in the tree. 
@@ -684,31 +546,27 @@ splay( SPBLK* n, SPTREE* q ) * avoids splaying but just searches for and returns a pointer to * the bottom of the left branch */ -SPBLK * -sphead( SPTREE* q ) -{ - SPBLK * x; +SPBLK* sphead(SPTREE* q) { + SPBLK* x; /* splay version, good amortized bound */ - x = spdeq( &q->root ); - if( x != NULL ) - { + x = spdeq(&q->root); + if (x != NULL) { x->rightlink = q->root; x->leftlink = NULL; x->uplink = NULL; - if( q->root != NULL ) - q->root->uplink = x; + if (q->root != NULL) + q->root->uplink = x; } q->root = x; /* alternative version, bad amortized bound, but faster on the average */ - return( x ); + return (x); } /* sphead */ - /*---------------- * * spdelete() -- Delete node from a tree. @@ -717,27 +575,25 @@ sphead( SPTREE* q ) * around its new root, which is the successor of n * */ -void -spdelete( SPBLK* n, SPTREE* q ) -{ - SPBLK * x; +void spdelete(SPBLK* n, SPTREE* q) { + SPBLK* x; - splay( n, q ); - x = spdeq( &q->root->rightlink ); - if( x == NULL ) /* empty right subtree */ + splay(n, q); + x = spdeq(&q->root->rightlink); + if (x == NULL) /* empty right subtree */ { q->root = q->root->leftlink; - if (q->root) q->root->uplink = NULL; - } - else /* non-empty right subtree */ + if (q->root) + q->root->uplink = NULL; + } else /* non-empty right subtree */ { x->uplink = NULL; x->leftlink = q->root->leftlink; x->rightlink = q->root->rightlink; - if( x->leftlink != NULL ) - x->leftlink->uplink = x; - if( x->rightlink != NULL ) - x->rightlink->uplink = x; + if (x->leftlink != NULL) + x->leftlink->uplink = x; + if (x->rightlink != NULL) + x->rightlink->uplink = x; q->root = x; } diff --git a/coreneuron/nrniv/tqueue.h b/coreneuron/nrniv/tqueue.h index 730cdeceb..006cf2ba8 100644 --- a/coreneuron/nrniv/tqueue.h +++ b/coreneuron/nrniv/tqueue.h @@ -26,8 +26,8 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -//#ifndef tqueue_h -//#define tqueue_h +#ifndef tqueue_h +#define tqueue_h /* ** SPTREE: The following type declarations provide the binary tree @@ -49,10 +49,13 @@ THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include +#include +#include +#include #include "coreneuron/nrniv/nrnmutdec.h" -#define COLLECT_TQueue_STATISTICS 1 - +#define COLLECT_TQueue_STATISTICS 0 #define STRCMP(a, b) (a - b) class TQItem; @@ -63,12 +66,11 @@ class TQItem; #define cnt cnt_ #define key t_ -typedef struct SPTREE -{ - SPBLK * root; /* root node */ +typedef struct SPTREE { + SPBLK* root; /* root node */ /* Statistics, not strictly necessary, but handy for tuning */ - int enqcmps; /* compares in spenq */ + int enqcmps; /* compares in spenq */ } SPTREE; @@ -79,75 +81,134 @@ typedef struct SPTREE #define sphead sptq_sphead #define spdelete sptq_spdelete -extern void spinit(SPTREE*); /* init tree */ -extern SPBLK * spenq(SPBLK*, SPTREE*); /* insert item into the tree */ -extern SPBLK * spdeq(SPBLK**); /* return and remove lowest item in subtree */ -extern void splay(SPBLK*, SPTREE*); /* reorganize tree */ -extern SPBLK * sphead(SPTREE*); /* return first node in tree */ -extern void spdelete(SPBLK*, SPTREE*); /* delete node from tree */ - +extern void spinit(SPTREE*); /* init tree */ +extern SPBLK* spenq(SPBLK*, SPTREE*); /* insert item into the tree */ +extern SPBLK* spdeq(SPBLK**); /* return and remove lowest item in subtree */ +extern void splay(SPBLK*, SPTREE*); /* reorganize tree */ +extern SPBLK* sphead(SPTREE*); /* return first node in tree */ +extern void spdelete(SPBLK*, SPTREE*); /* delete node from tree */ class TQItem { -public: - TQItem(); - virtual ~TQItem(); -public: - void* data_; - double t_; - TQItem* left_; - TQItem* right_; - TQItem* parent_; - int cnt_; // reused: -1 means it is in the splay tree, >=0 gives bin + public: + TQItem(); + + public: + void* data_; + double t_; + TQItem* left_; + TQItem* right_; + TQItem* parent_; + int cnt_; // reused: -1 means it is in the splay tree, >=0 gives bin +}; + +typedef std::pair TQPair; + +struct less_time { + bool operator()(const TQPair& x, const TQPair& y) const { + return x.first > y.first; + } }; // helper class for the TQueue (SplayTBinQueue). 
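The less_time comparator above deliberately returns x.first > y.first: std::priority_queue is a max-heap by default, so inverting the comparison makes top() expose the pair with the smallest time. A standalone illustration of that ordering, using int payloads instead of TQItem pointers so it can compile on its own, is:

    #include <cstdio>
    #include <queue>
    #include <utility>
    #include <vector>

    // Illustration only: earliest-time-first ordering via a "greater than" comparator.
    typedef std::pair<double, int> Pair;
    struct later_first {
        bool operator()(const Pair& x, const Pair& y) const {
            return x.first > y.first;
        }
    };

    int main() {
        std::priority_queue<Pair, std::vector<Pair>, later_first> q;
        q.push(Pair(2.5, 1));
        q.push(Pair(0.5, 2));
        q.push(Pair(1.0, 3));
        while (!q.empty()) {
            printf("%g\n", q.top().first);  // prints 0.5, then 1, then 2.5
            q.pop();
        }
        return 0;
    }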
class BinQ { -public: - BinQ(); - virtual ~BinQ(); - void enqueue(double tt, TQItem*); - void shift(double tt) { assert(!bins_[qpt_]); tt_ = tt; if (++qpt_ >= nbin_) { qpt_ = 0; }} - // for iteration - TQItem* first(); - TQItem* next(TQItem*); - void remove(TQItem*); + public: + BinQ(); + ~BinQ(); + void enqueue(double tt, TQItem*); + void shift(double tt) { + assert(!bins_[qpt_]); + tt_ = tt; + if (++qpt_ >= nbin_) { + qpt_ = 0; + } + } + TQItem* top() { + return bins_[qpt_]; + } + TQItem* dequeue(); + double tbin() { + return tt_; + } + // for iteration + TQItem* first(); + TQItem* next(TQItem*); + void remove(TQItem*); void resize(int); #if COLLECT_TQueue_STATISTICS -public: - int nfenq; + public: + int nfenq, nfdeq; #endif -private: - double tt_; // time at beginning of qpt_ interval - int nbin_, qpt_; - TQItem** bins_; + private: + double tt_; // time at beginning of qpt_ interval + int nbin_, qpt_; + TQItem** bins_; + std::vector > vec_bins; }; +enum container { spltree, pq_que }; + +template class TQueue { -public: - TQueue(); - virtual ~TQueue(); - - TQItem* least() {return least_;} - TQItem* atomic_dq(double til); - TQItem* insert(double t, void* data); - void shift_bin(double _t_) { ++nshift_; binq_->shift(_t_); } - void remove(TQItem*); - void move(TQItem*, double tnew); - void statistics(); - int nshift_; - -private: - double least_t_nolock(){if (least_) { return least_->t_;}else{return 1e15;}} - void move_least_nolock(double tnew); - SPTREE* sptree_; - BinQ* binq_; - TQItem* least_; - MUTDEC + public: + TQueue(); + ~TQueue(); + + inline TQItem* least() { + return least_; + } + inline TQItem* insert(double t, void* data); + inline TQItem* enqueue_bin(double t, void* data); + inline TQItem* dequeue_bin() { + return binq_->dequeue(); + } + inline void shift_bin(double _t_) { + ++nshift_; + binq_->shift(_t_); + } + inline TQItem* top() { + return binq_->top(); + } + + inline TQItem* atomic_dq(double til); + inline void remove(TQItem*); + inline void move(TQItem*, double tnew); #if COLLECT_TQueue_STATISTICS - unsigned long ninsert, nrem, nleast, nbal, ncmplxrem; - unsigned long ncompare, nleastsrch, nfind, nfindsrch, nmove, nfastmove; + inline void statistics(); + inline void record_stat_event(int type, double time); #endif -}; + int nshift_; + /// Priority queue of vectors for queuing the events. enqueuing for move() and + /// move_least_nolock() is not implemented + std::priority_queue, less_time> pq_que_; + /// Types of queuing statistics + enum qtype { enq = 0, spike, ite, deq }; +#if COLLECT_TQueue_STATISTICS + /// Map for queuing statistics + std::map time_map_events[4]; +#endif -//#endif + private: + double least_t_nolock() { + if (least_) { + return least_->t_; + } else { + return 1e15; + } + } + void move_least_nolock(double tnew); + SPTREE* sptree_; + BinQ* binq_; + TQItem* least_; + TQPair make_TQPair(TQItem* p) { + return TQPair(p->t_, p); + } + MUTDEC +#if COLLECT_TQueue_STATISTICS + unsigned long ninsert, nrem, nleast, nbal, ncmplxrem; + unsigned long ncompare, nleastsrch, nfind, nfindsrch, nmove, nfastmove; +#endif +}; + +#include "coreneuron/nrniv/tqueue.ipp" +#endif diff --git a/coreneuron/nrniv/tqueue.ipp b/coreneuron/nrniv/tqueue.ipp new file mode 100644 index 000000000..9f4bb166f --- /dev/null +++ b/coreneuron/nrniv/tqueue.ipp @@ -0,0 +1,362 @@ +/* +Copyright (c) 2016, Blue Brain Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +1. 
Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef tqueue_ipp_ +#define tqueue_ipp_ + +#include +#include +#include +#include +#include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrniv/tqueue.h" + +#if COLLECT_TQueue_STATISTICS +#define STAT(arg) ++arg; +#else +#define STAT(arg) /**/ +#endif + +// splay tree + bin queue limited to fixed step method +// for event-sets or priority queues +// this starts from the sptqueue.cpp file and adds a bin queue + +/* Derived from David Brower's c translation of pascal code by +Douglas Jones. 
+*/ +/* The original c code is included from this file but note that instead +of struct _spblk, we are really using TQItem +*/ + +template +TQueue::TQueue() { + MUTCONSTRUCT(0) + nshift_ = 0; + sptree_ = new SPTREE; + spinit(sptree_); + binq_ = new BinQ; + least_ = 0; + +#if COLLECT_TQueue_STATISTICS + nmove = ninsert = nrem = nleast = nbal = ncmplxrem = 0; + nfastmove = ncompare = nleastsrch = nfind = nfindsrch = 0; +#endif +} + +template +TQueue::~TQueue() { + SPBLK *q, *q2; + /// Clear the binq + for (q = binq_->first(); q; q = q2) { + q2 = binq_->next(q); + remove(q); /// Potentially dereferences freed pointer this->sptree_ + } + delete binq_; + + /// Clear the splay tree + while ((q = spdeq(&sptree_->root)) != NULL) { + delete q; + } + delete sptree_; + + /// Clear the priority queue + while (pq_que_.size()) { + delete pq_que_.top().second; + pq_que_.pop(); + } + + MUTDESTRUCT +} + +template +TQItem* TQueue::enqueue_bin(double td, void* d) { + MUTLOCK +#if COLLECT_TQueue_STATISTICS + STAT(ninsert); + record_stat_event(enq, td); +#endif + TQItem* i = new TQItem; + i->data_ = d; + i->t_ = td; + binq_->enqueue(td, i); + MUTUNLOCK + return i; +} + +#if COLLECT_TQueue_STATISTICS +template +void TQueue::record_stat_event(int type, double time) { + if (time_map_events[type].find(time) == time_map_events[type].end()) + time_map_events[type][time] = 1; + else + ++time_map_events[type][time]; +} + +template +void TQueue::statistics() { + printf("insertions=%lu moves=%lu removals=%lu calls to least=%lu\n", ninsert, nmove, nrem, + nleast); + printf("calls to find=%lu\n", nfind); + printf("comparisons=%d\n", sptree_->enqcmps); +} +#endif + +/// Splay tree priority queue implementation +template <> +inline void TQueue::move_least_nolock(double tnew) { + TQItem* b = least(); + if (b) { + b->t_ = tnew; + TQItem* nl; + nl = sphead(sptree_); + if (nl && (tnew > nl->t_)) { + least_ = spdeq(&sptree_->root); + spenq(b, sptree_); + } + } +} + +/// STL priority queue implementation +template <> +inline void TQueue::move_least_nolock(double tnew) { + TQItem* b = least(); + if (b) { + b->t_ = tnew; + TQItem* nl; + nl = pq_que_.top().second; + if (nl && (tnew > nl->t_)) { + least_ = nl; + pq_que_.pop(); + pq_que_.push(make_TQPair(b)); + } + } +} + +/// Splay tree priority queue implementation +template <> +inline void TQueue::move(TQItem* i, double tnew) { + MUTLOCK + STAT(nmove) + if (i == least_) { + move_least_nolock(tnew); + } else if (tnew < least_->t_) { + spdelete(i, sptree_); + i->t_ = tnew; + spenq(least_, sptree_); + least_ = i; + } else { + spdelete(i, sptree_); + i->t_ = tnew; + spenq(i, sptree_); + } + MUTUNLOCK +} + +/// STL priority queue implementation +template <> +inline void TQueue::move(TQItem* i, double tnew) { + MUTLOCK + STAT(nmove) + if (i == least_) { + move_least_nolock(tnew); + } else if (tnew < least_->t_) { + TQItem* qmove = new TQItem; + qmove->data_ = i->data_; + qmove->t_ = tnew; + qmove->cnt_ = i->cnt_; + i->t_ = -1.; + pq_que_.push(make_TQPair(least_)); + least_ = qmove; + } else { + TQItem* qmove = new TQItem; + qmove->data_ = i->data_; + qmove->t_ = tnew; + qmove->cnt_ = i->cnt_; + i->t_ = -1.; + pq_que_.push(make_TQPair(qmove)); + } + MUTUNLOCK +} + +/// Splay tree priority queue implementation +template <> +inline TQItem* TQueue::insert(double tt, void* d) { + MUTLOCK +#if COLLECT_TQueue_STATISTICS + STAT(ninsert); + record_stat_event(enq, tt); +#endif + TQItem* i = new TQItem; + i->data_ = d; + i->t_ = tt; + i->cnt_ = -1; + if (tt < least_t_nolock()) { + if (least_) { + 
/// Probably storing both time and event which has the time is redundant, but the event + /// is then returned + /// to the upper level call stack function. If we were to eliminate i->t_ and i->cnt_ + /// fields, + /// we need to make sure we are not braking anything. + spenq(least_, sptree_); + } + least_ = i; + } else { + spenq(i, sptree_); + } + MUTUNLOCK + return i; +} + +/// STL priority queue implementation +template <> +inline TQItem* TQueue::insert(double tt, void* d) { + MUTLOCK +#if COLLECT_TQueue_STATISTICS + STAT(ninsert); + record_stat_event(enq, tt); +#endif + TQItem* i = new TQItem; + i->data_ = d; + i->t_ = tt; + i->cnt_ = -1; + if (tt < least_t_nolock()) { + if (least_) { + /// Probably storing both time and event which has the time is redundant, but the event + /// is then returned + /// to the upper level call stack function. If we were to eliminate i->t_ and i->cnt_ + /// fields, + /// we need to make sure we are not braking anything. + pq_que_.push(make_TQPair(least_)); + } + least_ = i; + } else { + pq_que_.push(make_TQPair(i)); + } + MUTUNLOCK + return i; +} + +/// Splay tree priority queue implementation +template <> +inline void TQueue::remove(TQItem* q) { + MUTLOCK +#if COLLECT_TQueue_STATISTICS + STAT(nrem); + record_stat_event(deq, q->t_); +#endif + if (q) { + if (q == least_) { + if (sptree_->root) { + least_ = spdeq(&sptree_->root); + } else { + least_ = NULL; + } + } else { + spdelete(q, sptree_); + } + delete q; + } + MUTUNLOCK +} + +/// STL priority queue implementation +template <> +inline void TQueue::remove(TQItem* q) { + MUTLOCK +#if COLLECT_TQueue_STATISTICS + STAT(nrem); + record_stat_event(deq, q->t_); +#endif + if (q) { + if (q == least_) { + if (pq_que_.size()) { + least_ = pq_que_.top().second; + pq_que_.pop(); + } else { + least_ = NULL; + } + } else { + q->t_ = -1.; + } + } + MUTUNLOCK +} + +/// Splay tree priority queue implementation +template <> +inline TQItem* TQueue::atomic_dq(double tt) { + TQItem* q = 0; + MUTLOCK + if (least_ && least_->t_ <= tt) { + q = least_; +#if COLLECT_TQueue_STATISTICS + STAT(nrem); + record_stat_event(deq, tt); +#endif + if (sptree_->root) { + least_ = spdeq(&sptree_->root); + } else { + least_ = NULL; + } + } + MUTUNLOCK + return q; +} + +/// STL priority queue implementation +template <> +inline TQItem* TQueue::atomic_dq(double tt) { + TQItem* q = 0; + MUTLOCK + if (least_ && least_->t_ <= tt) { + q = least_; +#if COLLECT_TQueue_STATISTICS + STAT(nrem); + record_stat_event(deq, tt); +#endif + // int qsize = pq_que_.size(); + // printf("map size: %d\n", msize); + /// This while loop is to delete events whose times have been moved with the ::move + /// function, + /// but in fact events were left in the queue since the only function available is pop + while (pq_que_.size() && pq_que_.top().second->t_ < 0.) 
{ + delete pq_que_.top().second; + pq_que_.pop(); + } + if (pq_que_.size()) { + least_ = pq_que_.top().second; + pq_que_.pop(); + } else { + least_ = NULL; + } + } + MUTUNLOCK + return q; +} + +#endif diff --git a/coreneuron/nrniv/vrecitem.h b/coreneuron/nrniv/vrecitem.h index 48c3aa3ec..257583e9a 100644 --- a/coreneuron/nrniv/vrecitem.h +++ b/coreneuron/nrniv/vrecitem.h @@ -38,63 +38,71 @@ class PlayRecord; // used by PlayRecord subclasses that utilize discrete events class PlayRecordEvent : public DiscreteEvent { -public: - PlayRecordEvent(); - virtual ~PlayRecordEvent(); - virtual void deliver(double, NetCvode*, NrnThread*); - virtual void pr(const char*, double t, NetCvode*); - virtual void frecord_init(TQItem* q); - virtual NrnThread* thread(); - PlayRecord* plr_; - static unsigned long playrecord_send_; - static unsigned long playrecord_deliver_; + public: + PlayRecordEvent(); + virtual ~PlayRecordEvent(); + virtual void deliver(double, NetCvode*, NrnThread*); + virtual void pr(const char*, double t, NetCvode*); + virtual NrnThread* thread(); + PlayRecord* plr_; + static unsigned long playrecord_send_; + static unsigned long playrecord_deliver_; }; // common interface for Play and Record for all integration methods. class PlayRecord { -public: - PlayRecord(double* pd, int ith); - virtual ~PlayRecord(); - virtual void play_init(){} // called near beginning of finitialize - virtual void continuous(double){} // play - every f(y, t) or res(y', y, t); record - advance_tn and initialize flag - virtual void deliver(double, NetCvode*){} // at associated DiscreteEvent - virtual PlayRecordEvent* event() { return nil;} - virtual void pr(); // print identifying info - virtual int type() { return 0; } - - // administration - virtual void frecord_init(TQItem*) {} - - double* pd_; - int ith_; // The thread index + public: + PlayRecord(double* pd, int ith); + virtual ~PlayRecord(); + virtual void play_init() { + } // called near beginning of finitialize + virtual void continuous(double) { + } // play - every f(y, t) or res(y', y, t); record - advance_tn and initialize flag + virtual void deliver(double, NetCvode*) { + } // at associated DiscreteEvent + virtual PlayRecordEvent* event() { + return nil; + } + virtual void pr(); // print identifying info + virtual int type() { + return 0; + } + + double* pd_; + int ith_; // The thread index }; - class VecPlayContinuous : public PlayRecord { -public: - VecPlayContinuous(double*, IvocVect* yvec, IvocVect* tvec, IvocVect* discon, int ith); - virtual ~VecPlayContinuous(); - void init(IvocVect* yvec, IvocVect* tvec, IvocVect* tdiscon); - virtual void play_init(); - virtual void deliver(double tt, NetCvode*); - virtual PlayRecordEvent* event() { return e_;} - virtual void pr(); - - void continuous(double tt); - double interpolate(double tt); - double interp(double th, double x0, double x1){ return x0 + (x1 - x0)*th; } - void search(double tt); - - virtual int type() { return VecPlayContinuousType; } - - IvocVect* y_; - IvocVect* t_; - IvocVect* discon_indices_; + public: + VecPlayContinuous(double*, IvocVect* yvec, IvocVect* tvec, IvocVect* discon, int ith); + virtual ~VecPlayContinuous(); + void init(IvocVect* yvec, IvocVect* tvec, IvocVect* tdiscon); + virtual void play_init(); + virtual void deliver(double tt, NetCvode*); + virtual PlayRecordEvent* event() { + return e_; + } + virtual void pr(); + + void continuous(double tt); + double interpolate(double tt); + double interp(double th, double x0, double x1) { + return x0 + (x1 - x0) * th; + } + void 
search(double tt); + + virtual int type() { + return VecPlayContinuousType; + } + + IvocVect* y_; + IvocVect* t_; + IvocVect* discon_indices_; size_t last_index_; size_t discon_index_; size_t ubound_index_; - PlayRecordEvent* e_; + PlayRecordEvent* e_; }; #endif diff --git a/coreneuron/nrniv/vrecord.cpp b/coreneuron/nrniv/vrecord.cpp index a4f92bdc3..8f699f05f 100644 --- a/coreneuron/nrniv/vrecord.cpp +++ b/coreneuron/nrniv/vrecord.cpp @@ -36,138 +36,144 @@ THE POSSIBILITY OF SUCH DAMAGE. extern NetCvode* net_cvode_instance; -PlayRecordEvent::PlayRecordEvent() {} -PlayRecordEvent::~PlayRecordEvent() {} - -void PlayRecordEvent::frecord_init(TQItem* q) { - plr_->frecord_init(q); +PlayRecordEvent::PlayRecordEvent() { +} +PlayRecordEvent::~PlayRecordEvent() { } void PlayRecordEvent::deliver(double tt, NetCvode* ns, NrnThread*) { - plr_->deliver(tt, ns); + plr_->deliver(tt, ns); } -NrnThread* PlayRecordEvent::thread() { return nrn_threads + plr_->ith_; } +NrnThread* PlayRecordEvent::thread() { + return nrn_threads + plr_->ith_; +} void PlayRecordEvent::pr(const char* s, double tt, NetCvode*) { - printf("%s PlayRecordEvent %.15g ", s, tt); - plr_->pr(); + printf("%s PlayRecordEvent %.15g ", s, tt); + plr_->pr(); } PlayRecord::PlayRecord(double* pd, int ith) { -//printf("PlayRecord::PlayRecord %p\n", this); - pd_ = pd; - ith_ = ith; + // printf("PlayRecord::PlayRecord %p\n", this); + pd_ = pd; + ith_ = ith; } PlayRecord::~PlayRecord() { -//printf("PlayRecord::~PlayRecord %p\n", this); + // printf("PlayRecord::~PlayRecord %p\n", this); } void PlayRecord::pr() { - printf("PlayRecord\n"); + printf("PlayRecord\n"); } - -VecPlayContinuous::VecPlayContinuous(double* pd, IvocVect* yvec, IvocVect* tvec, IvocVect* discon, int ith) : PlayRecord(pd, ith) { -//printf("VecPlayContinuous\n"); - init(yvec, tvec, discon); +VecPlayContinuous::VecPlayContinuous(double* pd, + IvocVect* yvec, + IvocVect* tvec, + IvocVect* discon, + int ith) + : PlayRecord(pd, ith) { + // printf("VecPlayContinuous\n"); + init(yvec, tvec, discon); } void VecPlayContinuous::init(IvocVect* yvec, IvocVect* tvec, IvocVect* discon) { - y_ = yvec; - t_ = tvec; - discon_indices_ = discon; - ubound_index_ = 0; - last_index_ = 0; - e_ = new PlayRecordEvent(); - e_->plr_ = this; + y_ = yvec; + t_ = tvec; + discon_indices_ = discon; + ubound_index_ = 0; + last_index_ = 0; + e_ = new PlayRecordEvent(); + e_->plr_ = this; } - VecPlayContinuous::~VecPlayContinuous() { -//printf("~VecPlayContinuous\n"); - delete e_; + // printf("~VecPlayContinuous\n"); + delete e_; } void VecPlayContinuous::play_init() { - NrnThread* nt = nrn_threads + ith_; - last_index_ = 0; - discon_index_ = 0; - if (discon_indices_) { + NrnThread* nt = nrn_threads + ith_; + last_index_ = 0; + discon_index_ = 0; + if (discon_indices_) { if (discon_indices_->size() > 0) { ubound_index_ = (int)(*discon_indices_)[discon_index_++]; -//printf("play_init %d %g\n", ubound_index_, t_->elem(ubound_index_)); + // printf("play_init %d %g\n", ubound_index_, t_->elem(ubound_index_)); e_->send((*t_)[ubound_index_], net_cvode_instance, nt); - }else{ - ubound_index_ = t_->size()-1; - } - }else{ - ubound_index_ = 0; + } else { + ubound_index_ = t_->size() - 1; + } + } else { + ubound_index_ = 0; e_->send((*t_)[ubound_index_], net_cvode_instance, nt); - } + } } void VecPlayContinuous::deliver(double tt, NetCvode* ns) { - NrnThread* nt = nrn_threads + ith_; -//printf("deliver %g\n", tt); - last_index_ = ubound_index_; - if (discon_indices_) { + NrnThread* nt = nrn_threads + ith_; + // 
printf("deliver %g\n", tt); + last_index_ = ubound_index_; + if (discon_indices_) { if (discon_index_ < discon_indices_->size()) { ubound_index_ = (int)(*discon_indices_)[discon_index_++]; -//printf("after deliver:send %d %g\n", ubound_index_, t_->elem(ubound_index_)); + // printf("after deliver:send %d %g\n", ubound_index_, t_->elem(ubound_index_)); e_->send((*t_)[ubound_index_], ns, nt); - }else{ + } else { ubound_index_ = t_->size() - 1; - } - }else{ + } + } else { if (ubound_index_ < t_->size() - 1) { - ubound_index_++; + ubound_index_++; e_->send((*t_)[ubound_index_], ns, nt); - } - } - continuous(tt); + } + } + continuous(tt); } - void VecPlayContinuous::continuous(double tt) { - *pd_ = interpolate(tt); + *pd_ = interpolate(tt); } double VecPlayContinuous::interpolate(double tt) { if (tt >= (*t_)[ubound_index_]) { - last_index_ = ubound_index_; - if (last_index_ == 0) { -//printf("return last tt=%g ubound=%g y=%g\n", tt, t_->elem(ubound_index_), y_->elem(last_index_)); + last_index_ = ubound_index_; + if (last_index_ == 0) { + // printf("return last tt=%g ubound=%g y=%g\n", tt, t_->elem(ubound_index_), + // y_->elem(last_index_)); return (*y_)[last_index_]; - } - }else if (tt <= (*t_)[0]) { - last_index_ = 0; -//printf("return elem(0) tt=%g t0=%g y=%g\n", tt, t_->elem(0), y_->elem(0)); + } + } else if (tt <= (*t_)[0]) { + last_index_ = 0; + // printf("return elem(0) tt=%g t0=%g y=%g\n", tt, t_->elem(0), y_->elem(0)); return (*y_)[0]; - }else{ - search(tt); - } - double x0 = (*y_)[last_index_-1]; + } else { + search(tt); + } + double x0 = (*y_)[last_index_ - 1]; double x1 = (*y_)[last_index_]; double t0 = (*t_)[last_index_ - 1]; double t1 = (*t_)[last_index_]; -//printf("IvocVectRecorder::continuous tt=%g t0=%g t1=%g theta=%g x0=%g x1=%g\n", tt, t0, t1, (tt - t0)/(t1 - t0), x0, x1); - if (t0 == t1) { return (x0 + x1)/2.; } - return interp((tt - t0)/(t1 - t0), x0, x1); + // printf("IvocVectRecorder::continuous tt=%g t0=%g t1=%g theta=%g x0=%g x1=%g\n", tt, t0, t1, + // (tt - t0)/(t1 - t0), x0, x1); + if (t0 == t1) { + return (x0 + x1) / 2.; + } + return interp((tt - t0) / (t1 - t0), x0, x1); } void VecPlayContinuous::search(double tt) { -// assert (tt > t_->elem(0) && tt < t_->elem(t_->size() - 1)) - while ( tt < (*t_)[last_index_]) { - --last_index_; - } - while ( tt >= (*t_)[last_index_]) { - ++last_index_; - } + // assert (tt > t_->elem(0) && tt < t_->elem(t_->size() - 1)) + while (tt < (*t_)[last_index_]) { + --last_index_; + } + while (tt >= (*t_)[last_index_]) { + ++last_index_; + } } void VecPlayContinuous::pr() { - printf("VecPlayContinuous "); -// printf("%s.x[%d]\n", hoc_object_name(y_->obj_), last_index_); + printf("VecPlayContinuous "); + // printf("%s.x[%d]\n", hoc_object_name(y_->obj_), last_index_); } diff --git a/coreneuron/nrnmpi/mpispike.c b/coreneuron/nrnmpi/mpispike.c index d2eb2acc4..3ed5c607e 100644 --- a/coreneuron/nrnmpi/mpispike.c +++ b/coreneuron/nrnmpi/mpispike.c @@ -46,32 +46,32 @@ static void pgvts_op(double* in, double* inout, int* len, MPI_Datatype* dptr); static MPI_Op mpi_pgvts_op; static void make_spike_type() { - NRNMPI_Spike s; - int block_lengths[2]; - MPI_Aint displacements[2]; - MPI_Aint addresses[3]; - MPI_Datatype typelist[2]; + NRNMPI_Spike s; + int block_lengths[2]; + MPI_Aint displacements[2]; + MPI_Aint addresses[3]; + MPI_Datatype typelist[2]; - typelist[0] = MPI_INT; - typelist[1] = MPI_DOUBLE; + typelist[0] = MPI_INT; + typelist[1] = MPI_DOUBLE; - block_lengths[0] = block_lengths[1] = 1; + block_lengths[0] = block_lengths[1] = 1; - 
MPI_Get_address(&s, &addresses[0]); - MPI_Get_address(&(s.gid), &addresses[1]); - MPI_Get_address(&(s.spiketime), &addresses[2]); + MPI_Get_address(&s, &addresses[0]); + MPI_Get_address(&(s.gid), &addresses[1]); + MPI_Get_address(&(s.spiketime), &addresses[2]); - displacements[0] = addresses[1] - addresses[0]; - displacements[1] = addresses[2] - addresses[0]; + displacements[0] = addresses[1] - addresses[0]; + displacements[1] = addresses[2] - addresses[0]; - MPI_Type_create_struct(2, block_lengths, displacements, typelist, &spike_type); - MPI_Type_commit(&spike_type); + MPI_Type_create_struct(2, block_lengths, displacements, typelist, &spike_type); + MPI_Type_commit(&spike_type); - MPI_Op_create((MPI_User_function*)pgvts_op, 1, &mpi_pgvts_op); + MPI_Op_create((MPI_User_function*)pgvts_op, 1, &mpi_pgvts_op); } void nrnmpi_spike_initialize() { - make_spike_type(); + make_spike_type(); } #if nrn_spikebuf_size > 0 @@ -79,423 +79,445 @@ void nrnmpi_spike_initialize() { static MPI_Datatype spikebuf_type; static void make_spikebuf_type() { - NRNMPI_Spikebuf s; - int block_lengths[3]; - MPI_Aint displacements[3]; - MPI_Aint addresses[4]; - MPI_Datatype typelist[3]; + NRNMPI_Spikebuf s; + int block_lengths[3]; + MPI_Aint displacements[3]; + MPI_Aint addresses[4]; + MPI_Datatype typelist[3]; - typelist[0] = MPI_INT; - typelist[1] = MPI_INT; - typelist[2] = MPI_DOUBLE; + typelist[0] = MPI_INT; + typelist[1] = MPI_INT; + typelist[2] = MPI_DOUBLE; - block_lengths[0] = 1; - block_lengths[1] = nrn_spikebuf_size; - block_lengths[2] = nrn_spikebuf_size; + block_lengths[0] = 1; + block_lengths[1] = nrn_spikebuf_size; + block_lengths[2] = nrn_spikebuf_size; - MPI_Get_address(&s, &addresses[0]); - MPI_Get_address(&(s.nspike), &addresses[1]); - MPI_Get_address(&(s.gid[0]), &addresses[2]); - MPI_Get_address(&(s.spiketime[0]), &addresses[3]); + MPI_Get_address(&s, &addresses[0]); + MPI_Get_address(&(s.nspike), &addresses[1]); + MPI_Get_address(&(s.gid[0]), &addresses[2]); + MPI_Get_address(&(s.spiketime[0]), &addresses[3]); - displacements[0] = addresses[1] - addresses[0]; - displacements[1] = addresses[2] - addresses[0]; - displacements[2] = addresses[3] - addresses[0]; + displacements[0] = addresses[1] - addresses[0]; + displacements[1] = addresses[2] - addresses[0]; + displacements[2] = addresses[3] - addresses[0]; - MPI_Type_create_struct(3, block_lengths, displacements, typelist, &spikebuf_type); - MPI_Type_commit(&spikebuf_type); + MPI_Type_create_struct(3, block_lengths, displacements, typelist, &spikebuf_type); + MPI_Type_commit(&spikebuf_type); } #endif int nrnmpi_spike_exchange() { - int i, n; + int i, n; #if nrn_spikebuf_size > 0 - int n1, novfl; + int n1, novfl; #endif - if (!displs) { - np = nrnmpi_numprocs; - displs = (int*)emalloc(np*sizeof(int)); - displs[0] = 0; -#if nrn_spikebuf_size > 0 - make_spikebuf_type(); + if (!displs) { + np = nrnmpi_numprocs; + displs = (int*)emalloc(np * sizeof(int)); + displs[0] = 0; +#if nrn_spikebuf_size > 0 + make_spikebuf_type(); #endif - } + } #if nrn_spikebuf_size == 0 - MPI_Allgather(&nout_, 1, MPI_INT, nin_, 1, MPI_INT, nrnmpi_comm); - n = nin_[0]; - for (i=1; i < np; ++i) { - displs[i] = n; - n += nin_[i]; - } - if (n) { - if (icapacity_ < n) { - icapacity_ = n + 10; - free(spikein_); - spikein_ = (NRNMPI_Spike*)emalloc(icapacity_ * sizeof(NRNMPI_Spike)); - } - MPI_Allgatherv(spikeout_, nout_, spike_type, spikein_, nin_, displs, spike_type, nrnmpi_comm); - } + MPI_Allgather(&nout_, 1, MPI_INT, nin_, 1, MPI_INT, nrnmpi_comm); + n = nin_[0]; + for (i = 1; i < 
np; ++i) { + displs[i] = n; + n += nin_[i]; + } + if (n) { + if (icapacity_ < n) { + icapacity_ = n + 10; + free(spikein_); + spikein_ = (NRNMPI_Spike*)emalloc(icapacity_ * sizeof(NRNMPI_Spike)); + } + MPI_Allgatherv(spikeout_, nout_, spike_type, spikein_, nin_, displs, spike_type, + nrnmpi_comm); + } #else - MPI_Allgather(spbufout_, 1, spikebuf_type, spbufin_, 1, spikebuf_type, nrnmpi_comm); - novfl = 0; - n = spbufin_[0].nspike; - if (n > nrn_spikebuf_size) { - nin_[0] = n - nrn_spikebuf_size; - novfl += nin_[0]; - }else{ - nin_[0] = 0; - } - for (i=1; i < np; ++i) { - displs[i] = novfl; - n1 = spbufin_[i].nspike; - n += n1; - if (n1 > nrn_spikebuf_size) { - nin_[i] = n1 - nrn_spikebuf_size; - novfl += nin_[i]; - }else{ - nin_[i] = 0; - } - } - if (novfl) { - if (icapacity_ < novfl) { - icapacity_ = novfl + 10; - free(spikein_); - spikein_ = (NRNMPI_Spike*)hoc_Emalloc(icapacity_ * sizeof(NRNMPI_Spike)); hoc_malchk(); - } - n1 = (nout_ > nrn_spikebuf_size) ? nout_ - nrn_spikebuf_size : 0; - MPI_Allgatherv(spikeout_, n1, spike_type, spikein_, nin_, displs, spike_type, nrnmpi_comm); - } - ovfl_ = novfl; + MPI_Allgather(spbufout_, 1, spikebuf_type, spbufin_, 1, spikebuf_type, nrnmpi_comm); + novfl = 0; + n = spbufin_[0].nspike; + if (n > nrn_spikebuf_size) { + nin_[0] = n - nrn_spikebuf_size; + novfl += nin_[0]; + } else { + nin_[0] = 0; + } + for (i = 1; i < np; ++i) { + displs[i] = novfl; + n1 = spbufin_[i].nspike; + n += n1; + if (n1 > nrn_spikebuf_size) { + nin_[i] = n1 - nrn_spikebuf_size; + novfl += nin_[i]; + } else { + nin_[i] = 0; + } + } + if (novfl) { + if (icapacity_ < novfl) { + icapacity_ = novfl + 10; + free(spikein_); + spikein_ = (NRNMPI_Spike*)hoc_Emalloc(icapacity_ * sizeof(NRNMPI_Spike)); + hoc_malchk(); + } + n1 = (nout_ > nrn_spikebuf_size) ? nout_ - nrn_spikebuf_size : 0; + MPI_Allgatherv(spikeout_, n1, spike_type, spikein_, nin_, displs, spike_type, nrnmpi_comm); + } + ovfl_ = novfl; #endif - return n; + return n; } /* The compressed spike format is restricted to the fixed step method and is -a sequence of unsigned char. +a sequence of unsigned char. nspike = buf[0]*256 + buf[1] a sequence of spiketime, localgid pairs. There are nspike of them. - spiketime is relative to the last transfer time in units of dt. - note that this requires a mindelay < 256*dt. - localgid is an unsigned int, unsigned short, - or unsigned char in size depending on the range and thus takes - 4, 2, or 1 byte respectively. To be machine independent we do our - own byte coding. When the localgid range is smaller than the true - gid range, the gid->PreSyn are remapped into - hostid specific maps. If there are not many holes, i.e just about every - spike from a source machine is delivered to some cell on a - target machine, then instead of a hash map, a vector is used. + spiketime is relative to the last transfer time in units of dt. + note that this requires a mindelay < 256*dt. + localgid is an unsigned int, unsigned short, + or unsigned char in size depending on the range and thus takes + 4, 2, or 1 byte respectively. To be machine independent we do our + own byte coding. When the localgid range is smaller than the true + gid range, the gid->PreSyn are remapped into + hostid specific maps. If there are not many holes, i.e just about every + spike from a source machine is delivered to some cell on a + target machine, then instead of a hash map, a vector is used. The allgather sends the first part of the buf and the allgatherv buffer sends any overflow. 
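The comment above defines a machine-independent wire format in which the spike count occupies the first two bytes as buf[0]*256 + buf[1]. A hedged sketch of just that header coding (the helper names are invented for illustration and are not part of this patch) is:

    #include <cassert>

    // Sketch only: 2-byte, byte-order-independent spike-count header.
    static void put_nspike(unsigned char* buf, int nspike) {
        assert(nspike >= 0 && nspike < 256 * 256);
        buf[0] = (unsigned char)(nspike / 256);  // high byte
        buf[1] = (unsigned char)(nspike % 256);  // low byte
    }
    static int get_nspike(const unsigned char* buf) {
        return buf[0] * 256 + buf[1];
    }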
*/ int nrnmpi_spike_exchange_compressed() { - int i, novfl, n, ntot, idx, bs, bstot; /* n is #spikes, bs is #byte overflow */ - if (!displs) { - np = nrnmpi_numprocs; - displs = (int*)emalloc(np*sizeof(int)); - displs[0] = 0; - byteovfl = (int*)emalloc(np*sizeof(int)); - } - - MPI_Allgather(spfixout_, ag_send_size_, MPI_BYTE, spfixin_, ag_send_size_, MPI_BYTE, nrnmpi_comm); - novfl = 0; - ntot = 0; - bstot = 0; - for (i=0; i < np; ++i) { - displs[i] = bstot; - idx = i*ag_send_size_; - n = spfixin_[idx++]*256; - n += spfixin_[idx++]; - ntot += n; - nin_[i] = n; - if (n > ag_send_nspike_) { - bs = 2 + n*(1 + localgid_size_) - ag_send_size_; - byteovfl[i] = bs; - bstot += bs; - novfl += n - ag_send_nspike_; - }else{ - byteovfl[i] = 0; - } - } - if (novfl) { - if (ovfl_capacity_ < novfl) { - ovfl_capacity_ = novfl + 10; - free(spfixin_ovfl_); - spfixin_ovfl_ = (unsigned char*)emalloc(ovfl_capacity_ * (1 + localgid_size_)*sizeof(unsigned char)); - } - bs = byteovfl[nrnmpi_myid]; - /* - note that the spfixout_ buffer is one since the overflow - is contiguous to the first part. But the spfixin_ovfl_ is - completely separate from the spfixin_ since the latter - dynamically changes its size during a run. - */ - MPI_Allgatherv(spfixout_ + ag_send_size_, bs, MPI_BYTE, spfixin_ovfl_, byteovfl, displs, MPI_BYTE, nrnmpi_comm); - } - ovfl_ = novfl; - return ntot; + int i, novfl, n, ntot, idx, bs, bstot; /* n is #spikes, bs is #byte overflow */ + if (!displs) { + np = nrnmpi_numprocs; + displs = (int*)emalloc(np * sizeof(int)); + displs[0] = 0; + byteovfl = (int*)emalloc(np * sizeof(int)); + } + + MPI_Allgather(spfixout_, ag_send_size_, MPI_BYTE, spfixin_, ag_send_size_, MPI_BYTE, + nrnmpi_comm); + novfl = 0; + ntot = 0; + bstot = 0; + for (i = 0; i < np; ++i) { + displs[i] = bstot; + idx = i * ag_send_size_; + n = spfixin_[idx++] * 256; + n += spfixin_[idx++]; + ntot += n; + nin_[i] = n; + if (n > ag_send_nspike_) { + bs = 2 + n * (1 + localgid_size_) - ag_send_size_; + byteovfl[i] = bs; + bstot += bs; + novfl += n - ag_send_nspike_; + } else { + byteovfl[i] = 0; + } + } + if (novfl) { + if (ovfl_capacity_ < novfl) { + ovfl_capacity_ = novfl + 10; + free(spfixin_ovfl_); + spfixin_ovfl_ = (unsigned char*)emalloc(ovfl_capacity_ * (1 + localgid_size_) * + sizeof(unsigned char)); + } + bs = byteovfl[nrnmpi_myid]; + /* + note that the spfixout_ buffer is one since the overflow + is contiguous to the first part. But the spfixin_ovfl_ is + completely separate from the spfixin_ since the latter + dynamically changes its size during a run. 
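nrnmpi_spike_exchange_compressed() above follows a two-phase pattern: a fixed-size MPI_Allgather moves the first ag_send_nspike_ spikes of every rank, and only ranks that overflowed contribute their remaining bytes to a follow-up MPI_Allgatherv with per-rank counts and displacements. A simplified, self-contained sketch of that pattern (buffer names are hypothetical, and the overflow sizes are gathered separately here rather than decoded from the fixed part) is:

    #include <mpi.h>
    #include <vector>

    // Sketch only: fixed-size exchange plus variable-size overflow exchange.
    // Assumes fixed_size > 0 and a valid communicator.
    void exchange_with_overflow(const char* fixed_out, int fixed_size,
                                const char* ovfl_out, int my_ovfl_bytes,
                                MPI_Comm comm) {
        int np = 1;
        MPI_Comm_size(comm, &np);

        // Phase 1: every rank sends the same fixed number of bytes.
        std::vector<char> fixed_in(np * fixed_size);
        MPI_Allgather(const_cast<char*>(fixed_out), fixed_size, MPI_BYTE,
                      &fixed_in[0], fixed_size, MPI_BYTE, comm);

        // Phase 2: gather per-rank overflow sizes, then the overflow bytes themselves.
        std::vector<int> counts(np), displs(np, 0);
        MPI_Allgather(&my_ovfl_bytes, 1, MPI_INT, &counts[0], 1, MPI_INT, comm);
        int total = 0;
        for (int i = 0; i < np; ++i) {
            displs[i] = total;
            total += counts[i];
        }
        std::vector<char> ovfl_in(total > 0 ? total : 1);
        MPI_Allgatherv(const_cast<char*>(ovfl_out), my_ovfl_bytes, MPI_BYTE,
                       &ovfl_in[0], &counts[0], &displs[0], MPI_BYTE, comm);
        // fixed_in and ovfl_in now hold every rank's data; decoding is omitted.
    }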
+ */ + MPI_Allgatherv(spfixout_ + ag_send_size_, bs, MPI_BYTE, spfixin_ovfl_, byteovfl, displs, + MPI_BYTE, nrnmpi_comm); + } + ovfl_ = novfl; + return ntot; } double nrnmpi_mindelay(double m) { - double result; - if (!nrnmpi_use) { return m; } - MPI_Allreduce(&m, &result, 1, MPI_DOUBLE, MPI_MIN, nrnmpi_comm); - return result; + double result; + if (!nrnmpi_use) { + return m; + } + MPI_Allreduce(&m, &result, 1, MPI_DOUBLE, MPI_MIN, nrnmpi_comm); + return result; } int nrnmpi_int_allmax(int x) { - int result; - if (nrnmpi_numprocs < 2) { return x; } - MPI_Allreduce(&x, &result, 1, MPI_INT, MPI_MAX, nrnmpi_comm); - return result; + int result; + if (nrnmpi_numprocs < 2) { + return x; + } + MPI_Allreduce(&x, &result, 1, MPI_INT, MPI_MAX, nrnmpi_comm); + return result; } extern void nrnmpi_int_gather(int* s, int* r, int cnt, int root) { - MPI_Gather(s, cnt, MPI_INT, r, cnt, MPI_INT, root, nrnmpi_comm); + MPI_Gather(s, cnt, MPI_INT, r, cnt, MPI_INT, root, nrnmpi_comm); +} + +extern void nrnmpi_int_gatherv(int* s, int scnt, int* r, int* rcnt, int* rdispl, int root) { + MPI_Gatherv(s, scnt, MPI_INT, r, rcnt, rdispl, MPI_INT, root, nrnmpi_comm); } -extern void nrnmpi_int_gatherv(int* s, int scnt, - int* r, int* rcnt, int* rdispl, int root) { - MPI_Gatherv(s, scnt, MPI_INT, - r, rcnt, rdispl, MPI_INT, root, nrnmpi_comm); +extern void nrnmpi_int_alltoall(int* s, int* r, int n) { + MPI_Alltoall(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm); } -extern void nrnmpi_int_alltoallv(int* s, int* scnt, int* sdispl, - int* r, int* rcnt, int* rdispl) { - MPI_Alltoallv(s, scnt, sdispl, MPI_INT, - r, rcnt, rdispl, MPI_INT, nrnmpi_comm); +extern void nrnmpi_int_alltoallv(int* s, int* scnt, int* sdispl, int* r, int* rcnt, int* rdispl) { + MPI_Alltoallv(s, scnt, sdispl, MPI_INT, r, rcnt, rdispl, MPI_INT, nrnmpi_comm); } -extern void nrnmpi_dbl_alltoallv(double* s, int* scnt, int* sdispl, - double* r, int* rcnt, int* rdispl) { - MPI_Alltoallv(s, scnt, sdispl, MPI_DOUBLE, - r, rcnt, rdispl, MPI_DOUBLE, nrnmpi_comm); +extern void nrnmpi_dbl_alltoallv(double* s, + int* scnt, + int* sdispl, + double* r, + int* rcnt, + int* rdispl) { + MPI_Alltoallv(s, scnt, sdispl, MPI_DOUBLE, r, rcnt, rdispl, MPI_DOUBLE, nrnmpi_comm); } -extern void nrnmpi_char_alltoallv(char* s, int* scnt, int* sdispl, - char* r, int* rcnt, int* rdispl) { - MPI_Alltoallv(s, scnt, sdispl, MPI_CHAR, - r, rcnt, rdispl, MPI_CHAR, nrnmpi_comm); +extern void nrnmpi_char_alltoallv(char* s, + int* scnt, + int* sdispl, + char* r, + int* rcnt, + int* rdispl) { + MPI_Alltoallv(s, scnt, sdispl, MPI_CHAR, r, rcnt, rdispl, MPI_CHAR, nrnmpi_comm); } /* following are for the partrans */ void nrnmpi_int_allgather(int* s, int* r, int n) { - MPI_Allgather(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm); + MPI_Allgather(s, n, MPI_INT, r, n, MPI_INT, nrnmpi_comm); } void nrnmpi_int_allgatherv(int* s, int* r, int* n, int* dspl) { - MPI_Allgatherv(s, n[nrnmpi_myid], MPI_INT, - r, n, dspl, MPI_INT, nrnmpi_comm); + MPI_Allgatherv(s, n[nrnmpi_myid], MPI_INT, r, n, dspl, MPI_INT, nrnmpi_comm); } void nrnmpi_dbl_allgatherv(double* s, double* r, int* n, int* dspl) { - MPI_Allgatherv(s, n[nrnmpi_myid], MPI_DOUBLE, - r, n, dspl, MPI_DOUBLE, nrnmpi_comm); + MPI_Allgatherv(s, n[nrnmpi_myid], MPI_DOUBLE, r, n, dspl, MPI_DOUBLE, nrnmpi_comm); } void nrnmpi_dbl_broadcast(double* buf, int cnt, int root) { - MPI_Bcast(buf, cnt, MPI_DOUBLE, root, nrnmpi_comm); + MPI_Bcast(buf, cnt, MPI_DOUBLE, root, nrnmpi_comm); } void nrnmpi_int_broadcast(int* buf, int cnt, int root) { - MPI_Bcast(buf, cnt, 
MPI_INT, root, nrnmpi_comm); + MPI_Bcast(buf, cnt, MPI_INT, root, nrnmpi_comm); } void nrnmpi_char_broadcast(char* buf, int cnt, int root) { - MPI_Bcast(buf, cnt, MPI_CHAR, root, nrnmpi_comm); + MPI_Bcast(buf, cnt, MPI_CHAR, root, nrnmpi_comm); } int nrnmpi_int_sum_reduce(int in) { - int result; - MPI_Allreduce(&in, &result, 1, MPI_INT, MPI_SUM, nrnmpi_comm); - return result; + int result; + MPI_Allreduce(&in, &result, 1, MPI_INT, MPI_SUM, nrnmpi_comm); + return result; } void nrnmpi_assert_opstep(int opstep, double tt) { - /* all machines in comm should have same opstep and same tt. */ - double buf[2]; - if (nrnmpi_numprocs < 2) { return; } - buf[0] = (double)opstep; - buf[1] = tt; - MPI_Bcast(buf, 2, MPI_DOUBLE, 0, nrnmpi_comm); - if (opstep != (int)buf[0] || tt != buf[1]) { - printf("%d opstep=%d %d t=%g t-troot=%g\n", nrnmpi_myid, opstep, - (int)buf[0], tt, tt-buf[1]); - hoc_execerror("nrnmpi_assert_opstep failed", (char*)0); - } + /* all machines in comm should have same opstep and same tt. */ + double buf[2]; + if (nrnmpi_numprocs < 2) { + return; + } + buf[0] = (double)opstep; + buf[1] = tt; + MPI_Bcast(buf, 2, MPI_DOUBLE, 0, nrnmpi_comm); + if (opstep != (int)buf[0] || tt != buf[1]) { + printf("%d opstep=%d %d t=%g t-troot=%g\n", nrnmpi_myid, opstep, (int)buf[0], tt, + tt - buf[1]); + hoc_execerror("nrnmpi_assert_opstep failed", (char*)0); + } } double nrnmpi_dbl_allmin(double x) { - double result; - if (nrnmpi_numprocs < 2) { return x; } - MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_MIN, nrnmpi_comm); - return result; -} - -static void pgvts_op(double* in, double* inout, int* len, MPI_Datatype* dptr){ - int i, r=0; - if(*dptr != MPI_DOUBLE) - printf("ERROR in mpispike.c! *dptr should be MPI_DOUBLE."); - if(*len != 4) - printf("ERROR in mpispike.c! *len should be 4."); - if (in[0] < inout[0]) { - /* least time has highest priority */ - r = 1; - }else if (in[0] == inout[0]) { - /* when times are equal then */ - if (in[1] < inout[1]) { - /* NetParEvent done last */ - r = 1; - }else if (in[1] == inout[1]) { - /* when times and ops are equal then */ - if (in[2] < inout[2]) { - /* init done next to last.*/ - r = 1; - }else if (in[2] == inout[2]) { - /* when times, ops, and inits are equal then */ - if (in[3] < inout[3]) { - /* choose lowest rank */ - r = 1; - } - } - } - } - if (r) { - for (i=0; i < 4; ++i) { inout[i] = in[i]; } - } + double result; + if (nrnmpi_numprocs < 2) { + return x; + } + MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, MPI_MIN, nrnmpi_comm); + return result; +} + +static void pgvts_op(double* in, double* inout, int* len, MPI_Datatype* dptr) { + int i, r = 0; + if (*dptr != MPI_DOUBLE) + printf("ERROR in mpispike.c! *dptr should be MPI_DOUBLE."); + if (*len != 4) + printf("ERROR in mpispike.c! 
*len should be 4."); + if (in[0] < inout[0]) { + /* least time has highest priority */ + r = 1; + } else if (in[0] == inout[0]) { + /* when times are equal then */ + if (in[1] < inout[1]) { + /* NetParEvent done last */ + r = 1; + } else if (in[1] == inout[1]) { + /* when times and ops are equal then */ + if (in[2] < inout[2]) { + /* init done next to last.*/ + r = 1; + } else if (in[2] == inout[2]) { + /* when times, ops, and inits are equal then */ + if (in[3] < inout[3]) { + /* choose lowest rank */ + r = 1; + } + } + } + } + if (r) { + for (i = 0; i < 4; ++i) { + inout[i] = in[i]; + } + } } int nrnmpi_pgvts_least(double* tt, int* op, int* init) { - int i; - double ibuf[4], obuf[4]; - ibuf[0] = *tt; - ibuf[1] = (double)(*op); - ibuf[2] = (double)(*init); - ibuf[3] = (double)nrnmpi_myid; - for (i=0; i < 4; ++i) { - obuf[i] = ibuf[i]; - } - MPI_Allreduce(ibuf, obuf, 4, MPI_DOUBLE, mpi_pgvts_op, nrnmpi_comm); - assert(obuf[0] <= *tt); - if (obuf[0] == *tt) { - assert((int)obuf[1] <= *op); - if ((int)obuf[1] == *op) { - assert((int)obuf[2] <= *init); - if ((int)obuf[2] == *init) { - assert((int)obuf[3] <= nrnmpi_myid); - } - } - } - *tt = obuf[0]; - *op = (int)obuf[1]; - *init = (int)obuf[2]; - if (nrnmpi_myid == (int)obuf[3]) { - return 1; - } - return 0; + int i; + double ibuf[4], obuf[4]; + ibuf[0] = *tt; + ibuf[1] = (double)(*op); + ibuf[2] = (double)(*init); + ibuf[3] = (double)nrnmpi_myid; + for (i = 0; i < 4; ++i) { + obuf[i] = ibuf[i]; + } + MPI_Allreduce(ibuf, obuf, 4, MPI_DOUBLE, mpi_pgvts_op, nrnmpi_comm); + assert(obuf[0] <= *tt); + if (obuf[0] == *tt) { + assert((int)obuf[1] <= *op); + if ((int)obuf[1] == *op) { + assert((int)obuf[2] <= *init); + if ((int)obuf[2] == *init) { + assert((int)obuf[3] <= nrnmpi_myid); + } + } + } + *tt = obuf[0]; + *op = (int)obuf[1]; + *init = (int)obuf[2]; + if (nrnmpi_myid == (int)obuf[3]) { + return 1; + } + return 0; } /* following for splitcell.cpp transfer */ void nrnmpi_send_doubles(double* pd, int cnt, int dest, int tag) { - MPI_Send(pd, cnt, MPI_DOUBLE, dest, tag, nrnmpi_comm); + MPI_Send(pd, cnt, MPI_DOUBLE, dest, tag, nrnmpi_comm); } void nrnmpi_recv_doubles(double* pd, int cnt, int src, int tag) { - MPI_Status status; - MPI_Recv(pd, cnt, MPI_DOUBLE, src, tag, nrnmpi_comm, &status); + MPI_Status status; + MPI_Recv(pd, cnt, MPI_DOUBLE, src, tag, nrnmpi_comm, &status); } void nrnmpi_postrecv_doubles(double* pd, int cnt, int src, int tag, void** request) { - MPI_Irecv(pd, cnt, MPI_DOUBLE, src, tag, nrnmpi_comm, (MPI_Request*)request); + MPI_Irecv(pd, cnt, MPI_DOUBLE, src, tag, nrnmpi_comm, (MPI_Request*)request); } void nrnmpi_wait(void** request) { - MPI_Status status; - MPI_Wait((MPI_Request*)request, &status); + MPI_Status status; + MPI_Wait((MPI_Request*)request, &status); } void nrnmpi_barrier() { - if (nrnmpi_numprocs > 1) - { - MPI_Barrier(nrnmpi_comm); - } + if (nrnmpi_numprocs > 1) { + MPI_Barrier(nrnmpi_comm); + } } double nrnmpi_dbl_allreduce(double x, int type) { - double result; - MPI_Op tt; - if (nrnmpi_numprocs < 2) { return x; } - if (type == 1) { - tt = MPI_SUM; - }else if (type == 2) { - tt = MPI_MAX; - }else{ - tt = MPI_MIN; - } - MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, tt, nrnmpi_comm); - return result; + double result; + MPI_Op tt; + if (nrnmpi_numprocs < 2) { + return x; + } + if (type == 1) { + tt = MPI_SUM; + } else if (type == 2) { + tt = MPI_MAX; + } else { + tt = MPI_MIN; + } + MPI_Allreduce(&x, &result, 1, MPI_DOUBLE, tt, nrnmpi_comm); + return result; } long nrnmpi_long_allreduce(long x, int type) { - long 
result; - MPI_Op tt; - if (nrnmpi_numprocs < 2) { return x; } - if (type == 1) { - tt = MPI_SUM; - }else if (type == 2) { - tt = MPI_MAX; - }else{ - tt = MPI_MIN; - } - MPI_Allreduce(&x, &result, 1, MPI_LONG, tt, nrnmpi_comm); - return result; + long result; + MPI_Op tt; + if (nrnmpi_numprocs < 2) { + return x; + } + if (type == 1) { + tt = MPI_SUM; + } else if (type == 2) { + tt = MPI_MAX; + } else { + tt = MPI_MIN; + } + MPI_Allreduce(&x, &result, 1, MPI_LONG, tt, nrnmpi_comm); + return result; } void nrnmpi_dbl_allreduce_vec(double* src, double* dest, int cnt, int type) { - int i; - MPI_Op tt; - assert(src != dest); - if (nrnmpi_numprocs < 2) { - for (i = 0; i < cnt; ++i) { - dest[i] = src[i]; - } - return; - } - if (type == 1) { - tt = MPI_SUM; - }else if (type == 2) { - tt = MPI_MAX; - }else{ - tt = MPI_MIN; - } - MPI_Allreduce(src, dest, cnt, MPI_DOUBLE, tt, nrnmpi_comm); - return; + int i; + MPI_Op tt; + assert(src != dest); + if (nrnmpi_numprocs < 2) { + for (i = 0; i < cnt; ++i) { + dest[i] = src[i]; + } + return; + } + if (type == 1) { + tt = MPI_SUM; + } else if (type == 2) { + tt = MPI_MAX; + } else { + tt = MPI_MIN; + } + MPI_Allreduce(src, dest, cnt, MPI_DOUBLE, tt, nrnmpi_comm); + return; } void nrnmpi_long_allreduce_vec(long* src, long* dest, int cnt, int type) { - int i; - MPI_Op tt; - assert(src != dest); - if (nrnmpi_numprocs < 2) { - for (i = 0; i < cnt; ++i) { - dest[i] = src[i]; - } - return; - } - if (type == 1) { - tt = MPI_SUM; - }else if (type == 2) { - tt = MPI_MAX; - }else{ - tt = MPI_MIN; - } - MPI_Allreduce(src, dest, cnt, MPI_LONG, tt, nrnmpi_comm); - return; + int i; + MPI_Op tt; + assert(src != dest); + if (nrnmpi_numprocs < 2) { + for (i = 0; i < cnt; ++i) { + dest[i] = src[i]; + } + return; + } + if (type == 1) { + tt = MPI_SUM; + } else if (type == 2) { + tt = MPI_MAX; + } else { + tt = MPI_MIN; + } + MPI_Allreduce(src, dest, cnt, MPI_LONG, tt, nrnmpi_comm); + return; } void nrnmpi_dbl_allgather(double* s, double* r, int n) { - MPI_Allgather(s, n, MPI_DOUBLE, r, n, MPI_DOUBLE, nrnmpi_comm); + MPI_Allgather(s, n, MPI_DOUBLE, r, n, MPI_DOUBLE, nrnmpi_comm); } #endif /*NRNMPI*/ diff --git a/coreneuron/nrnmpi/mpispike.h b/coreneuron/nrnmpi/mpispike.h index e1bf9d1dd..36df001f3 100644 --- a/coreneuron/nrnmpi/mpispike.h +++ b/coreneuron/nrnmpi/mpispike.h @@ -29,15 +29,17 @@ THE POSSIBILITY OF SUCH DAMAGE. 
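The reduction wrappers above encode the operation as a small integer: type 1 selects MPI_SUM, 2 selects MPI_MAX, and anything else falls back to MPI_MIN. A brief usage sketch (the local value is hypothetical) is:

    // Usage sketch only: combine one per-rank double across all ranks.
    double local_value = 3.14;                          // hypothetical per-rank value
    double sum = nrnmpi_dbl_allreduce(local_value, 1);  // 1 -> MPI_SUM
    double max = nrnmpi_dbl_allreduce(local_value, 2);  // 2 -> MPI_MAX
    double min = nrnmpi_dbl_allreduce(local_value, 3);  // anything else -> MPI_MIN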
#ifndef nrnmpispike_h #define nrnmpispike_h +#if NRNMPI + #ifndef nrn_spikebuf_size #define nrn_spikebuf_size 0 #endif #if nrn_spikebuf_size > 0 typedef struct { - int nspike; - int gid[nrn_spikebuf_size]; - double spiketime[nrn_spikebuf_size]; + int nspike; + int gid[nrn_spikebuf_size]; + double spiketime[nrn_spikebuf_size]; } NRNMPI_Spikebuf; #endif @@ -64,11 +66,11 @@ extern NRNMPI_Spike* spikein_; #define ag_send_nspike_ nrnmpi_send_nspike_ #define ovfl_capacity_ nrnmpi_ovfl_capacity_ #define ovfl_ nrnmpi_ovfl_ -extern int localgid_size_; /* bytes */ -extern int ag_send_size_; /* bytes */ +extern int localgid_size_; /* bytes */ +extern int ag_send_size_; /* bytes */ extern int ag_send_nspike_; /* spikes */ -extern int ovfl_capacity_; /* spikes */ -extern int ovfl_; /* spikes */ +extern int ovfl_capacity_; /* spikes */ +extern int ovfl_; /* spikes */ extern unsigned char* spfixout_; extern unsigned char* spfixin_; extern unsigned char* spfixin_ovfl_; @@ -80,6 +82,9 @@ extern NRNMPI_Spikebuf* spbufout_; extern NRNMPI_Spikebuf* spbufin_; #endif + +#endif // NRNMPI + #if defined(__cplusplus) } #endif diff --git a/coreneuron/nrnmpi/nrnmpi.c b/coreneuron/nrnmpi/nrnmpi.c index 511fda21d..4e354161d 100644 --- a/coreneuron/nrnmpi/nrnmpi.c +++ b/coreneuron/nrnmpi/nrnmpi.c @@ -27,6 +27,7 @@ THE POSSIBILITY OF SUCH DAMAGE. */ #include +#include #include "coreneuron/nrnconf.h" #include "coreneuron/nrnmpi/nrnmpi.h" #include "coreneuron/nrnmpi/mpispike.h" @@ -37,6 +38,7 @@ THE POSSIBILITY OF SUCH DAMAGE. #if NRNMPI #include + #define USE_HPM 0 #if USE_HPM #include @@ -58,159 +60,157 @@ extern void nrnmpi_checkbufleak(); #endif static int nrnmpi_under_nrncontrol_; -#endif void nrnmpi_init(int nrnmpi_under_nrncontrol, int* pargc, char*** pargv) { -#if NRNMPI - int i, b, flag; - static int called = 0; - - if (called) { return; } - called = 1; - nrnmpi_use = 1; - nrnmpi_under_nrncontrol_ = nrnmpi_under_nrncontrol; - if( nrnmpi_under_nrncontrol_ ) { - + int i, b, flag; + static int called = 0; + + if (called) { + return; + } + called = 1; + nrnmpi_use = 1; + nrnmpi_under_nrncontrol_ = nrnmpi_under_nrncontrol; + if (nrnmpi_under_nrncontrol_) { #if !ALWAYS_CALL_MPI_INIT - /* this is not good. depends on mpirun adding at least one - arg that starts with -p4 but that probably is dependent - on mpich and the use of the ch_p4 device. We are trying to - work around the problem that MPI_Init may change the working - directory and so when not invoked under mpirun we would like to - NOT call MPI_Init. - */ - b = 0; - for (i=0; i < *pargc; ++i) { - if (strncmp("-p4", (*pargv)[i], 3) == 0) { - b = 1; - break; - } - if (strcmp("-mpi", (*pargv)[i]) == 0) { - b = 1; - break; - } - } - if (nrnmusic) { b = 1; } - if (!b) { - nrnmpi_use = 0; - nrnmpi_under_nrncontrol_ = 0; - return; - } + /* this is not good. depends on mpirun adding at least one + arg that starts with -p4 but that probably is dependent + on mpich and the use of the ch_p4 device. We are trying to + work around the problem that MPI_Init may change the working + directory and so when not invoked under mpirun we would like to + NOT call MPI_Init. 
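Further down in this hunk, nrnmpi_init() requests MPI_THREAD_FUNNELED through MPI_Init_thread when threading is enabled, and nrnmpi_check_threading_support() verifies the granted level with MPI_Query_thread. A minimal sketch of that handshake, separated from the surrounding argument sniffing (the function names here are invented for illustration), is:

    #include <mpi.h>
    #include <cassert>

    // Sketch only: request FUNNELED support at init time and verify what MPI granted.
    void init_mpi_funneled(int* pargc, char*** pargv) {
        int required = MPI_THREAD_FUNNELED;
        int provided = MPI_THREAD_SINGLE;
        int rc = MPI_Init_thread(pargc, pargv, required, &provided);
        assert(rc == MPI_SUCCESS);
        assert(provided >= required);  // otherwise threaded spike delivery is unsafe
        (void)rc;
    }

    // Any component can later re-check the level that was actually granted.
    bool funneled_supported() {
        int th = MPI_THREAD_SINGLE;
        MPI_Query_thread(&th);
        return th >= MPI_THREAD_FUNNELED;
    }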
+ */ + b = 0; + for (i = 0; i < *pargc; ++i) { + if (strncmp("-p4", (*pargv)[i], 3) == 0) { + b = 1; + break; + } + if (strcmp("-mpi", (*pargv)[i]) == 0) { + b = 1; + break; + } + } + if (nrnmusic) { + b = 1; + } + if (!b) { + nrnmpi_use = 0; + nrnmpi_under_nrncontrol_ = 0; + return; + } #endif - MPI_Initialized(&flag); + MPI_Initialized(&flag); - if (!flag) { + if (!flag) { #if (USE_PTHREAD || defined(_OPENMP)) - int required = MPI_THREAD_FUNNELED; - int provided; - nrn_assert(MPI_Init_thread(pargc, pargv, required, &provided) == MPI_SUCCESS); - - nrn_assert(required <= provided); + int required = MPI_THREAD_FUNNELED; + int provided; + nrn_assert(MPI_Init_thread(pargc, pargv, required, &provided) == MPI_SUCCESS); + + nrn_assert(required <= provided); #else - nrn_assert(MPI_Init(pargc, pargv) == MPI_SUCCESS); + nrn_assert(MPI_Init(pargc, pargv) == MPI_SUCCESS); #endif - } - - { - nrn_assert(MPI_Comm_dup(MPI_COMM_WORLD, &nrnmpi_world_comm) == MPI_SUCCESS); - } - } - grp_bbs = MPI_GROUP_NULL; - grp_net = MPI_GROUP_NULL; - nrn_assert(MPI_Comm_dup(nrnmpi_world_comm, &nrnmpi_comm) == MPI_SUCCESS); - nrn_assert(MPI_Comm_dup(nrnmpi_world_comm, &nrn_bbs_comm) == MPI_SUCCESS); - nrn_assert(MPI_Comm_rank(nrnmpi_world_comm, &nrnmpi_myid_world) == MPI_SUCCESS); - nrn_assert(MPI_Comm_size(nrnmpi_world_comm, &nrnmpi_numprocs_world) == MPI_SUCCESS); - nrnmpi_numprocs = nrnmpi_numprocs_bbs = nrnmpi_numprocs_world; - nrnmpi_myid = nrnmpi_myid_bbs = nrnmpi_myid_world; - nrnmpi_spike_initialize(); - /*begin instrumentation*/ + } + + { nrn_assert(MPI_Comm_dup(MPI_COMM_WORLD, &nrnmpi_world_comm) == MPI_SUCCESS); } + } + grp_bbs = MPI_GROUP_NULL; + grp_net = MPI_GROUP_NULL; + nrn_assert(MPI_Comm_dup(nrnmpi_world_comm, &nrnmpi_comm) == MPI_SUCCESS); + nrn_assert(MPI_Comm_dup(nrnmpi_world_comm, &nrn_bbs_comm) == MPI_SUCCESS); + nrn_assert(MPI_Comm_rank(nrnmpi_world_comm, &nrnmpi_myid_world) == MPI_SUCCESS); + nrn_assert(MPI_Comm_size(nrnmpi_world_comm, &nrnmpi_numprocs_world) == MPI_SUCCESS); + nrnmpi_numprocs = nrnmpi_numprocs_bbs = nrnmpi_numprocs_world; + nrnmpi_myid = nrnmpi_myid_bbs = nrnmpi_myid_world; + nrnmpi_spike_initialize(); +/*begin instrumentation*/ #if USE_HPM - hpmInit( nrnmpi_myid_world, "mpineuron" ); + hpmInit(nrnmpi_myid_world, "mpineuron"); #endif - if (nrnmpi_myid == 0) - printf(" num_mpi=%d\n num_omp_thread=%d\n\n", nrnmpi_numprocs_world,nrnomp_get_numthreads()); - -#endif /* NRNMPI */ - + if (nrnmpi_myid == 0) + printf(" num_mpi=%d\n num_omp_thread=%d\n\n", nrnmpi_numprocs_world, + nrnomp_get_numthreads()); } -double nrnmpi_wtime() { -#if NRNMPI - if (nrnmpi_use) { - return MPI_Wtime(); - } -#endif - return 0.0; -} - - -void nrnmpi_finalize(void) -{ - MPI_Finalize(); +void nrnmpi_finalize(void) { + if (nrnmpi_under_nrncontrol_) { + int flag = 0; + MPI_Initialized(&flag); + if (flag) { + MPI_Finalize(); + } + } } - void nrnmpi_terminate() { -#if NRNMPI - if (nrnmpi_use) { + if (nrnmpi_use) { #if USE_HPM - hpmTerminate( nrnmpi_myid_world ); + hpmTerminate(nrnmpi_myid_world); #endif - if( nrnmpi_under_nrncontrol_ ) { - MPI_Finalize(); - } - nrnmpi_use = 0; + if (nrnmpi_under_nrncontrol_) { + MPI_Finalize(); + } + nrnmpi_use = 0; #if nrnmpidebugleak - nrnmpi_checkbufleak(); + nrnmpi_checkbufleak(); #endif - } -#endif /*NRNMPI*/ + } } -void nrnmpi_abort(int errcode) { -#if NRNMPI - int flag; - MPI_Initialized(&flag); - if (flag) { - MPI_Abort(MPI_COMM_WORLD, errcode); - }else{ - abort(); - } -#else - abort(); -#endif +// check if appropriate threading level supported (i.e. 
MPI_THREAD_FUNNELED) +void nrnmpi_check_threading_support() { + int th = 0; + if (nrnmpi_use) { + MPI_Query_thread(&th); + if (th < MPI_THREAD_FUNNELED) { + nrn_fatal_error( + "\n Current MPI library doesn't support MPI_THREAD_FUNNELED,\ + \n Run without enabling multi-threading!"); + } + } } -void nrnmpi_fatal_error(const char *msg) { - - if(nrnmpi_myid == 0) { - printf("%s\n", msg); - } - - nrnmpi_abort(-1); +/* so src/nrnpython/inithoc.cpp does not have to include a c++ mpi.h */ +int nrnmpi_wrap_mpi_init(int* flag) { + return MPI_Initialized(flag); } -// check if appropriate threading level supported (i.e. MPI_THREAD_FUNNELED) -void nrnmpi_check_threading_support() { +#endif + +// TODO nrn_wtime(), nrn_abort(int) and nrn_fatal_error() to be moved to tools + +double nrn_wtime() { #if NRNMPI - int th = 0; - if (nrnmpi_use) { - MPI_Query_thread( &th ); - if( th < MPI_THREAD_FUNNELED) { - nrnmpi_fatal_error("\n Current MPI library doesn't support MPI_THREAD_FUNNELED,\ - \n Run without enabling multi-threading!"); - } - } + if (nrnmpi_use) { + return MPI_Wtime(); + } else #endif + { + struct timeval time1; + gettimeofday(&time1, NULL); + return (time1.tv_sec + time1.tv_usec/1.e6); + } } +void nrn_abort(int errcode) { #if NRNMPI + int flag; + MPI_Initialized(&flag); + if (flag) { + MPI_Abort(MPI_COMM_WORLD, errcode); + } else +#endif + { + abort(); + } +} -/* so src/nrnpython/inithoc.cpp does not have to include a c++ mpi.h */ -int nrnmpi_wrap_mpi_init(int* flag) { - return MPI_Initialized(flag); +void nrn_fatal_error(const char* msg) { + if (nrnmpi_myid == 0) { + printf("%s\n", msg); + } + nrn_abort(-1); } - -#endif diff --git a/coreneuron/nrnmpi/nrnmpi.h b/coreneuron/nrnmpi/nrnmpi.h index 1c2953c60..e6d5a1904 100644 --- a/coreneuron/nrnmpi/nrnmpi.h +++ b/coreneuron/nrnmpi/nrnmpi.h @@ -36,18 +36,30 @@ THE POSSIBILITY OF SUCH DAMAGE. nrnmpi_numprocs_world/small subworlds of size nsmall. */ extern int nrnmpi_numprocs_world; /* size of entire world. total size of all subworlds */ -extern int nrnmpi_myid_world; /* rank in entire world */ -extern int nrnmpi_numprocs; /* size of subworld */ -extern int nrnmpi_myid; /* rank in subworld */ -extern int nrnmpi_numprocs_bbs; /* number of subworlds */ -extern int nrnmpi_myid_bbs; /* rank in nrn_bbs_comm of rank 0 of a subworld */ +extern int nrnmpi_myid_world; /* rank in entire world */ +extern int nrnmpi_numprocs; /* size of subworld */ +extern int nrnmpi_myid; /* rank in subworld */ +extern int nrnmpi_numprocs_bbs; /* number of subworlds */ +extern int nrnmpi_myid_bbs; /* rank in nrn_bbs_comm of rank 0 of a subworld */ + +#if defined(__cplusplus) +extern "C" { +#endif + +void nrn_abort(int errcode); +void nrn_fatal_error(const char* msg); +double nrn_wtime(); + +#if defined(__cplusplus) +} +#endif /*c++*/ + +#if NRNMPI typedef struct { - int gid; - double spiketime; + int gid; + double spiketime; } NRNMPI_Spike; - -#if NRNMPI #if defined(__cplusplus) extern "C" { diff --git a/coreneuron/nrnmpi/nrnmpi_def_cinc.h b/coreneuron/nrnmpi/nrnmpi_def_cinc.h index d17997410..a6cc97c3b 100644 --- a/coreneuron/nrnmpi/nrnmpi_def_cinc.h +++ b/coreneuron/nrnmpi/nrnmpi_def_cinc.h @@ -28,7 +28,7 @@ THE POSSIBILITY OF SUCH DAMAGE. 
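/* Illustrative sketch (editorial addition, not part of this patch): because
   nrn_wtime(), nrn_abort() and nrn_fatal_error() are now declared in nrnmpi.h
   outside the NRNMPI guard, timing and error handling can be written once and
   link in both the MPI and the serial build.  The phase being timed here is
   hypothetical. */
{
    double t_begin = nrn_wtime(); /* MPI_Wtime() with MPI, gettimeofday() otherwise */
    /* ... run a simulation phase ... */
    double t_elapsed = nrn_wtime() - t_begin; /* seconds in either build */
    if (t_elapsed < 0.) {
        nrn_fatal_error("clock went backwards"); /* rank 0 prints, then nrn_abort(-1) */
    }
}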
int nrnmpi_use; int nrnmpi_numprocs = 1; /* size */ -int nrnmpi_myid = 0; /* rank */ +int nrnmpi_myid = 0; /* rank */ int nrnmpi_numprocs_world = 1; int nrnmpi_myid_world = 0; int nrnmpi_numprocs_bbs = 1; @@ -37,8 +37,11 @@ int nrnmpi_myid_bbs = 0; int nrnmpi_nout_; int* nrnmpi_nin_; int nrnmpi_i_capacity_; + +#if NRNMPI NRNMPI_Spike* nrnmpi_spikeout_; NRNMPI_Spike* nrnmpi_spikein_; +#endif int nrnmpi_localgid_size_; int nrnmpi_ag_send_size_; @@ -48,4 +51,3 @@ int nrnmpi_ovfl_; unsigned char* nrnmpi_spikeout_fixed_; unsigned char* nrnmpi_spikein_fixed_; unsigned char* nrnmpi_spikein_fixed_ovfl_; - diff --git a/coreneuron/nrnmpi/nrnmpi_impl.h b/coreneuron/nrnmpi/nrnmpi_impl.h index ce6eadb12..bacc22472 100644 --- a/coreneuron/nrnmpi/nrnmpi_impl.h +++ b/coreneuron/nrnmpi/nrnmpi_impl.h @@ -29,9 +29,13 @@ THE POSSIBILITY OF SUCH DAMAGE. #ifndef nrnmpi_impl_h #define nrnmpi_impl_h +#if NRNMPI + #include extern MPI_Comm nrnmpi_world_comm; extern MPI_Comm nrnmpi_comm; +#endif // NRNMPI + #endif diff --git a/coreneuron/nrnmpi/nrnmpidec.h b/coreneuron/nrnmpi/nrnmpidec.h index 48d1dc3b8..0ec646a8c 100644 --- a/coreneuron/nrnmpi/nrnmpidec.h +++ b/coreneuron/nrnmpi/nrnmpidec.h @@ -44,12 +44,12 @@ extern "C" { /* from bbsmpipack.c */ typedef struct bbsmpibuf { - char* buf; - int size; - int pkposition; - int upkpos; - int keypos; - int refcount; + char* buf; + int size; + int pkposition; + int upkpos; + int keypos; + int refcount; } bbsmpibuf; extern bbsmpibuf* nrnmpi_newbuf(int size); @@ -82,13 +82,10 @@ extern int nrnmpi_bbssendrecv(int dest, int tag, bbsmpibuf* s, bbsmpibuf* r); /* from nrnmpi.c */ extern void nrnmpi_init(int nrnmpi_under_nrncontrol, int* pargc, char*** pargv); extern int nrnmpi_wrap_mpi_init(int* flag); -extern double nrnmpi_wtime(void); extern void nrnmpi_finalize(void); extern void nrnmpi_terminate(); -extern void nrnmpi_abort(int errcode); extern void nrnmpi_subworld_size(int n); extern int nrn_wrap_mpi_init(int* flag); -extern void nrnmpi_fatal_error(const char *msg); extern void nrnmpi_check_threading_support(); /* from mpispike.c */ @@ -101,9 +98,15 @@ extern void nrnmpi_int_gather(int* s, int* r, int cnt, int root); extern void nrnmpi_int_gatherv(int* s, int scnt, int* r, int* rcnt, int* rdispl, int root); extern void nrnmpi_int_allgather(int* s, int* r, int n); extern void nrnmpi_int_allgatherv(int* s, int* r, int* n, int* dspl); +extern void nrnmpi_int_alltoall(int* s, int* r, int n); extern void nrnmpi_int_alltoallv(int* s, int* scnt, int* sdispl, int* r, int* rcnt, int* rdispl); extern void nrnmpi_dbl_allgatherv(double* s, double* r, int* n, int* dspl); -extern void nrnmpi_dbl_alltoallv(double* s, int* scnt, int* sdispl, double* r, int* rcnt, int* rdispl); +extern void nrnmpi_dbl_alltoallv(double* s, + int* scnt, + int* sdispl, + double* r, + int* rcnt, + int* rdispl); extern void nrnmpi_char_alltoallv(char* s, int* scnt, int* sdispl, char* r, int* rcnt, int* rdispl); extern void nrnmpi_dbl_broadcast(double* buf, int cnt, int root); extern void nrnmpi_int_broadcast(int* buf, int cnt, int root); diff --git a/coreneuron/nrnmpi/nrnmpiuse.h b/coreneuron/nrnmpi/nrnmpiuse.h index 7fbf85051..787a77c7a 100644 --- a/coreneuron/nrnmpi/nrnmpiuse.h +++ b/coreneuron/nrnmpi/nrnmpiuse.h @@ -29,8 +29,11 @@ THE POSSIBILITY OF SUCH DAMAGE. 
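/* Illustrative sketch (editorial addition, not part of this patch): the
   alltoallv wrappers declared above follow the usual MPI convention -
   scnt[j]/sdispl[j] describe the block of `s` sent to rank j and
   rcnt[j]/rdispl[j] the block of `r` received from rank j.  A caller typically
   exchanges counts first and builds the displacements as a prefix sum; the
   send/receive buffers below are hypothetical and assumed to be sized to the
   count totals. */
{
    int j, np = nrnmpi_numprocs;
    int* scnt = (int*)emalloc(np * sizeof(int));
    int* rcnt = (int*)emalloc(np * sizeof(int));
    int* sdispl = (int*)emalloc(np * sizeof(int));
    int* rdispl = (int*)emalloc(np * sizeof(int));
    /* ... fill scnt[]; obtain rcnt[] e.g. via nrnmpi_int_alltoall(scnt, rcnt, 1) ... */
    sdispl[0] = rdispl[0] = 0;
    for (j = 1; j < np; ++j) {
        sdispl[j] = sdispl[j - 1] + scnt[j - 1];
        rdispl[j] = rdispl[j - 1] + rcnt[j - 1];
    }
    nrnmpi_dbl_alltoallv(sendbuf, scnt, sdispl, recvbuf, rcnt, rdispl);
}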
#ifndef usenrnmpi_h #define usenrnmpi_h -/* define to 1 if you want MPI specific features activated */ -#define NRNMPI 1 +/* define to 1 if you want MPI specific features activated + (optionally provided by CMake option NRNMPI) */ +#ifndef NRNMPI + #define NRNMPI 1 +#endif /* define to 1 if you want parallel distributed cells (and gap junctions) */ #define PARANEURON 1 diff --git a/coreneuron/nrnoc/capac.c b/coreneuron/nrnoc/capac.c index e0e8475d8..63a38259d 100644 --- a/coreneuron/nrnoc/capac.c +++ b/coreneuron/nrnoc/capac.c @@ -26,9 +26,22 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "coreneuron/nrnconf.h" -#include "coreneuron/nrnoc/multicore.h" -#include "coreneuron/nrnoc/membdef.h" +#include "coreneuron/coreneuron.h" + +#if defined(_OPENACC) +#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ + _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)") +#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ + _Pragma( \ + "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") +#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \ + _Pragma( \ + "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") +#else +#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") +#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") +#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("") +#endif #if !defined(LAYOUT) /* 1 means AoS, >1 means AoSoA, <= 0 means SOA */ @@ -40,23 +53,25 @@ THE POSSIBILITY OF SUCH DAMAGE. #define _STRIDE _cntml_padded + _iml #endif -static const char *mechanism[] = { "0", "capacitance", "cm",0, "i_cap", 0,0 }; -static void cap_alloc(double*, Datum*, int type); -static void cap_init(NrnThread*, Memb_list*, int); +static const char* mechanism[] = {"0", "capacitance", "cm", 0, "i_cap", 0, 0}; +void nrn_alloc_capacitance(double*, Datum*, int); +void nrn_init_capacitance(NrnThread*, Memb_list*, int); +void nrn_jacob_capacitance(NrnThread*, Memb_list*, int); #define nparm 2 -void capac_reg_(void) { - int mechtype; - /* all methods deal with capacitance in special ways */ - register_mech(mechanism, cap_alloc, (mod_f_t)0, (mod_f_t)0, (mod_f_t)0, (mod_f_t)cap_init, -1, 1); - mechtype = nrn_get_mechtype(mechanism[1]); - _nrn_layout_reg(mechtype, LAYOUT); - hoc_register_prop_size(mechtype, nparm, 0); +void capacitance_reg(void) { + int mechtype; + /* all methods deal with capacitance in special ways */ + register_mech(mechanism, nrn_alloc_capacitance, (mod_f_t)0, (mod_f_t)0, (mod_f_t)0, + (mod_f_t)nrn_init_capacitance, -1, 1); + mechtype = nrn_get_mechtype(mechanism[1]); + _nrn_layout_reg(mechtype, LAYOUT); + hoc_register_prop_size(mechtype, nparm, 0); } -#define cm vdata[0*_STRIDE] -#define i_cap vdata[1*_STRIDE] +#define cm vdata[0 * _STRIDE] +#define i_cap vdata[1 * _STRIDE] /* cj is analogous to 1/dt for cvode and daspk @@ -65,69 +80,95 @@ for pure implicit fixed step it is 1/dt It used to be static but is now a thread data variable */ -void nrn_cap_jacob(NrnThread* _nt, Memb_list* ml) { - int _cntml_actual = ml->nodecount; - int _cntml_padded = ml->_nodecount_padded; - int _iml; - double *vdata; - double cfac = .001 * _nt->cj; - (void) _cntml_padded; /* unused when layout=1*/ - { /*if (use_cachevec) {*/ - int* ni = ml->nodeindices; +void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int type) { + (void)type; + int _cntml_actual = ml->nodecount; + int _cntml_padded = 
ml->_nodecount_padded; + int _iml; + double* vdata; + double cfac = .001 * _nt->cj; + (void)_cntml_padded; /* unused when layout=1*/ + + double* _vec_d = _nt->_actual_d; +#if defined(_OPENACC) + int stream_id = _nt->stream_id; +#endif + + { /*if (use_cachevec) {*/ + int* ni = ml->nodeindices; + #if LAYOUT == 1 /*AoS*/ - for (_iml=0; _iml < _cntml_actual; _iml++) { - vdata = ml->data + _iml*nparm; + for (_iml = 0; _iml < _cntml_actual; _iml++) { + vdata = ml->data + _iml * nparm; #else - vdata = ml->data; - for (_iml=0; _iml < _cntml_actual; _iml++) { + vdata = ml->data; + _PRAGMA_FOR_JACOB_ACC_LOOP_ + for (_iml = 0; _iml < _cntml_actual; _iml++) { #endif - VEC_D(ni[_iml]) += cfac*cm; - } - } + _vec_d[ni[_iml]] += cfac * cm; + } + } } -static void cap_init(NrnThread* _nt, Memb_list* ml, int type ) { - int _cntml_actual = ml->nodecount; - int _cntml_padded = ml->_nodecount_padded; - int _iml; - double *vdata; - (void)_nt; (void)type; (void) _cntml_padded; /* unused */ +void nrn_init_capacitance(NrnThread* _nt, Memb_list* ml, int type) { + (void)type; + int _cntml_actual = ml->nodecount; + int _cntml_padded = ml->_nodecount_padded; + int _iml; + double* vdata; + (void)_nt; + (void)type; + (void)_cntml_padded; /* unused */ + #if LAYOUT == 1 /*AoS*/ - for (_iml=0; _iml < _cntml_actual; _iml++) { - vdata = ml->data + _iml*nparm; + for (_iml = 0; _iml < _cntml_actual; _iml++) { + vdata = ml->data + _iml * nparm; #else - vdata = ml->data; - for (_iml=0; _iml < _cntml_actual; _iml++) { + vdata = ml->data; + _PRAGMA_FOR_INIT_ACC_LOOP_ + for (_iml = 0; _iml < _cntml_actual; _iml++) { #endif - i_cap = 0; - } + i_cap = 0; + } } -void nrn_capacity_current(NrnThread* _nt, Memb_list* ml) { - int _cntml_actual = ml->nodecount; - int _cntml_padded = ml->_nodecount_padded; - int _iml; - double *vdata; - double cfac = .001 * _nt->cj; - (void) _cntml_padded; /* unused when layout=1*/ - /* since rhs is dvm for a full or half implicit step */ - /* (nrn_update_2d() replaces dvi by dvi-dvx) */ - /* no need to distinguish secondorder */ - int* ni = ml->nodeindices; +void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int type) { + (void)type; + int _cntml_actual = ml->nodecount; + int _cntml_padded = ml->_nodecount_padded; + int _iml; + double* vdata; + double cfac = .001 * _nt->cj; + + /*@todo: verify cfac is being copied !! 
*/ + + (void)_cntml_padded; /* unused when layout=1*/ + + /* since rhs is dvm for a full or half implicit step */ + /* (nrn_update_2d() replaces dvi by dvi-dvx) */ + /* no need to distinguish secondorder */ + int* ni = ml->nodeindices; + double* _vec_rhs = _nt->_actual_rhs; +#if defined(_OPENACC) + int stream_id = _nt->stream_id; +#endif + #if LAYOUT == 1 /*AoS*/ - for (_iml=0; _iml < _cntml_actual; _iml++) { - vdata = ml->data + _iml*nparm; + for (_iml = 0; _iml < _cntml_actual; _iml++) { + vdata = ml->data + _iml * nparm; #else - vdata = ml->data; - for (_iml=0; _iml < _cntml_actual; _iml++) { + vdata = ml->data; + _PRAGMA_FOR_CUR_ACC_LOOP_ + for (_iml = 0; _iml < _cntml_actual; _iml++) { #endif - i_cap = cfac*cm*VEC_RHS(ni[_iml]); - } + i_cap = cfac * cm * _vec_rhs[ni[_iml]]; + } } /* the rest can be constructed automatically from the above info*/ -static void cap_alloc(double* data, Datum* pdata, int type) { - (void)pdata; (void)type; /* unused */ - data[0] = DEF_cm; /*default capacitance/cm^2*/ +void nrn_alloc_capacitance(double* data, Datum* pdata, int type) { + (void)pdata; + (void)type; /* unused */ + data[0] = DEF_cm; /*default capacitance/cm^2*/ } diff --git a/coreneuron/nrnoc/eion.c b/coreneuron/nrnoc/eion.c index 95d7a4648..9ff42b89e 100644 --- a/coreneuron/nrnoc/eion.c +++ b/coreneuron/nrnoc/eion.c @@ -28,10 +28,8 @@ THE POSSIBILITY OF SUCH DAMAGE. #include #include -#include "coreneuron/nrnconf.h" -#include "coreneuron/nrnoc/multicore.h" -#include "coreneuron/nrnoc/membdef.h" -#include "coreneuron/nrnoc/nrnoc_decl.h" + +#include "coreneuron/coreneuron.h" #if !defined(LAYOUT) /* 1 means AoS, >1 means AoSoA, <= 0 means SOA */ @@ -43,156 +41,213 @@ THE POSSIBILITY OF SUCH DAMAGE. #define _STRIDE _cntml_padded + _iml #endif +#if defined(_OPENACC) +#if defined(PG_ACC_BUGS) +#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ + _Pragma( \ + "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size][0:3]) if(nt->compute_gpu)") +#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ + _Pragma( \ + "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size][0:3]) if(nt->compute_gpu) async(stream_id)") +#else +#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ + _Pragma( \ + "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size]) if(nt->compute_gpu)") +#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ + _Pragma( \ + "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size]) if(nt->compute_gpu) async(stream_id)") +#endif +#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ \ + _Pragma( \ + "acc parallel loop present(pd[0:_cntml_padded*5], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") +#else +#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") +#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") +#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ _Pragma("") +#endif + extern void hoc_register_prop_size(int, int, int); -#define nparm 5 -static char *mechanism[] = { /*just a template*/ - "0", - "na_ion", - "ena", "nao", "nai", 0, - "ina", "dina_dv_", 0, - 0 -}; +#define nparm 5 +static char* mechanism[] = {/*just a template*/ + "0", "na_ion", "ena", "nao", "nai", 0, "ina", "dina_dv_", 0, 0}; -static void ion_alloc(); -static void ion_cur(NrnThread*, Memb_list*, int); -static void ion_init(NrnThread*, Memb_list*, int); +void nrn_cur_ion(NrnThread*, Memb_list*, int); +void nrn_init_ion(NrnThread*, Memb_list*, int); +void nrn_alloc_ion(double*, Datum*, int); double 
nrn_nernst(), nrn_ghk(); static int na_ion, k_ion, ca_ion; /* will get type for these special ions */ int nrn_is_ion(int type) { - return (memb_func[type].alloc == ion_alloc); + // Old: commented to remove dependency on memb_func and alloc function + // return (memb_func[type].alloc == ion_alloc); + return (type < nrn_ion_global_map_size // type smaller than largest ion's + && nrn_ion_global_map[type] != NULL); // allocated ion charge variables } -static int ion_global_map_size; -static double** ion_global_map; -#define global_conci(type) ion_global_map[type][0] -#define global_conco(type) ion_global_map[type][1] -#define global_charge(type) ion_global_map[type][2] +int nrn_ion_global_map_size; +double** nrn_ion_global_map; +#define global_conci(type) nrn_ion_global_map[type][0] +#define global_conco(type) nrn_ion_global_map[type][1] +#define global_charge(type) nrn_ion_global_map[type][2] double nrn_ion_charge(int type) { - return global_charge(type); + return global_charge(type); } void ion_reg(const char* name, double valence) { - int i, mechtype; - char buf[7][50]; - double val; + int i, mechtype; + char buf[7][50]; + double val; #define VAL_SENTINAL -10000. - Sprintf(buf[0], "%s_ion", name); - Sprintf(buf[1], "e%s", name); - Sprintf(buf[2], "%si", name); - Sprintf(buf[3], "%so", name); - Sprintf(buf[5], "i%s", name); - Sprintf(buf[6], "di%s_dv_", name); - for (i=0; i<7; i++) { - mechanism[i+1] = buf[i]; - } - mechanism[5] = (char *)0; /* buf[4] not used above */ - mechtype = nrn_get_mechtype(buf[0]); - if (memb_func[mechtype].alloc != ion_alloc) { - register_mech((const char**)mechanism, ion_alloc, ion_cur, (mod_f_t)0, (mod_f_t)0, (mod_f_t)ion_init, -1, 1); - mechtype = nrn_get_mechtype(mechanism[1]); - _nrn_layout_reg(mechtype, LAYOUT); - hoc_register_prop_size(mechtype, nparm, 1 ); - nrn_writes_conc(mechtype, 1); - if (ion_global_map_size <= mechtype) { - ion_global_map_size = mechtype + 1; - ion_global_map = (double**)erealloc(ion_global_map, - sizeof(double*)*ion_global_map_size); - } - ion_global_map[mechtype] = (double*)emalloc(3*sizeof(double)); - Sprintf(buf[0], "%si0_%s", name, buf[0]); - Sprintf(buf[1], "%so0_%s", name, buf[0]); - if (strcmp("na", name) == 0) { - na_ion = mechtype; - global_conci(mechtype) = DEF_nai; - global_conco(mechtype) = DEF_nao; - global_charge(mechtype) = 1.; - }else if (strcmp("k", name) == 0) { - k_ion = mechtype; - global_conci(mechtype) = DEF_ki; - global_conco(mechtype) = DEF_ko; - global_charge(mechtype) = 1.; - }else if (strcmp("ca", name) == 0) { - ca_ion = mechtype; - global_conci(mechtype) = DEF_cai; - global_conco(mechtype) = DEF_cao; - global_charge(mechtype) = 2.; - }else{ - global_conci(mechtype) = DEF_ioni; - global_conco(mechtype) = DEF_iono; - global_charge(mechtype) = VAL_SENTINAL; - } - } - val = global_charge(mechtype); - if (valence != VAL_SENTINAL && val != VAL_SENTINAL && valence != val) { - fprintf(stderr, "%s ion valence defined differently in\n\ + sprintf(buf[0], "%s_ion", name); + sprintf(buf[1], "e%s", name); + sprintf(buf[2], "%si", name); + sprintf(buf[3], "%so", name); + sprintf(buf[5], "i%s", name); + sprintf(buf[6], "di%s_dv_", name); + for (i = 0; i < 7; i++) { + mechanism[i + 1] = buf[i]; + } + mechanism[5] = (char*)0; /* buf[4] not used above */ + mechtype = nrn_get_mechtype(buf[0]); + if (mechtype >= nrn_ion_global_map_size || + nrn_ion_global_map[mechtype] == NULL) { // if hasn't yet been allocated + + // allocates mem for ion in ion_map and sets null all non-ion types + if (nrn_ion_global_map_size <= mechtype) { 
+ int size = mechtype + 1; + nrn_ion_global_map = (double**)erealloc(nrn_ion_global_map, sizeof(double*) * size); + + for (i = nrn_ion_global_map_size; i < mechtype; i++) { + nrn_ion_global_map[i] = NULL; + } + nrn_ion_global_map_size = mechtype + 1; + } + nrn_ion_global_map[mechtype] = (double*)emalloc(3 * sizeof(double)); + + register_mech((const char**)mechanism, nrn_alloc_ion, nrn_cur_ion, (mod_f_t)0, (mod_f_t)0, + (mod_f_t)nrn_init_ion, -1, 1); + mechtype = nrn_get_mechtype(mechanism[1]); + _nrn_layout_reg(mechtype, LAYOUT); + hoc_register_prop_size(mechtype, nparm, 1); + hoc_register_dparam_semantics(mechtype, 0, "iontype"); + nrn_writes_conc(mechtype, 1); + + sprintf(buf[0], "%si0_%s", name, buf[0]); + sprintf(buf[1], "%so0_%s", name, buf[0]); + if (strcmp("na", name) == 0) { + na_ion = mechtype; + global_conci(mechtype) = DEF_nai; + global_conco(mechtype) = DEF_nao; + global_charge(mechtype) = 1.; + } else if (strcmp("k", name) == 0) { + k_ion = mechtype; + global_conci(mechtype) = DEF_ki; + global_conco(mechtype) = DEF_ko; + global_charge(mechtype) = 1.; + } else if (strcmp("ca", name) == 0) { + ca_ion = mechtype; + global_conci(mechtype) = DEF_cai; + global_conco(mechtype) = DEF_cao; + global_charge(mechtype) = 2.; + } else { + global_conci(mechtype) = DEF_ioni; + global_conco(mechtype) = DEF_iono; + global_charge(mechtype) = VAL_SENTINAL; + } + } + val = global_charge(mechtype); + if (valence != VAL_SENTINAL && val != VAL_SENTINAL && valence != val) { + fprintf(stderr, + "%s ion valence defined differently in\n\ two USEION statements (%g and %g)\n", - buf[0], valence, global_charge(mechtype)); - nrn_exit(1); - }else if (valence == VAL_SENTINAL && val == VAL_SENTINAL) { - fprintf(stderr, "%s ion valence must be defined in\n\ -the USEION statement of any model using this ion\n", buf[0]); - nrn_exit(1); - }else if (valence != VAL_SENTINAL) { - global_charge(mechtype) = valence; - } + buf[0], valence, global_charge(mechtype)); + nrn_exit(1); + } else if (valence == VAL_SENTINAL && val == VAL_SENTINAL) { + fprintf(stderr, + "%s ion valence must be defined in\n\ +the USEION statement of any model using this ion\n", + buf[0]); + nrn_exit(1); + } else if (valence != VAL_SENTINAL) { + global_charge(mechtype) = valence; + } } #define FARADAY 96485.309 -#define ktf (1000.*8.3134*(celsius + 273.15)/FARADAY) -double nrn_nernst(ci, co, z) double z, ci, co; { -/*printf("nrn_nernst %g %g %g\n", ci, co, z);*/ - if (z == 0) { - return 0.; - } - if (ci <= 0.) { - return 1e6; - }else if (co <= 0.) { - return -1e6; - }else{ - return ktf/z*log(co/ci); - } +#define ktf (1000. * 8.3134 * (celsius + 273.15) / FARADAY) + +#pragma acc routine seq +double nrn_nernst(double ci, double co, double z, double celsius) { + /*printf("nrn_nernst %g %g %g\n", ci, co, z);*/ + if (z == 0) { + return 0.; + } + if (ci <= 0.) { + return 1e6; + } else if (co <= 0.) 
{ + return -1e6; + } else { + return ktf / z * log(co / ci); + } } -void nrn_wrote_conc(int type, double* p1, int p2, int it, NrnThread* nt) { - if (it & 04) { +#pragma acc routine seq +void nrn_wrote_conc(int type, + double* p1, + int p2, + int it, + double** gimap, + double celsius, + int _cntml_padded) { +#ifndef _OPENACC + static int flag = 1; + extern int nrnmpi_myid; + if (flag && nrnmpi_myid == 0) { + /** need to check this as this kernel was failing */ + printf("\n WARNING: nrn_nrn_wrote_conc support on GPU need to validate!\n"); + flag = 0; + } +#endif + if (it & 04) { #if LAYOUT <= 0 /* SoA */ - int _iml = 0; - int _cntml_padded = nt->_ml_list[type]->_nodecount_padded; -#else /* nt is unused */ - assert(nt); + int _iml = 0; +/* passing _nt to this function causes cray compiler to segfault during compilation + * hence passing _cntml_padded + */ +#else + (void)_cntml_padded; #endif - double* pe = p1 - p2*_STRIDE; - pe[0] = nrn_nernst(pe[1*_STRIDE], pe[2*_STRIDE], nrn_ion_charge(type)); - } + double* pe = p1 - p2 * _STRIDE; + pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius); + } } static double efun(double x) { - if (fabs(x) < 1e-4) { - return 1. - x/2.; - }else{ - return x/(exp(x) - 1); - } + if (fabs(x) < 1e-4) { + return 1. - x / 2.; + } else { + return x / (exp(x) - 1); + } } double nrn_ghk(double v, double ci, double co, double z) { - double eco, eci, temp; - temp = z*v/ktf; - eco = co*efun(temp); - eci = ci*efun(-temp); - return (.001)*z*FARADAY*(eci - eco); + double eco, eci, temp; + temp = z * v / ktf; + eco = co * efun(temp); + eci = ci * efun(-temp); + return (.001) * z * FARADAY * (eci - eco); } #if VECTORIZE -#define erev pd[0*_STRIDE] /* From Eion */ -#define conci pd[1*_STRIDE] -#define conco pd[2*_STRIDE] -#define cur pd[3*_STRIDE] -#define dcurdv pd[4*_STRIDE] +#define erev pd[0 * _STRIDE] /* From Eion */ +#define conci pd[1 * _STRIDE] +#define conco pd[2 * _STRIDE] +#define cur pd[3 * _STRIDE] +#define dcurdv pd[4 * _STRIDE] /* handle erev, conci, conc0 "in the right way" according to ion_style @@ -203,13 +258,13 @@ ion_style("name_ion", [c_style, e_style, einit, eadvance, cinit]) eca is parameter but if conc exists then eca is assigned if conc is nrnocCONST then eca calculated on finitialize if conc is STATE then eca calculated on fadvance and conc finitialize - with global nai0, nao0 + with global nai0, nao0 nernst(ci, co, charge) and ghk(v, ci, co, charge) available to hoc and models. 
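   As a worked example (editorial illustration using the membdef.h defaults,
   not part of the original comment): at celsius = 6.3 the factor ktf is about
   24.08 mV, so nernst(DEF_ki = 54.4, DEF_ko = 2.5, charge = 1) evaluates to
   roughly 24.08 * log(2.5/54.4), i.e. about -74 mV for ek.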
*/ -#define iontype ppd[0] /* how _AMBIGUOUS is to be handled */ +#define iontype ppd[0] /* how _AMBIGUOUS is to be handled */ /*the bitmap is 03 concentration unused, nrnocCONST, DEP, STATE 04 initialize concentrations @@ -224,106 +279,117 @@ ion_style("name_ion", [c_style, e_style, einit, eadvance, cinit]) #define conci0 global_conci(type) #define conco0 global_conco(type) -double nrn_nernst_coef(type) int type; { - /* for computing jacobian element dconc'/dconc */ - return ktf/charge; +double nrn_nernst_coef(int type) { + /* for computing jacobian element dconc'/dconc */ + return ktf / charge; } - /* Must be called prior to any channels which update the currents */ -static void ion_cur(NrnThread* nt, Memb_list* ml, int type) { - int _cntml_actual = ml->nodecount; - int _iml; - double* pd; Datum* ppd; - (void)nt; /* unused */ +void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { + int _cntml_actual = ml->nodecount; + int _iml; + double* pd; + Datum* ppd; + (void)nt; /* unused */ +#if defined(_OPENACC) + int stream_id = nt->stream_id; +#endif /*printf("ion_cur %s\n", memb_func[type].sym->name);*/ #if LAYOUT == 1 /*AoS*/ - for (_iml = 0; _iml < _cntml_actual; ++_iml) { - pd = ml->data + _iml*nparm; ppd = ml->pdata + _iml*1; -#endif -#if LAYOUT == 0 /*SoA*/ - int _cntml_padded = ml->_nodecount_padded; - pd = ml->data; ppd = ml->pdata; - for (_iml = 0; _iml < _cntml_actual; ++_iml) { -#endif -#if LAYOUT > 1 /*AoSoA*/ + for (_iml = 0; _iml < _cntml_actual; ++_iml) { + pd = ml->data + _iml * nparm; + ppd = ml->pdata + _iml * 1; +#elif LAYOUT == 0 /*SoA*/ + int _cntml_padded = ml->_nodecount_padded; + pd = ml->data; + ppd = ml->pdata; + _PRAGMA_FOR_CUR_ACC_LOOP_ + for (_iml = 0; _iml < _cntml_actual; ++_iml) { +#else /* if LAYOUT > 1 */ /*AoSoA*/ #error AoSoA not implemented. #endif - dcurdv = 0.; - cur = 0.; - if (iontype & 0100) { - erev = nrn_nernst(conci, conco, charge); - } - }; + dcurdv = 0.; + cur = 0.; + if (iontype & 0100) { + erev = nrn_nernst(conci, conco, charge, celsius); + } + }; } /* Must be called prior to other models which possibly also initialize - concentrations based on their own states + concentrations based on their own states */ -static void ion_init(NrnThread* nt, Memb_list* ml, int type) { - int _cntml_actual = ml->nodecount; - int _iml; - double* pd; Datum* ppd; - (void)nt; /* unused */ +void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { + int _cntml_actual = ml->nodecount; + int _iml; + double* pd; + Datum* ppd; + (void)nt; /* unused */ /*printf("ion_init %s\n", memb_func[type].sym->name);*/ #if LAYOUT == 1 /*AoS*/ - for (_iml = 0; _iml < _cntml_actual; ++_iml) { - pd = ml->data + _iml*nparm; ppd = ml->pdata + _iml*1; -#endif -#if LAYOUT == 0 /*SoA*/ - int _cntml_padded = ml->_nodecount_padded; - pd = ml->data; ppd = ml->pdata; - for (_iml = 0; _iml < _cntml_actual; ++_iml) { -#endif -#if LAYOUT > 1 /*AoSoA*/ + for (_iml = 0; _iml < _cntml_actual; ++_iml) { + pd = ml->data + _iml * nparm; + ppd = ml->pdata + _iml * 1; +#elif LAYOUT == 0 /*SoA*/ + int _cntml_padded = ml->_nodecount_padded; + pd = ml->data; + ppd = ml->pdata; + _PRAGMA_FOR_INIT_ACC_LOOP_ + for (_iml = 0; _iml < _cntml_actual; ++_iml) { +#else /* if LAYOUT > 1 */ /*AoSoA*/ #error AoSoA not implemented. 
#endif - if (iontype & 04) { - conci = conci0; - conco = conco0; - } - if (iontype & 040) { - erev = nrn_nernst(conci, conco, charge); - } - } + if (iontype & 04) { + conci = conci0; + conco = conco0; + } + if (iontype & 040) { + erev = nrn_nernst(conci, conco, charge, celsius); + } + } } -static void ion_alloc() { - assert(0); +void nrn_alloc_ion(double* p, Datum* ppvar, int _type) { + assert(0); } void second_order_cur(NrnThread* _nt) { - extern int secondorder; - NrnThreadMembList* tml; - Memb_list* ml; - int _iml, _cntml_actual; + extern int secondorder; + NrnThreadMembList* tml; + Memb_list* ml; + int _iml, _cntml_actual; #if LAYOUT == 0 - int _cntml_padded; -#endif - int* ni; - double* pd; - (void)_nt; /* unused */ - if (secondorder == 2) { - for (tml = _nt->tml; tml; tml = tml->next) if (memb_func[tml->index].alloc == ion_alloc) { - ml = tml->ml; - _cntml_actual = ml->nodecount; - ni = ml->nodeindices; -#if LAYOUT == 1 /*AoS*/ - for (_iml = 0; _iml < _cntml_actual; ++_iml) { - pd = ml->data + _iml*nparm; + int _cntml_padded; #endif -#if LAYOUT == 0 /*SoA*/ - _cntml_padded = ml->_nodecount_padded; - pd = ml->data; - for (_iml = 0; _iml < _cntml_actual; ++_iml) { + int* ni; + double* pd; + (void)_nt; /* unused */ +#if defined(_OPENACC) + int stream_id = _nt->stream_id; #endif -#if LAYOUT > 1 /*AoSoA*/ + double* _vec_rhs = _nt->_actual_rhs; + + if (secondorder == 2) { + for (tml = _nt->tml; tml; tml = tml->next) + if (nrn_is_ion(tml->index)) { + ml = tml->ml; + _cntml_actual = ml->nodecount; + ni = ml->nodeindices; +#if LAYOUT == 1 /*AoS*/ + for (_iml = 0; _iml < _cntml_actual; ++_iml) { + pd = ml->data + _iml * nparm; +#elif LAYOUT == 0 /*SoA*/ + _cntml_padded = ml->_nodecount_padded; + pd = ml->data; + _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ + for (_iml = 0; _iml < _cntml_actual; ++_iml) { +#else /* if LAYOUT > 1 */ /*AoSoA*/ #error AoSoA not implemented. #endif - cur += dcurdv * ( VEC_RHS(ni[_iml]) ); - } - } - } + cur += dcurdv * (_vec_rhs[ni[_iml]]); + } + } + } } #endif diff --git a/coreneuron/nrnoc/fadvance_core.c b/coreneuron/nrnoc/fadvance_core.c index 8ec181d28..c4fcc7d10 100644 --- a/coreneuron/nrnoc/fadvance_core.c +++ b/coreneuron/nrnoc/fadvance_core.c @@ -30,39 +30,50 @@ THE POSSIBILITY OF SUCH DAMAGE. 
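/* Illustrative note (editorial addition, not part of this patch): the
   second_order_cur() loop just above is the Crank-Nicolson current
   correction.  For secondorder == 2, update() applies v += 2*rhs, so rhs
   holds the half-step voltage increment v(t+dt/2) - v(t); adding
   dcurdv * rhs to each ionic current therefore shifts it to t+dt/2,

       i(t+dt/2) ~ i(t) + (di/dv) * (v(t+dt/2) - v(t)),

   which is the cur += dcurdv * _vec_rhs[node] statement above and matches the
   membdef.h remark that secondorder == 2 means "current adjusted to t+dt/2". */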
#include "coreneuron/nrnoc/multicore.h" #include "coreneuron/nrnmpi/nrnmpi.h" #include "coreneuron/nrnoc/nrnoc_decl.h" +#include "coreneuron/nrniv/nrn_acc_manager.h" static void* nrn_fixed_step_thread(NrnThread*); +static void* nrn_fixed_step_lastpart(NrnThread*); static void* nrn_fixed_step_group_thread(NrnThread*); static void update(NrnThread*); static void nonvint(NrnThread*); +extern void nrn_flush_reports(double); void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ if (adt != nrn_threads[0]._dt) { - int i; - for (i=0; i < nrn_nthread; ++i) { - NrnThread* nt = nrn_threads + i; - nt->_t = t; - nt->_dt = dt; - if (secondorder) { - nt->cj = 2.0/dt; - }else{ - nt->cj = 1.0/dt; - } - } + int i; + for (i = 0; i < nrn_nthread; ++i) { + NrnThread* nt = nrn_threads + i; + nt->_t = t; + nt->_dt = dt; + if (secondorder) { + nt->cj = 2.0 / dt; + } else { + nt->cj = 1.0 / dt; + } + } } } -void nrn_fixed_step_minimal() { - if (t != nrn_threads->_t) { - dt2thread(-1.); - }else{ - dt2thread(dt); - } -/*printf("nrn_fixed_step_minimal t=%g\n", t);*/ - nrn_thread_table_check(); - nrn_multithread_job(nrn_fixed_step_thread); - nrn_spike_exchange(nrn_threads); - t = nrn_threads[0]._t; +void nrn_fixed_step_minimal() { /* not so minimal anymore with gap junctions */ + if (t != nrn_threads->_t) { + dt2thread(-1.); + } else { + dt2thread(dt); + } + /*printf("nrn_fixed_step_minimal t=%g\n", t);*/ + nrn_thread_table_check(); + nrn_multithread_job(nrn_fixed_step_thread); + if (nrn_have_gaps) { + nrnmpi_v_transfer(); + nrn_multithread_job(nrn_fixed_step_lastpart); + } +#if NRNMPI + if (nrn_threads[0]._stop_stepping) { + nrn_spike_exchange(nrn_threads); + } +#endif + t = nrn_threads[0]._t; } /* better cache efficiency since a thread can do an entire minimum delay @@ -73,94 +84,160 @@ static int step_group_begin; static int step_group_end; void nrn_fixed_step_group_minimal(int n) { - dt2thread(dt); - nrn_thread_table_check(); - step_group_n = n; - step_group_begin = 0; - step_group_end = 0; - while(step_group_end < step_group_n) { -/*printf("step_group_end=%d step_group_n=%d\n", step_group_end, step_group_n);*/ - nrn_multithread_job(nrn_fixed_step_group_thread); - nrn_spike_exchange(nrn_threads); - if (stoprun) { break; } - step_group_begin = step_group_end; - } - t = nrn_threads[0]._t; + static int step = 0; + dt2thread(dt); + nrn_thread_table_check(); + step_group_n = n; + step_group_begin = 0; + step_group_end = 0; + while (step_group_end < step_group_n) { + nrn_multithread_job(nrn_fixed_step_group_thread); +#if NRNMPI + nrn_spike_exchange(nrn_threads); +#endif + +#ifdef ENABLE_REPORTING + nrn_flush_reports(nrn_threads[0]._t); +#endif + if (stoprun) { + break; + } + step_group_begin = step_group_end; + step++; + + //@TODO: flush/optimize/better way + if (nrnmpi_myid == 0) { + float completed = (((float)step_group_end / step_group_n) * 100.0); + printf(" Completed %.2f, t = %lf\r", completed, nrn_threads[0]._t); + fflush(stdout); + } + } + t = nrn_threads[0]._t; } static void* nrn_fixed_step_group_thread(NrnThread* nth) { - int i; - nth->_stop_stepping = 0; - for (i = step_group_begin; i < step_group_n; ++i) { - nrn_fixed_step_thread(nth); - if (nth->_stop_stepping) { - if (nth->id == 0) { step_group_end = i + 1; } - nth->_stop_stepping = 0; - return (void*)0; - } - } - if (nth->id == 0) { step_group_end = step_group_n; } - return (void*)0; + int i; + nth->_stop_stepping = 0; + for (i = step_group_begin; i < step_group_n; ++i) { + nrn_fixed_step_thread(nth); + if (nth->_stop_stepping) { + if (nth->id 
== 0) { + step_group_end = i + 1; + } + nth->_stop_stepping = 0; + return (void*)0; + } + } + if (nth->id == 0) { + step_group_end = step_group_n; + } + return (void*)0; } -static void update(NrnThread* _nt){ - int i, i1, i2; - i1 = 0; - i2 = _nt->end; - /* do not need to worry about linmod or extracellular*/ - if (secondorder) { - for (i=i1; i < i2; ++i) { - VEC_V(i) += 2.*VEC_RHS(i); - } - }else{ - for (i=i1; i < i2; ++i) { - VEC_V(i) += VEC_RHS(i); - } - } - if (_nt->tml) { - assert(_nt->tml->index == CAP); - nrn_capacity_current(_nt, _nt->tml->ml); - } +static void update(NrnThread* _nt) { + int i, i1, i2; + i1 = 0; + i2 = _nt->end; +#if defined(_OPENACC) + int stream_id = _nt->stream_id; +#endif + double* vec_v = &(VEC_V(0)); + double* vec_rhs = &(VEC_RHS(0)); + + /* do not need to worry about linmod or extracellular*/ + if (secondorder) { + #pragma acc parallel loop present(vec_v[0 : i2], \ + vec_rhs[0 : i2]) if (_nt->compute_gpu) async(stream_id) + for (i = i1; i < i2; ++i) { + vec_v[i] += 2. * vec_rhs[i]; + } + } else { + #pragma acc parallel loop present(vec_v[0 : i2], \ + vec_rhs[0 : i2]) if (_nt->compute_gpu) async(stream_id) + for (i = i1; i < i2; ++i) { + vec_v[i] += vec_rhs[i]; + } + } + + // update_matrix_to_gpu(_nt); + if (_nt->tml) { + assert(_nt->tml->index == CAP); + nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index); + } } static void nonvint(NrnThread* _nt) { - NrnThreadMembList* tml; - errno = 0; - for (tml = _nt->tml; tml; tml = tml->next) if (memb_func[tml->index].state) { - mod_f_t s = memb_func[tml->index].state; - (*s)(_nt, tml->ml, tml->index); + NrnThreadMembList* tml; + if (nrn_have_gaps) { + nrnthread_v_transfer(_nt); + } + errno = 0; + + for (tml = _nt->tml; tml; tml = tml->next) + if (memb_func[tml->index].state) { + mod_f_t s = memb_func[tml->index].state; + (*s)(_nt, tml->ml, tml->index); #ifdef DEBUG - if (errno) { -hoc_warning("errno set during calculation of states", (char*)0); - } + if (errno) { + hoc_warning("errno set during calculation of states", (char*)0); + } #endif - } + } } -void nrn_ba(NrnThread* nt, int bat){ - NrnThreadBAList* tbl; - for (tbl = nt->tbl[bat]; tbl; tbl = tbl->next) { - mod_f_t f = tbl->bam->f; - int type = tbl->bam->type; - Memb_list* ml = tbl->ml; - (*f)(nt, ml, type); - } +void nrn_ba(NrnThread* nt, int bat) { + NrnThreadBAList* tbl; + for (tbl = nt->tbl[bat]; tbl; tbl = tbl->next) { + mod_f_t f = tbl->bam->f; + int type = tbl->bam->type; + Memb_list* ml = tbl->ml; + (*f)(nt, ml, type); + } } static void* nrn_fixed_step_thread(NrnThread* nth) { - deliver_net_events(nth); - nth->_t += .5 * nth->_dt; - fixed_play_continuous(nth); - setup_tree_matrix_minimal(nth); - nrn_solve_minimal(nth); - second_order_cur(nth); - update(nth); - nth->_t += .5 * nth->_dt; - fixed_play_continuous(nth); - nonvint(nth); - nrn_ba(nth, AFTER_SOLVE); - nrn_deliver_events(nth) ; /* up to but not past texit */ - return (void*)0; + /* check thresholds and deliver all (including binqueue) + events up to t+dt/2 */ + deliver_net_events(nth); + nth->_t += .5 * nth->_dt; + + if (nth->ncell) { +#if defined(_OPENACC) + int stream_id = nth->stream_id; + /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ + #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id) + #pragma acc wait(stream_id) +#endif + + fixed_play_continuous(nth); + setup_tree_matrix_minimal(nth); + nrn_solve_minimal(nth); + second_order_cur(nth); + update(nth); + } + if (!nrn_have_gaps) { + nrn_fixed_step_lastpart(nth); + } + return 
(void*)0; } +static void* nrn_fixed_step_lastpart(NrnThread* nth) { + nth->_t += .5 * nth->_dt; + + if (nth->ncell) { +#if defined(_OPENACC) + int stream_id = nth->stream_id; + /*@todo: do we need to update nth->_t on GPU */ + #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id) + #pragma acc wait(stream_id) +#endif + + fixed_play_continuous(nth); + nonvint(nth); + nrn_ba(nth, AFTER_SOLVE); + } + + nrn_deliver_events(nth); /* up to but not past texit */ + return (void*)0; +} diff --git a/coreneuron/nrnoc/finitialize.c b/coreneuron/nrnoc/finitialize.c index a359ad60c..d31408673 100644 --- a/coreneuron/nrnoc/finitialize.c +++ b/coreneuron/nrnoc/finitialize.c @@ -31,61 +31,71 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrnoc/nrnoc_decl.h" void nrn_finitialize(int setv, double v) { - int i; - NrnThread* _nt; + int i; + NrnThread* _nt; - t = 0.; - dt2thread(-1.); - nrn_thread_table_check(); - clear_event_queue(); - nrn_spike_exchange_init(); + t = 0.; + dt2thread(-1.); + nrn_thread_table_check(); + clear_event_queue(); + nrn_spike_exchange_init(); #if VECTORIZE - nrn_play_init(); /* Vector.play */ - ///Play events should be executed before initializing events - for (i=0; i < nrn_nthread; ++i) { - nrn_deliver_events(nrn_threads + i); /* The play events at t=0 */ - } - if (setv) { - for (_nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) { - for (i=0; i < _nt->end; ++i) { - VEC_V(i) = v; + nrn_play_init(); /* Vector.play */ + /// Play events should be executed before initializing events + for (i = 0; i < nrn_nthread; ++i) { + nrn_deliver_events(nrn_threads + i); /* The play events at t=0 */ + } + if (setv) { + for (_nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) { + double* vec_v = &(VEC_V(0)); + #pragma acc parallel loop present(_nt[0 : 1], vec_v[0 : _nt->end]) if (_nt->compute_gpu) + for (i = 0; i < _nt->end; ++i) { + vec_v[i] = v; } - } } - for (i=0; i < nrn_nthread; ++i) { - nrn_ba(nrn_threads + i, BEFORE_INITIAL); - } - /* the INITIAL blocks are ordered so that mechanisms that write - concentrations are after ions and before mechanisms that read - concentrations. - */ - /* the memblist list in NrnThread is already so ordered */ - for (i=0; i < nrn_nthread; ++i) { - NrnThread* nt = nrn_threads + i; - NrnThreadMembList* tml; - for (tml = nt->tml; tml; tml = tml->next) { - mod_f_t s = memb_func[tml->index].initialize; - if (s) { - (*s)(nt, tml->ml, tml->index); - } - } - } -#endif + } + + if (nrn_have_gaps) { + nrnmpi_v_transfer(); + for (i = 0; i < nrn_nthread; ++i) { + nrnthread_v_transfer(nrn_threads + i); + } + } - init_net_events(); - for (i = 0; i < nrn_nthread; ++i) { - nrn_ba(nrn_threads + i, AFTER_INITIAL); - } - for (i=0; i < nrn_nthread; ++i) { - nrn_deliver_events(nrn_threads + i); /* The INITIAL sent events at t=0 */ - } - for (i=0; i < nrn_nthread; ++i) { - setup_tree_matrix_minimal(nrn_threads + i); + for (i = 0; i < nrn_nthread; ++i) { + nrn_ba(nrn_threads + i, BEFORE_INITIAL); + } + /* the INITIAL blocks are ordered so that mechanisms that write + concentrations are after ions and before mechanisms that read + concentrations. 
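   (Concrete illustration, added editorially: the ca_ion INITIAL runs first and
   sets cai, cao and eca; a pump or accumulation mechanism whose USEION
   statement WRITEs cai runs next; and only after that are mechanisms whose
   INITIAL blocks READ cai or eca initialized.)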
+ */ + /* the memblist list in NrnThread is already so ordered */ + for (i = 0; i < nrn_nthread; ++i) { + NrnThread* nt = nrn_threads + i; + NrnThreadMembList* tml; + for (tml = nt->tml; tml; tml = tml->next) { + mod_f_t s = memb_func[tml->index].initialize; + if (s) { + (*s)(nt, tml->ml, tml->index); + } } - for (i=0; i < nrn_nthread; ++i) { - nrn_deliver_events(nrn_threads + i); /* The record events at t=0 */ - } + } +#endif + + init_net_events(); + for (i = 0; i < nrn_nthread; ++i) { + nrn_ba(nrn_threads + i, AFTER_INITIAL); + } + for (i = 0; i < nrn_nthread; ++i) { + nrn_deliver_events(nrn_threads + i); /* The INITIAL sent events at t=0 */ + } + for (i = 0; i < nrn_nthread; ++i) { + setup_tree_matrix_minimal(nrn_threads + i); + } + for (i = 0; i < nrn_nthread; ++i) { + nrn_deliver_events(nrn_threads + i); /* The record events at t=0 */ + } #if NRNMPI - nrn_spike_exchange(nrn_threads); + nrn_spike_exchange(nrn_threads); #endif } diff --git a/coreneuron/nrnoc/md1redef.h b/coreneuron/nrnoc/md1redef.h index 9c44d8cb8..3f74af080 100644 --- a/coreneuron/nrnoc/md1redef.h +++ b/coreneuron/nrnoc/md1redef.h @@ -48,6 +48,10 @@ THE POSSIBILITY OF SUCH DAMAGE. #undef prop #undef nodecount #undef type +#undef pval +#undef id +#undef weights +#undef weight_index_ #define NrnThread _NrnThread #define Memb_list _Memb_list @@ -60,5 +64,7 @@ THE POSSIBILITY OF SUCH DAMAGE. #define type _type #define pval _pval #define id _id +#define weights _weights +#define weight_index_ _weight_index #endif diff --git a/coreneuron/nrnoc/md2redef.h b/coreneuron/nrnoc/md2redef.h index ee901169f..efcabb1b9 100644 --- a/coreneuron/nrnoc/md2redef.h +++ b/coreneuron/nrnoc/md2redef.h @@ -49,6 +49,8 @@ THE POSSIBILITY OF SUCH DAMAGE. #undef prop #undef nodecount #undef pval +#undef weights +#undef weight_index_ #undef type #undef id diff --git a/coreneuron/nrnoc/membdef.h b/coreneuron/nrnoc/membdef.h index 3a8e53dc2..408a28666 100644 --- a/coreneuron/nrnoc/membdef.h +++ b/coreneuron/nrnoc/membdef.h @@ -29,53 +29,55 @@ THE POSSIBILITY OF SUCH DAMAGE. /* /local/src/master/nrn/src/nrnoc/membdef.h,v 1.2 1995/02/13 20:20:42 hines Exp */ /* numerical parameters */ -#define DEF_nseg 1 /* default number of segments per section*/ -#define DEF_dt .025 /* ms */ -#define DEF_secondorder 0 /* >0 means crank-nicolson. 2 means current - adjusted to t+dt/2 */ +#define DEF_nseg 1 /* default number of segments per section*/ +#define DEF_dt .025 /* ms */ +#define DEF_rev_dt 1. / DEF_dt /* 1/ms */ +#define DEF_secondorder \ + 0 /* >0 means crank-nicolson. 2 means current \ + adjusted to t+dt/2 */ /*global parameters */ -#define DEF_Ra 35.4 /* ohm-cm */ /*changed from 34.5 on 1/6/95*/ -#define DEF_celsius 6.3 /* deg-C */ +#define DEF_Ra 35.4 /* ohm-cm */ /*changed from 34.5 on 1/6/95*/ +#define DEF_celsius 6.3 /* deg-C */ -#define DEF_vrest -65. /* mV */ +#define DEF_vrest -65. /* mV */ /* old point process parameters */ /* fclamp */ -#define DEF_clamp_resist 1e-3 /* megohm */ +#define DEF_clamp_resist 1e-3 /* megohm */ /* Parameters that are used in mechanism _alloc() procedures */ /* cable */ -#define DEF_L 100. /* microns */ -#define DEF_rallbranch 1. +#define DEF_L 100. /* microns */ +#define DEF_rallbranch 1. /* morphology */ -#define DEF_diam 500. /* microns */ +#define DEF_diam 500. /* microns */ /* capacitance */ -#define DEF_cm 1. /* uF/cm^2 */ +#define DEF_cm 1. 
/* uF/cm^2 */ /* fast passive (e_p and g_p)*/ -#define DEF_e DEF_vrest /* mV */ -#define DEF_g 5.e-4 /* S/cm^2 */ +#define DEF_e DEF_vrest /* mV */ +#define DEF_g 5.e-4 /* S/cm^2 */ /* na_ion */ -#define DEF_nai 10. /* mM */ -#define DEF_nao 140. /* mM */ -#define DEF_ena (115. + DEF_vrest) /* mV */ +#define DEF_nai 10. /* mM */ +#define DEF_nao 140. /* mM */ +#define DEF_ena (115. + DEF_vrest) /* mV */ /* k_ion */ -#define DEF_ki 54.4 /* mM */ -#define DEF_ko 2.5 /* mM */ -#define DEF_ek (-12. + DEF_vrest) /* mV */ +#define DEF_ki 54.4 /* mM */ +#define DEF_ko 2.5 /* mM */ +#define DEF_ek (-12. + DEF_vrest) /* mV */ /* ca_ion -> any program that uses DEF_eca must include */ -#define DEF_cai 5.e-5 /* mM */ -#define DEF_cao 2. /* mM */ +#define DEF_cai 5.e-5 /* mM */ +#define DEF_cao 2. /* mM */ #include -#define DEF_eca 12.5 *log(DEF_cao / DEF_cai) /* mV */ +#define DEF_eca 12.5 * log(DEF_cao / DEF_cai) /* mV */ /* default ion values */ -#define DEF_ioni 1. /* mM */ -#define DEF_iono 1. /* mM */ -#define DEF_eion 0. /* mV */ +#define DEF_ioni 1. /* mM */ +#define DEF_iono 1. /* mM */ +#define DEF_eion 0. /* mV */ diff --git a/coreneuron/nrnoc/membfunc.h b/coreneuron/nrnoc/membfunc.h index 89cf711ac..a6905b372 100644 --- a/coreneuron/nrnoc/membfunc.h +++ b/coreneuron/nrnoc/membfunc.h @@ -35,42 +35,41 @@ extern "C" { #include "coreneuron/nrnoc/nrnoc_ml.h" -typedef Datum *(*Pfrpdat)(void); +typedef Datum* (*Pfrpdat)(void); struct NrnThread; typedef void (*mod_alloc_t)(double*, Datum*, int); typedef void (*mod_f_t)(struct NrnThread*, Memb_list*, int); -typedef void (*pnt_receive_t)(Point_process*, double*, double); +typedef void (*pnt_receive_t)(Point_process*, int, double); typedef struct Memb_func { - mod_alloc_t alloc; - mod_f_t current; - mod_f_t jacob; - mod_f_t state; - mod_f_t initialize; - Pfri destructor; /* only for point processes */ - Symbol *sym; - int vectorized; - int thread_size_; /* how many Datum needed in Memb_list if vectorized */ - void (*thread_mem_init_)(ThreadDatum*); /* after Memb_list._thread is allocated */ - void (*thread_cleanup_)(ThreadDatum*); /* before Memb_list._thread is freed */ + mod_alloc_t alloc; + mod_f_t current; + mod_f_t jacob; + mod_f_t state; + mod_f_t initialize; + Pfri destructor; /* only for point processes */ + Symbol* sym; + int vectorized; + int thread_size_; /* how many Datum needed in Memb_list if vectorized */ + void (*thread_mem_init_)(ThreadDatum*); /* after Memb_list._thread is allocated */ + void (*thread_cleanup_)(ThreadDatum*); /* before Memb_list._thread is freed */ void (*thread_table_check_)(int, int, double*, Datum*, ThreadDatum*, void*, int); - int is_point; - void (*setdata_)(double*, Datum*); - int* dparam_semantics; /* for nrncore writing. */ + int is_point; + void (*setdata_)(double*, Datum*); + int* dparam_semantics; /* for nrncore writing. 
*/ } Memb_func; - -#define VINDEX -1 -#define CABLESECTION 1 -#define MORPHOLOGY 2 -#define CAP 3 -#define EXTRACELL 5 +#define VINDEX -1 +#define CABLESECTION 1 +#define MORPHOLOGY 2 +#define CAP 3 +#define EXTRACELL 5 #define nrnocCONST 1 #define DEP 2 -#define STATE 3 /*See init.c and cabvars.h for order of nrnocCONST, DEP, and STATE */ +#define STATE 3 /*See init.c and cabvars.h for order of nrnocCONST, DEP, and STATE */ #define BEFORE_INITIAL 0 #define AFTER_INITIAL 1 @@ -79,16 +78,20 @@ typedef struct Memb_func { #define BEFORE_STEP 4 #define BEFORE_AFTER_SIZE 5 /* 1 more than the previous */ typedef struct BAMech { - mod_f_t f; - int type; - struct BAMech* next; + mod_f_t f; + int type; + struct BAMech* next; } BAMech; extern BAMech** bamech_; +extern int nrn_ion_global_map_size; +extern double** nrn_ion_global_map; + extern Memb_func* memb_func; extern int n_memb_func; -#define NRNPOINTER 4 /* added on to list of mechanism variables.These are -pointers which connect variables from other mechanisms via the _ppval array. +#define NRNPOINTER \ + 4 /* added on to list of mechanism variables.These are \ +pointers which connect variables from other mechanisms via the _ppval array. \ */ #define _AMBIGUOUS 5 @@ -102,25 +105,75 @@ extern pnt_receive_t* pnt_receive; extern pnt_receive_t* pnt_receive_init; extern int nrn_get_mechtype(const char*); -extern int register_mech(const char** m, mod_alloc_t alloc, mod_f_t cur, mod_f_t jacob, - mod_f_t stat, mod_f_t initialize, int nrnpointerindex, int vectorized - ); -extern int point_register_mech(const char**, mod_alloc_t alloc, mod_f_t cur, - mod_f_t jacob, mod_f_t stat, mod_f_t initialize, int nrnpointerindex, - void*(*constructor)(), void(*destructor)(), int vectorized - ); -extern void nrn_cap_jacob(struct NrnThread*, Memb_list*); +extern const char* nrn_get_mechname(int); // slow. 
use memb_func[i].sym if posible +extern int register_mech(const char** m, + mod_alloc_t alloc, + mod_f_t cur, + mod_f_t jacob, + mod_f_t stat, + mod_f_t initialize, + int nrnpointerindex, + int vectorized); +extern int point_register_mech(const char**, + mod_alloc_t alloc, + mod_f_t cur, + mod_f_t jacob, + mod_f_t stat, + mod_f_t initialize, + int nrnpointerindex, + void* (*constructor)(), + void (*destructor)(), + int vectorized); +typedef void (*NetBufReceive_t)(struct NrnThread*); +extern void hoc_register_net_receive_buffering(NetBufReceive_t, int); +extern int net_buf_receive_cnt_; +extern int* net_buf_receive_type_; +extern NetBufReceive_t* net_buf_receive_; + +extern void hoc_register_net_send_buffering(int); +extern int net_buf_send_cnt_; +extern int* net_buf_send_type_; + +extern void nrn_jacob_capacitance(struct NrnThread*, Memb_list*, int); extern void nrn_writes_conc(int, int); -extern void nrn_wrote_conc(int, double*, int, int, struct NrnThread*); +#if defined(_OPENACC) +#pragma acc routine seq +#endif +extern void nrn_wrote_conc(int, double*, int, int, double**, double, int); extern void hoc_register_prop_size(int, int, int); extern void hoc_register_dparam_semantics(int type, int, const char* name); +typedef struct { + const char* name; + double* pdoub; +} DoubScal; +typedef struct { + const char* name; + double* pdoub; + int index1; +} DoubVec; +typedef struct { + const char* name; + void (*func)(void); +} VoidFunc; +extern void hoc_register_var(DoubScal*, DoubVec*, VoidFunc*); + extern void _nrn_layout_reg(int, int); extern int* nrn_mech_data_layout_; -extern void _nrn_thread_reg0(int i, void(*f)(ThreadDatum*)); -extern void _nrn_thread_reg1(int i, void(*f)(ThreadDatum*)); - -typedef void (*bbcore_read_t)(double*, int*, int*, int*, int, int, double*, Datum*, ThreadDatum*, struct NrnThread*, double); +extern void _nrn_thread_reg0(int i, void (*f)(ThreadDatum*)); +extern void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*)); + +typedef void (*bbcore_read_t)(double*, + int*, + int*, + int*, + int, + int, + double*, + Datum*, + ThreadDatum*, + struct NrnThread*, + double); extern bbcore_read_t* nrn_bbcore_read_; extern int nrn_mech_depend(int type, int* dependencies); @@ -131,11 +184,14 @@ extern void add_nrn_fornetcons(int, int); extern void add_nrn_artcell(int, int); extern void add_nrn_has_net_event(int); extern void net_event(Point_process*, double); -extern void net_send(void**, double*, Point_process*, double, double); -extern void artcell_net_send(void**, double*, Point_process*, double, double); +extern void net_send(void**, int, Point_process*, double, double); +extern void net_move(void**, Point_process*, double); +extern void artcell_net_send(void**, int, Point_process*, double, double); +// _OPENACC and/or NET_RECEIVE_BUFFERING +extern void net_sem_from_gpu(int, int, int, int, int, double, double); + extern void hoc_malchk(void); /* just a stub */ extern void* hoc_Emalloc(size_t); -extern int at_time(struct NrnThread*, double); #if defined(__cplusplus) } diff --git a/coreneuron/nrnoc/multicore.c b/coreneuron/nrnoc/multicore.c index cf980403f..6e4f64d19 100644 --- a/coreneuron/nrnoc/multicore.c +++ b/coreneuron/nrnoc/multicore.c @@ -26,6 +26,7 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include #include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/nrnpthread.h" #include "coreneuron/nrnoc/multicore.h" @@ -64,7 +65,8 @@ model structure. 
We want to use Node* as much as possible and defer the handling of v_structure_change as long as possible. */ -#define CACHELINE_ALLOC(name,type,size) name = (type*)nrn_cacheline_alloc((void**)&name, size*sizeof(type)) +#define CACHELINE_ALLOC(name, type, size) \ + name = (type*)nrn_cacheline_alloc((void**)&name, size * sizeof(type)) int nrn_nthread; NrnThread* nrn_threads; @@ -73,12 +75,6 @@ void (*nrn_mk_transfer_thread_data_)(); extern int v_structure_change; extern int diam_changed; -#if (PERMANENT || USE_PTHREAD) -static int busywait_; -static int busywait_main_; -#endif - - extern void nrn_threads_free(); extern void nrn_old_thread_save(); extern double nrn_timeus(); @@ -87,30 +83,7 @@ static int nrn_thread_parallel_; static int table_check_cnt_; static ThreadDatum* table_check_; -/* linux specfic for performance testing */ -/* eventually will be removed */ -#define BENCHMARKING 0 -#if BENCHMARKING -/* for rdtscll() */ -#include -#define BENCHDECLARE unsigned long t1; -#define BENCHBEGIN(arg) if (t_[arg] < t1_[arg] + BSIZE) {rdtscl(t1); *(t_[arg]++) = t1;} -#define BENCHADD(arg) BENCHBEGIN(arg) -#define WAIT wait_for_workers_timeit -#define CPU_MHZ 3192 -#define BSIZE 200000 -#define BS 10 -static unsigned long bcnt_, bcnt1_; -static unsigned long t1_[BS][BSIZE], *t_[BS]; -#else -#define BENCHDECLARE /**/ -#define BENCHBEGIN(arg) /**/ -#define BENCHADD(arg) /**/ -#define WAIT wait_for_workers -#define BS 0 -#endif - -#if USE_PTHREAD +#if 0 && USE_PTHREAD static void* nulljob(NrnThread* nt) { (void)nt; /* unused */ return (void*)0; @@ -123,116 +96,6 @@ int nrn_inthread_; #include //#include /* for sched_setaffinity */ -/* abort if using threads and a call to malloc is unprotected */ -#define use_malloc_hook 0 -#if use_malloc_hook -#include - -static int nrn_malloc_protected_; -static void my_init_hook(); -static void *(*old_malloc_hook) (size_t, const void*); -static void *(*old_memalign_hook) (size_t, size_t, const void*); -static void *(*old_realloc_hook) (void*, size_t, const void*); -static void (*old_free_hook) (void*, const void*); -static void *my_malloc_hook (size_t, const void*); -static void *my_memalign_hook (size_t, size_t, const void*); -static void *my_realloc_hook (void*, size_t, const void*); -static void my_free_hook (void*, const void*); -void (*__malloc_initialize_hook)(void) = my_init_hook; - -static void* my_malloc_hook(size_t size, const void* caller) { - void* result; - if (nrn_inthread_ && !nrn_malloc_protected_) { - abort(); - } - __malloc_hook = old_malloc_hook; - __memalign_hook = old_memalign_hook; - __realloc_hook = old_realloc_hook; - __free_hook = old_free_hook; - result = malloc(size); - old_malloc_hook = __malloc_hook; - old_memalign_hook = __memalign_hook; - old_realloc_hook = __realloc_hook; - old_free_hook = __free_hook; - __malloc_hook = my_malloc_hook; - __memalign_hook = my_memalign_hook; - __realloc_hook = my_realloc_hook; - __free_hook = my_free_hook; - return result; -} -static void* my_memalign_hook(size_t alignment, size_t size, const void* caller) { - void* result; - if (nrn_inthread_ && !nrn_malloc_protected_) { - abort(); - } - __malloc_hook = old_malloc_hook; - __memalign_hook = old_memalign_hook; - __realloc_hook = old_realloc_hook; - __free_hook = old_free_hook; - result = memalign(alignment, size); - old_malloc_hook = __malloc_hook; - old_memalign_hook = __memalign_hook; - old_realloc_hook = __realloc_hook; - old_free_hook = __free_hook; - __malloc_hook = my_malloc_hook; - __memalign_hook = my_memalign_hook; - __realloc_hook = 
my_realloc_hook; - __free_hook = my_free_hook; - return result; -} -static void* my_realloc_hook(void* ptr, size_t size, const void* caller) { - void* result; - if (nrn_inthread_ && !nrn_malloc_protected_) { - abort(); - } - __malloc_hook = old_malloc_hook; - __memalign_hook = old_memalign_hook; - __realloc_hook = old_realloc_hook; - __free_hook = old_free_hook; - result = realloc(ptr, size); - old_malloc_hook = __malloc_hook; - old_memalign_hook = __memalign_hook; - old_realloc_hook = __realloc_hook; - old_free_hook = __free_hook; - __malloc_hook = my_malloc_hook; - __memalign_hook = my_memalign_hook; - __realloc_hook = my_realloc_hook; - __free_hook = my_free_hook; - return result; -} -static void my_free_hook(void* ptr, const void* caller) { - if (nrn_inthread_ && !nrn_malloc_protected_) { - abort(); - } - __malloc_hook = old_malloc_hook; - __memalign_hook = old_memalign_hook; - __realloc_hook = old_realloc_hook; - __free_hook = old_free_hook; - free(ptr); - old_malloc_hook = __malloc_hook; - old_memalign_hook = __memalign_hook; - old_realloc_hook = __realloc_hook; - old_free_hook = __free_hook; - __malloc_hook = my_malloc_hook; - __memalign_hook = my_memalign_hook; - __realloc_hook = my_realloc_hook; - __free_hook = my_free_hook; -} -static void my_init_hook() { - static int installed = 0; - if (installed) { return; } - installed = 1; - old_malloc_hook = __malloc_hook; - __malloc_hook = my_malloc_hook; - old_memalign_hook = __memalign_hook; - __memalign_hook = my_memalign_hook; - old_realloc_hook = __realloc_hook; - __realloc_hook = my_realloc_hook; - old_free_hook = __free_hook; - __free_hook = my_free_hook; -} -#endif - static int interpreter_locked; static pthread_mutex_t interpreter_lock_; static pthread_mutex_t* _interpreter_lock; @@ -243,7 +106,6 @@ pthread_mutex_t* _nmodlmutex; static pthread_mutex_t nrn_malloc_mutex_; static pthread_mutex_t* _nrn_malloc_mutex; - /* when PERMANENT is 0, we avoid false warnings with helgrind, but a bit slower */ /* when 0, create/join instead of wait on condition. 
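With PERMANENT == 1 the worker threads are created once in threads_create_pthread() and parked on a per-thread condition variable between jobs; with PERMANENT == 0, send_job_to_slave() spawns a fresh pthread for every job and wait_for_workers() joins it.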
*/ #if !defined(NDEBUG) @@ -253,14 +115,20 @@ static pthread_mutex_t* _nrn_malloc_mutex; #define PERMANENT 1 #endif +#if PERMANENT +static int busywait_; +static int busywait_main_; +#endif + typedef volatile struct { - int flag; - int thread_id; - /* for nrn_solve etc.*/ - void* (*job)(NrnThread*); + int flag; + int thread_id; + /* for nrn_solve etc.*/ + void* (*job)(NrnThread*); } slave_conf_t; static pthread_t* slave_threads; + #if PERMANENT static pthread_cond_t* cond; static pthread_mutex_t* mut; @@ -268,390 +136,338 @@ static slave_conf_t* wc; #endif static void wait_for_workers() { - int i; - for (i=1; i < nrn_nthread; ++i) { + int i; + for (i = 1; i < nrn_nthread; ++i) { #if PERMANENT - if (busywait_main_) { - while (wc[i].flag != 0){;} - }else{ - pthread_mutex_lock(mut + i); - while (wc[i].flag != 0) { - pthread_cond_wait(cond + i, mut + i); - } - pthread_mutex_unlock(mut + i); - } + if (busywait_main_) { + while (wc[i].flag != 0) { + ; + } + } else { + pthread_mutex_lock(mut + i); + while (wc[i].flag != 0) { + pthread_cond_wait(cond + i, mut + i); + } + pthread_mutex_unlock(mut + i); + } #else - pthread_join(slave_threads[i], (void*)0); + pthread_join(slave_threads[i], (void*)0); + /* if CORENEURON_OPENMP is off */ + (void)busywait_; + (void)busywait_main_; + (void)nulljob; #endif - } -} - -#if BENCHMARKING -static void wait_for_workers_timeit() { - BENCHDECLARE - BENCHBEGIN(BS-2) - wait_for_workers(); - BENCHADD(BS-1) + } } -#endif static void send_job_to_slave(int i, void* (*job)(NrnThread*)) { #if PERMANENT - pthread_mutex_lock(mut + i); - wc[i].job = job; - wc[i].flag = 1; - pthread_cond_signal(cond + i); - pthread_mutex_unlock(mut + i); + pthread_mutex_lock(mut + i); + wc[i].job = job; + wc[i].flag = 1; + pthread_cond_signal(cond + i); + pthread_mutex_unlock(mut + i); #else - pthread_create(slave_threads + i, (void*)0, (void*(*)(void*))job, (void*)(nrn_threads + i)); + pthread_create(slave_threads + i, (void*)0, (void* (*)(void*))job, (void*)(nrn_threads + i)); #endif } - #if PERMANENT static void* slave_main(void* arg) { - slave_conf_t* my_wc = (slave_conf_t*)arg; - pthread_mutex_t *my_mut = mut + my_wc->thread_id; - pthread_cond_t *my_cond = cond + my_wc->thread_id; - BENCHDECLARE -#if BENCHMARKING - unsigned long* t_[BS]; - int a1, a2; - a1 = my_wc->thread_id; - a2 = my_wc->thread_id + nrn_nthread; - t_[a1] = t1_[a1]; - t_[a2] = t1_[a2]; -#endif - - for(;;) { - if (busywait_) { - while(my_wc->flag == 0) {;} - if (my_wc->flag == 1) { - BENCHBEGIN(a1) - (*my_wc->job)(nrn_threads + my_wc->thread_id); - BENCHADD(a2) - }else{ - return (void*)0; - } - my_wc->flag = 0; - pthread_cond_signal(my_cond); - }else{ - pthread_mutex_lock(my_mut); - while (my_wc->flag == 0) { - pthread_cond_wait(my_cond, my_mut); - } - pthread_mutex_unlock(my_mut); - pthread_mutex_lock(my_mut); - if (my_wc->flag == 1) { - pthread_mutex_unlock(my_mut); - BENCHBEGIN(a1) - (*my_wc->job)(nrn_threads + my_wc->thread_id); - BENCHADD(a2) - }else{ - pthread_mutex_unlock(my_mut); - return (void*)0; - } - pthread_mutex_lock(my_mut); - my_wc->flag = 0; - pthread_cond_signal(my_cond); - pthread_mutex_unlock(my_mut); - } - } - return (void*)0; + slave_conf_t* my_wc = (slave_conf_t*)arg; + pthread_mutex_t* my_mut = mut + my_wc->thread_id; + pthread_cond_t* my_cond = cond + my_wc->thread_id; + + for (;;) { + if (busywait_) { + while (my_wc->flag == 0) { + ; + } + if (my_wc->flag == 1) { + (*my_wc->job)(nrn_threads + my_wc->thread_id); + } else { + return (void*)0; + } + my_wc->flag = 0; + pthread_cond_signal(my_cond); 
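            /* Handshake used throughout this worker loop: the main thread sets
             * wc[i].flag = 1 together with the job pointer, the worker runs the
             * job, resets the flag to 0 and signals the condition variable so
             * that wait_for_workers() sees completion; a flag of -1 (set by
             * threads_free_pthread()) makes the worker return and exit. */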
+ } else { + pthread_mutex_lock(my_mut); + while (my_wc->flag == 0) { + pthread_cond_wait(my_cond, my_mut); + } + pthread_mutex_unlock(my_mut); + pthread_mutex_lock(my_mut); + if (my_wc->flag == 1) { + pthread_mutex_unlock(my_mut); + (*my_wc->job)(nrn_threads + my_wc->thread_id); + } else { + pthread_mutex_unlock(my_mut); + return (void*)0; + } + pthread_mutex_lock(my_mut); + my_wc->flag = 0; + pthread_cond_signal(my_cond); + pthread_mutex_unlock(my_mut); + } + } + return (void*)0; } #endif -static void threads_create_pthread(){ +static void threads_create_pthread() { if (nrn_nthread > 1) { #if PERMANENT - int i; - CACHELINE_ALLOC(wc, slave_conf_t, nrn_nthread); - slave_threads = (pthread_t *)emalloc(sizeof(pthread_t)*nrn_nthread); - cond = (pthread_cond_t *)emalloc(sizeof(pthread_cond_t)*nrn_nthread); - mut = (pthread_mutex_t *)emalloc(sizeof(pthread_mutex_t)*nrn_nthread); - for (i=1; i < nrn_nthread; ++i) { - wc[i].flag = 0; - wc[i].thread_id = i; - pthread_cond_init(cond + i, (void*)0); - pthread_mutex_init(mut + i, (void*)0); - pthread_create(slave_threads + i, (void*)0, slave_main, (void*)(wc+i)); - } + int i; + CACHELINE_ALLOC(wc, slave_conf_t, nrn_nthread); + slave_threads = (pthread_t*)emalloc(sizeof(pthread_t) * nrn_nthread); + cond = (pthread_cond_t*)emalloc(sizeof(pthread_cond_t) * nrn_nthread); + mut = (pthread_mutex_t*)emalloc(sizeof(pthread_mutex_t) * nrn_nthread); + for (i = 1; i < nrn_nthread; ++i) { + wc[i].flag = 0; + wc[i].thread_id = i; + pthread_cond_init(cond + i, (void*)0); + pthread_mutex_init(mut + i, (void*)0); + pthread_create(slave_threads + i, (void*)0, slave_main, (void*)(wc + i)); + } #else - slave_threads = (pthread_t *)emalloc(sizeof(pthread_t)*nrn_nthread); + slave_threads = (pthread_t*)emalloc(sizeof(pthread_t) * nrn_nthread); #endif /* PERMANENT */ - if (!_interpreter_lock) { - interpreter_locked = 0; - _interpreter_lock = &interpreter_lock_; - pthread_mutex_init(_interpreter_lock, (void*)0); - } - if (!_nmodlmutex) { - _nmodlmutex = &nmodlmutex_; - pthread_mutex_init(_nmodlmutex, (void*)0); - } - if (!_nrn_malloc_mutex) { - _nrn_malloc_mutex = &nrn_malloc_mutex_; - pthread_mutex_init(_nrn_malloc_mutex, (void*)0); - } - nrn_thread_parallel_ = 1; - }else{ - nrn_thread_parallel_ = 0; + if (!_interpreter_lock) { + interpreter_locked = 0; + _interpreter_lock = &interpreter_lock_; + pthread_mutex_init(_interpreter_lock, (void*)0); + } + if (!_nmodlmutex) { + _nmodlmutex = &nmodlmutex_; + pthread_mutex_init(_nmodlmutex, (void*)0); + } + if (!_nrn_malloc_mutex) { + _nrn_malloc_mutex = &nrn_malloc_mutex_; + pthread_mutex_init(_nrn_malloc_mutex, (void*)0); + } + nrn_thread_parallel_ = 1; + } else { + nrn_thread_parallel_ = 0; } } -static void threads_free_pthread(){ - if (slave_threads) { +static void threads_free_pthread() { + if (slave_threads) { #if PERMANENT - int i; - wait_for_workers(); - for (i=1; i < nrn_nthread; ++i) { - pthread_mutex_lock(mut + i); - wc[i].flag = -1; - pthread_cond_signal(cond + i); - pthread_mutex_unlock(mut + i); - pthread_join(slave_threads[i], (void*)0); - pthread_cond_destroy(cond + i); - pthread_mutex_destroy(mut + i); - } - free((char*)slave_threads); - free((char*)cond); - free((char*)mut); - free((char*)wc); - slave_threads = (pthread_t*)0; - cond = (pthread_cond_t*)0; - mut = (pthread_mutex_t*)0; - wc = (slave_conf_t*)0; + int i; + wait_for_workers(); + for (i = 1; i < nrn_nthread; ++i) { + pthread_mutex_lock(mut + i); + wc[i].flag = -1; + pthread_cond_signal(cond + i); + pthread_mutex_unlock(mut + i); + 
pthread_join(slave_threads[i], (void*)0); + pthread_cond_destroy(cond + i); + pthread_mutex_destroy(mut + i); + } + free((char*)slave_threads); + free((char*)cond); + free((char*)mut); + free((char*)wc); + slave_threads = (pthread_t*)0; + cond = (pthread_cond_t*)0; + mut = (pthread_mutex_t*)0; + wc = (slave_conf_t*)0; #else - free((char*)slave_threads); - slave_threads = (pthread_t*)0; + free((char*)slave_threads); + slave_threads = (pthread_t*)0; #endif /*PERMANENT*/ - } - if (_interpreter_lock) { - pthread_mutex_destroy(_interpreter_lock); - _interpreter_lock = (pthread_mutex_t*)0; - interpreter_locked = 0; - } - if (_nmodlmutex) { - pthread_mutex_destroy(_nmodlmutex); - _nmodlmutex = (pthread_mutex_t*)0; - } - if (_nrn_malloc_mutex) { - pthread_mutex_destroy(_nrn_malloc_mutex); - _nrn_malloc_mutex = (pthread_mutex_t*)0; - } - nrn_thread_parallel_ = 0; + } + if (_interpreter_lock) { + pthread_mutex_destroy(_interpreter_lock); + _interpreter_lock = (pthread_mutex_t*)0; + interpreter_locked = 0; + } + if (_nmodlmutex) { + pthread_mutex_destroy(_nmodlmutex); + _nmodlmutex = (pthread_mutex_t*)0; + } + if (_nrn_malloc_mutex) { + pthread_mutex_destroy(_nrn_malloc_mutex); + _nrn_malloc_mutex = (pthread_mutex_t*)0; + } + nrn_thread_parallel_ = 0; } -#else /* USE_PTHREAD */ +#else /* USE_PTHREAD */ - -static void threads_create_pthread(){ - nrn_thread_parallel_ = 0; +static void threads_create_pthread() { + nrn_thread_parallel_ = 0; } -static void threads_free_pthread(){ - nrn_thread_parallel_ = 0; +static void threads_free_pthread() { + nrn_thread_parallel_ = 0; } #endif /* !USE_PTHREAD */ -void nrn_thread_stat() { -#if BENCHMARKING - FILE* f; - long i, j, n; - char buf[50]; - sprintf(buf, "bench.%d.dat", nrnmpi_myid); - f = fopen(buf, "w"); - n = (t_[0] - t1_[0]); - for (i=1; i < nrn_nthread; ++i) { - t_[i] = t1_[i] + n; - t_[i+nrn_nthread] = t1_[i+nrn_nthread] + n; - } - n = 0; - for (i=0; i < BS; ++i) { - n += t_[i] - t1_[i]; - } - fprintf(f, "%ld\n", n); - n = 0; - for (j=0; j < BS; ++j) { - n = t_[j] - t1_[j]; - for (i=0; i < n; ++i) { - fprintf(f, "%ld %d\n", t1_[j][i], j*nrnmpi_numprocs + nrnmpi_myid); - } - } - fclose(f); -#endif /*BENCHMARKING*/ -} - void nrn_threads_create(int n, int parallel) { int i, j; - NrnThread* nt; - if (nrn_nthread != n) { -/*printf("sizeof(NrnThread)=%d sizeof(Memb_list)=%d\n", sizeof(NrnThread), sizeof(Memb_list));*/ -#if BENCHMARKING -#endif - nrn_threads = (NrnThread*)0; - nrn_nthread = n; - if (n > 0) { - CACHELINE_ALLOC(nrn_threads, NrnThread, n); -#if BENCHMARKING - for (i=0; i < BS; ++i) { - t_[i] = t1_[i]; - } -#endif - for (i=0; i < n; ++i) { - nt = nrn_threads + i; - nt->_t = 0.; - nt->_dt = -1e9; - nt->id = i; - nt->_stop_stepping = 0; - nt->n_vecplay = 0; - nt->_vecplay = NULL; - nt->tml = (NrnThreadMembList*)0; - nt->_ml_list = (Memb_list**)0; - nt->pntprocs = (Point_process*)0; - nt->presyns = (PreSyn*)0; - nt->netcons = (NetCon*)0; - nt->weights = (double*)0; - nt->n_pntproc = nt->n_presyn = nt->n_netcon = 0; - nt->n_weight = 0; - nt->_ndata = nt->_nidata = nt->_nvdata = 0; - nt->_data = (double*)0; - nt->_idata = (int*)0; - nt->_vdata = (void**)0; - nt->ncell = 0; - nt->end = 0; - for (j=0; j < BEFORE_AFTER_SIZE; ++j) { - nt->tbl[j] = (NrnThreadBAList*)0; - } - nt->_actual_rhs = 0; - nt->_actual_d = 0; - nt->_actual_a = 0; - nt->_actual_b = 0; - nt->_actual_v = 0; - nt->_actual_area = 0; - nt->_v_parent_index = 0; - nt->_shadow_rhs = 0; - nt->_shadow_d = 0; - nt->_ecell_memb_list = 0; - nt->_sp13mat = 0; - nt->_ctime = 0.0; - } - } - 
v_structure_change = 1; - diam_changed = 1; - } - if (nrn_thread_parallel_ != parallel) { - threads_free_pthread(); - if (parallel) { - threads_create_pthread(); - } - } - /*printf("nrn_threads_create %d %d\n", nrn_nthread, nrn_thread_parallel_);*/ + NrnThread* nt; + if (nrn_nthread != n) { + /*printf("sizeof(NrnThread)=%d sizeof(Memb_list)=%d\n", sizeof(NrnThread), sizeof(Memb_list));*/ + + nrn_threads = (NrnThread*)0; + nrn_nthread = n; + if (n > 0) { + CACHELINE_ALLOC(nrn_threads, NrnThread, n); + + for (i = 0; i < n; ++i) { + nt = nrn_threads + i; + nt->_t = 0.; + nt->_dt = -1e9; + nt->id = i; + nt->_stop_stepping = 0; + nt->n_vecplay = 0; + nt->_vecplay = NULL; + nt->tml = (NrnThreadMembList*)0; + nt->_ml_list = (Memb_list**)0; + nt->pntprocs = (Point_process*)0; + nt->presyns = (PreSyn*)0; + nt->presyns_helper = NULL; + nt->pnt2presyn_ix = (int**)0; + nt->netcons = (NetCon*)0; + nt->weights = (double*)0; + nt->n_pntproc = nt->n_presyn = nt->n_netcon = 0; + nt->n_weight = 0; + nt->_ndata = nt->_nidata = nt->_nvdata = 0; + nt->_data = (double*)0; + nt->_idata = (int*)0; + nt->_vdata = (void**)0; + nt->ncell = 0; + nt->end = 0; + for (j = 0; j < BEFORE_AFTER_SIZE; ++j) { + nt->tbl[j] = (NrnThreadBAList*)0; + } + nt->_actual_rhs = 0; + nt->_actual_d = 0; + nt->_actual_a = 0; + nt->_actual_b = 0; + nt->_actual_v = 0; + nt->_actual_area = 0; + nt->_v_parent_index = 0; + nt->_permute = 0; + nt->_shadow_rhs = 0; + nt->_shadow_d = 0; + nt->_ecell_memb_list = 0; + nt->_sp13mat = 0; + nt->_ctime = 0.0; + + nt->_net_send_buffer_size = 0; + nt->_net_send_buffer = (int*)0; + nt->_net_send_buffer_cnt = 0; + nt->mapping = NULL; + } + } + v_structure_change = 1; + diam_changed = 1; + } + if (nrn_thread_parallel_ != parallel) { + threads_free_pthread(); + if (parallel) { + threads_create_pthread(); + } + } + /*printf("nrn_threads_create %d %d\n", nrn_nthread, nrn_thread_parallel_);*/ } void nrn_threads_free() { - if(nrn_nthread) - { - threads_free_pthread(); - free((char*)nrn_threads); - nrn_threads = 0; - nrn_nthread = 0; - } + if (nrn_nthread) { + threads_free_pthread(); + free((void*)nrn_threads); + nrn_threads = 0; + nrn_nthread = 0; + } } void nrn_mk_table_check() { - int i, id, index; - int* ix; - if (table_check_) { - free((void*)table_check_); - table_check_ = (ThreadDatum*)0; - } + int i, id, index; + int* ix; + if (table_check_) { + free((void*)table_check_); + table_check_ = (ThreadDatum*)0; + } /// Allocate int array of size of mechanism types - ix = (int*)emalloc(n_memb_func*sizeof(int)); - for (i=0; i < n_memb_func; ++i) { - ix[i] = -1; - } - table_check_cnt_ = 0; - for (id=0; id < nrn_nthread; ++id) { - NrnThread* nt = nrn_threads + id; - NrnThreadMembList* tml; - for (tml = nt->tml; tml; tml = tml->next) { - index = tml->index; - if (memb_func[index].thread_table_check_ && ix[index] == -1) { - ix[index] = id; - table_check_cnt_ += 2; - } - } - } - if (table_check_cnt_) { - table_check_ = (ThreadDatum*)emalloc(table_check_cnt_*sizeof(ThreadDatum)); - } - i=0; - for (id=0; id < nrn_nthread; ++id) { - NrnThread* nt = nrn_threads + id; - NrnThreadMembList* tml; - for (tml = nt->tml; tml; tml = tml->next) { - index = tml->index; - if (memb_func[index].thread_table_check_ && ix[index] == id) { - table_check_[i++].i = id; - table_check_[i++]._pvoid = (void*)tml; - } - } - } - free((void*)ix); + ix = (int*)emalloc(n_memb_func * sizeof(int)); + for (i = 0; i < n_memb_func; ++i) { + ix[i] = -1; + } + table_check_cnt_ = 0; + for (id = 0; id < nrn_nthread; ++id) { + NrnThread* nt = nrn_threads + 
id; + NrnThreadMembList* tml; + for (tml = nt->tml; tml; tml = tml->next) { + index = tml->index; + if (memb_func[index].thread_table_check_ && ix[index] == -1) { + ix[index] = id; + table_check_cnt_ += 2; + } + } + } + if (table_check_cnt_) { + table_check_ = (ThreadDatum*)emalloc(table_check_cnt_ * sizeof(ThreadDatum)); + } + i = 0; + for (id = 0; id < nrn_nthread; ++id) { + NrnThread* nt = nrn_threads + id; + NrnThreadMembList* tml; + for (tml = nt->tml; tml; tml = tml->next) { + index = tml->index; + if (memb_func[index].thread_table_check_ && ix[index] == id) { + table_check_[i++].i = id; + table_check_[i++]._pvoid = (void*)tml; + } + } + } + free((void*)ix); } void nrn_thread_table_check() { - int i; - for (i=0; i < table_check_cnt_; i += 2) { - NrnThread* nt = nrn_threads + table_check_[i].i; - NrnThreadMembList* tml = (NrnThreadMembList*)table_check_[i+1]._pvoid; - Memb_list* ml = tml->ml; - (*memb_func[tml->index].thread_table_check_)( - 0, ml->nodecount, ml->data, ml->pdata, ml->_thread, nt, tml->index - ); - } + int i; + for (i = 0; i < table_check_cnt_; i += 2) { + NrnThread* nt = nrn_threads + table_check_[i].i; + NrnThreadMembList* tml = (NrnThreadMembList*)table_check_[i + 1]._pvoid; + Memb_list* ml = tml->ml; + (*memb_func[tml->index].thread_table_check_)(0, ml->nodecount, ml->data, ml->pdata, + ml->_thread, nt, tml->index); + } } - -void nrn_multithread_job(void*(*job)(NrnThread*)) { - int i; +void nrn_multithread_job(void* (*job)(NrnThread*)) { + int i; #if defined(_OPENMP) - // default(none) removed to avoid issue with opari2 - #pragma omp parallel for private(i) \ - shared(nrn_threads, job, nrn_nthread, nrnmpi_myid) schedule(static, 1) - for(i=0; i < nrn_nthread; ++i) { - (*job)(nrn_threads + i); - } +// default(none) removed to avoid issue with opari2 + #pragma omp parallel for private(i) shared(nrn_threads, job, nrn_nthread, \ + nrnmpi_myid) schedule(static, 1) + for (i = 0; i < nrn_nthread; ++i) { + (*job)(nrn_threads + i); + } #else /* old implementation */ #if USE_PTHREAD - BENCHDECLARE - if (nrn_thread_parallel_) { - nrn_inthread_ = 1; - for (i=1; i < nrn_nthread; ++i) { - send_job_to_slave(i, job); - } - BENCHBEGIN(0) - (*job)(nrn_threads); - BENCHADD(nrn_nthread) - WAIT(); - nrn_inthread_ = 0; - }else{ /* sequential */ + if (nrn_thread_parallel_) { + nrn_inthread_ = 1; + for (i = 1; i < nrn_nthread; ++i) { + send_job_to_slave(i, job); + } + (*job)(nrn_threads); + WAIT(); + nrn_inthread_ = 0; + } else { /* sequential */ #else - { + { #endif - for (i=1; i < nrn_nthread; ++i) { - BENCHBEGIN(i) - (*job)(nrn_threads + i); - BENCHADD(i+nrn_nthread) - } - BENCHBEGIN(0) - (*job)(nrn_threads); - BENCHADD(nrn_nthread) - } + for (i = 1; i < nrn_nthread; ++i) { + (*job)(nrn_threads + i); + } + (*job)(nrn_threads); + } #endif - } diff --git a/coreneuron/nrnoc/multicore.h b/coreneuron/nrnoc/multicore.h index 5b9ac3012..8f5a8d36e 100644 --- a/coreneuron/nrnoc/multicore.h +++ b/coreneuron/nrnoc/multicore.h @@ -41,68 +41,95 @@ typedef void NetCon; typedef void PreSyn; #endif -typedef struct NrnThreadMembList{ /* patterned after CvMembList in cvodeobj.h */ - struct NrnThreadMembList* next; - struct Memb_list* ml; - int index; - int *dependencies; /* list of mechanism types that this mechanism depends on*/ +/* + Point_process._presyn, used only if its NET_RECEIVE sends a net_event, is + eliminated. Needed only by net_event function. 
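(pnttype2presyn maps a net_event-capable mechanism type to its slot in pnt2presyn_ix, and _i_instance then selects the sending instance within that type.)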
Replaced by + PreSyn* = nt->presyns + nt->pnt2presyn_ix[pnttype2presyn[pnt->_type]][pnt->_i_instance]; +*/ +extern int nrn_has_net_event_cnt_; /* how many net_event sender types are there? */ +extern int* nrn_has_net_event_; /* the types that send a net_event */ +extern int* pnttype2presyn; /* from the type, which array of pnt2presyn_ix are we talking about. */ + +typedef struct NrnThreadMembList { /* patterned after CvMembList in cvodeobj.h */ + struct NrnThreadMembList* next; + struct Memb_list* ml; + int index; + int* dependencies; /* list of mechanism types that this mechanism depends on*/ int ndependencies; /* for scheduling we need to know the dependency count */ } NrnThreadMembList; typedef struct NrnThreadBAList { - struct Memb_list* ml; /* an item in the NrnThreadMembList */ - struct BAMech* bam; - struct NrnThreadBAList* next; + struct Memb_list* ml; /* an item in the NrnThreadMembList */ + struct BAMech* bam; + struct NrnThreadBAList* next; } NrnThreadBAList; +/* for OpenACC, in order to avoid an error while update PreSyn, with virtual base + * class, we are adding helper with flag variable which could be updated on GPU + */ +typedef struct PreSynHelper { int flag_; } PreSynHelper; + typedef struct NrnThread { - double _t; - double _dt; - double cj; - - NrnThreadMembList* tml; - Memb_list** _ml_list; - Point_process* pntprocs; // synapses and artificial cells with and without gid - PreSyn* presyns; // all the output PreSyn with and without gid - NetCon* netcons; - double* weights; // size n_weight. NetCon.weight_ points into this array. - - int n_pntproc, n_presyn, n_netcon, n_weight; // only for model_size - - int ncell; /* analogous to old rootnodecount */ - int end; /* 1 + position of last in v_node array. Now v_node_count. */ - int id; /* this is nrn_threads[id] */ - int _stop_stepping; - int n_vecplay; /* number of instances of VecPlayContinuous */ - - size_t _ndata, _nidata, _nvdata; /* sizes */ - double* _data; /* all the other double* and Datum to doubles point into here*/ - int* _idata; /* all the Datum to ints index into here */ - void** _vdata; /* all the Datum to pointers index into here */ - void** _vecplay; /* array of instances of VecPlayContinuous */ - - double* _actual_rhs; - double* _actual_d; - double* _actual_a; - double* _actual_b; - double* _actual_v; - double* _actual_area; - double* _shadow_rhs; /* Not pointer into _data. Avoid race for multiple POINT_PROCESS in same compartment */ - double* _shadow_d; /* Not pointer into _data. Avoid race for multiple POINT_PROCESS in same compartment */ - int* _v_parent_index; - char* _sp13mat; /* handle to general sparse matrix */ - struct Memb_list* _ecell_memb_list; /* normally nil */ - - double _ctime; /* computation time in seconds (using nrnmpi_wtime) */ - - NrnThreadBAList* tbl[BEFORE_AFTER_SIZE]; /* wasteful since almost all empty */ + double _t; + double _dt; + double cj; + + NrnThreadMembList* tml; + Memb_list** _ml_list; + Point_process* pntprocs; // synapses and artificial cells with and without gid + PreSyn* presyns; // all the output PreSyn with and without gid + PreSynHelper* presyns_helper; + int** pnt2presyn_ix; // eliminates Point_process._presyn used only by net_event sender. + NetCon* netcons; + double* weights; // size n_weight. NetCon.weight_ points into this array. + + int n_pntproc, n_presyn, n_input_presyn, n_netcon, n_weight; // only for model_size + + int ncell; /* analogous to old rootnodecount */ + int end; /* 1 + position of last in v_node array. Now v_node_count. 
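(Loops over the non-root nodes in solve_core.c and treeset_core.c below run from ncell to end.)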
*/ + int id; /* this is nrn_threads[id] */ + int _stop_stepping; + int n_vecplay; /* number of instances of VecPlayContinuous */ + + size_t _ndata, _nidata, _nvdata; /* sizes */ + double* _data; /* all the other double* and Datum to doubles point into here*/ + int* _idata; /* all the Datum to ints index into here */ + void** _vdata; /* all the Datum to pointers index into here */ + void** _vecplay; /* array of instances of VecPlayContinuous */ + + double* _actual_rhs; + double* _actual_d; + double* _actual_a; + double* _actual_b; + double* _actual_v; + double* _actual_area; + double* _shadow_rhs; /* Not pointer into _data. Avoid race for multiple POINT_PROCESS in same + compartment */ + double* _shadow_d; /* Not pointer into _data. Avoid race for multiple POINT_PROCESS in same + compartment */ + int* _v_parent_index; + int* _permute; + char* _sp13mat; /* handle to general sparse matrix */ + struct Memb_list* _ecell_memb_list; /* normally nil */ + + double _ctime; /* computation time in seconds (using nrnmpi_wtime) */ + + NrnThreadBAList* tbl[BEFORE_AFTER_SIZE]; /* wasteful since almost all empty */ + + int shadow_rhs_cnt; /* added to facilitate the NrnThread transfer to GPU */ + int compute_gpu; /* define whether to compute with gpus */ + int stream_id; /* define where the kernel will be launched on GPU stream */ + int _net_send_buffer_size; + int _net_send_buffer_cnt; + int* _net_send_buffer; + void* mapping; /* section to segment mapping information */ } NrnThread; extern void nrn_threads_create(int n, int parallel); extern int nrn_nthread; extern NrnThread* nrn_threads; -extern void nrn_multithread_job(void*(*)(NrnThread*)); +extern void nrn_multithread_job(void* (*)(NrnThread*)); extern void nrn_thread_table_check(void); extern void nrn_threads_free(void); diff --git a/coreneuron/nrnoc/nrnoc_aux.c b/coreneuron/nrnoc/nrnoc_aux.c index ee88ed2fe..9484b134c 100644 --- a/coreneuron/nrnoc/nrnoc_aux.c +++ b/coreneuron/nrnoc/nrnoc_aux.c @@ -42,84 +42,111 @@ int diam_changed; int hoc_errno_count; char* pnt_name(Point_process* pnt) { - return memb_func[pnt->_type].sym; + return memb_func[pnt->_type].sym; } void nrn_exit(int err) { - nrnmpi_finalize(); - exit(err); +#if NRNMPI + nrnmpi_finalize(); +#endif + exit(err); } void hoc_execerror(const char* s1, const char* s2) { - printf("error: %s %s\n", s1, s2?s2:""); - abort(); + printf("error: %s %s\n", s1, s2 ? s2 : ""); + abort(); } void hoc_warning(const char* s1, const char* s2) { - printf("warning: %s %s\n", s1, s2?s2:""); + printf("warning: %s %s\n", s1, s2 ? 
s2 : ""); } double* makevector(size_t size) { return (double*)ecalloc(size, sizeof(char)); } +void freevector(double* p) { + if (p) { + free(p); + } +} + +double** makematrix(size_t nrows, size_t ncols) { + size_t i; + double** matrix; + matrix = (double**)emalloc(nrows * sizeof(double*)); + *matrix = (double*)emalloc(nrows * ncols * sizeof(double)); + for (i = 1; i < nrows; i++) + matrix[i] = matrix[i - 1] + ncols; + return (matrix); +} + +void freematrix(double** matrix) { + if (matrix != NULL) { + free(*matrix); + free(matrix); + } +} + void* emalloc(size_t size) { - void* memptr; - memptr = malloc(size); - assert(memptr); - return memptr; + void* memptr; + memptr = malloc(size); + assert(memptr); + return memptr; } /* some user mod files may use this in VERBATIM */ -void* hoc_Emalloc(size_t size) { return emalloc(size);} -void hoc_malchk(void) {} +void* hoc_Emalloc(size_t size) { + return emalloc(size); +} +void hoc_malchk(void) { +} void* ecalloc(size_t n, size_t size) { - void* p; - if (n == 0) { return (void*)0; } - p = calloc(n, size); - assert(p); - return p; + void* p; + if (n == 0) { + return (void*)0; + } + p = calloc(n, size); + assert(p); + return p; } void* erealloc(void* ptr, size_t size) { - void* p; - if (!ptr) { - return emalloc(size); - } - p = realloc(ptr, size); - assert(p); - return p; + void* p; + if (!ptr) { + return emalloc(size); + } + p = realloc(ptr, size); + assert(p); + return p; } void* nrn_cacheline_alloc(void** memptr, size_t size) { #if HAVE_MEMALIGN - if (posix_memalign(memptr, 64, size) != 0) { - fprintf(stderr, "posix_memalign not working\n"); - assert(0); - } + if (posix_memalign(memptr, 64, size) != 0) { + fprintf(stderr, "posix_memalign not working\n"); + assert(0); + } #else *memptr = emalloc(size); #endif - return *memptr; + return *memptr; } - /* used by nmodl and other c, c++ code */ -double hoc_Exp(double x) -{ - if (x < -700.) { - return 0.; - }else if (x > 700) { - errno = ERANGE; - if (++hoc_errno_count < MAXERRCOUNT) { - fprintf(stderr, "exp(%g) out of range, returning exp(700)\n", x); - } - if (hoc_errno_count == MAXERRCOUNT) { - fprintf(stderr, "No more errno warnings during this execution\n"); - } - return exp(700.); +double hoc_Exp(double x) { + if (x < -700.) 
{ + return 0.; + } else if (x > 700) { + errno = ERANGE; + if (++hoc_errno_count < MAXERRCOUNT) { + fprintf(stderr, "exp(%g) out of range, returning exp(700)\n", x); + } + if (hoc_errno_count == MAXERRCOUNT) { + fprintf(stderr, "No more errno warnings during this execution\n"); } - return exp(x); + return exp(700.); + } + return exp(x); } - diff --git a/coreneuron/nrnoc/nrnoc_decl.h b/coreneuron/nrnoc/nrnoc_decl.h index 928c0c6fe..3ffabae98 100644 --- a/coreneuron/nrnoc/nrnoc_decl.h +++ b/coreneuron/nrnoc/nrnoc_decl.h @@ -44,6 +44,9 @@ extern void nrn_deliver_events(NrnThread*); extern void init_net_events(void); extern void nrn_play_init(void); extern void fixed_play_continuous(NrnThread*); +extern int use_solve_interleave; +extern int* nrn_index_sort(int* values, int n); +extern void solve_interleaved(int ith); extern void nrn_solve_minimal(NrnThread*); extern void second_order_cur(NrnThread*); extern void nrn_ba(NrnThread*, int); @@ -61,10 +64,17 @@ extern void alloc_mech(int); extern void ion_reg(const char*, double); extern void nrn_mk_table_check(void); extern void initnrn(void); -extern void nrn_capacity_current(NrnThread*, Memb_list*); extern int prcellstate(int gid, const char* suffix); extern int nrn_i_layout(int i, int cnt, int j, int size, int layout); +extern int nrn_have_gaps; +extern void nrnthread_v_transfer(NrnThread*); +extern void nrnmpi_v_transfer(); + +extern void nrn_fatal_error(const char* msg); +extern void nrn_abort(int errcode); +extern double nrn_wtime(void); + #if defined(__cplusplus) } #endif diff --git a/coreneuron/nrnoc/nrnoc_ml.h b/coreneuron/nrnoc/nrnoc_ml.h index 4cede03f1..f13c787ec 100644 --- a/coreneuron/nrnoc/nrnoc_ml.h +++ b/coreneuron/nrnoc/nrnoc_ml.h @@ -31,27 +31,64 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrnconf.h" +#if PG_ACC_BUGS +typedef struct ThreadDatum { + int i; + double* pval; + void* _pvoid; +} ThreadDatum; +#else typedef union ThreadDatum { - double val; - int i; - double* pval; - void* _pvoid; -}ThreadDatum; + double val; + int i; + double* pval; + void* _pvoid; +} ThreadDatum; +#endif + +typedef struct NetReceiveBuffer_t { + int* _displ; /* _displ_cnt + 1 of these */ + int* _nrb_index; /* _cnt of these (order of increasing _pnt_index) */ + + int* _pnt_index; + int* _weight_index; + double* _nrb_t; + double* _nrb_flag; + int _cnt; + int _displ_cnt; /* number of unique _pnt_index */ + int _size; /* capacity */ + int _pnt_offset; +} NetReceiveBuffer_t; + +typedef struct NetSendBuffer_t { + int* _sendtype; // net_send, net_event, net_move + int* _vdata_index; + int* _pnt_index; + int* _weight_index; + double* _nsb_t; + double* _nsb_flag; + int _cnt; + int _size; /* capacity */ + int reallocated; /* if buffer resized/reallocated, needs to be copy to cpu */ +} NetSendBuffer_t; typedef struct Memb_list { #if CACHEVEC != 0 - /* nodeindices contains all nodes this extension is responsible for, - * ordered according to the matrix. This allows to access the matrix - * directly via the nrn_actual_* arrays instead of accessing it in the - * order of insertion and via the node-structure, making it more - * cache-efficient */ - int *nodeindices; + /* nodeindices contains all nodes this extension is responsible for, + * ordered according to the matrix. 
This allows to access the matrix + * directly via the nrn_actual_* arrays instead of accessing it in the + * order of insertion and via the node-structure, making it more + * cache-efficient */ + int* nodeindices; #endif /* CACHEVEC */ - double* data; - Datum* pdata; - ThreadDatum* _thread; /* thread specific data (when static is no good) */ - int nodecount; /* actual node count */ - int _nodecount_padded; + int* _permute; + double* data; + Datum* pdata; + ThreadDatum* _thread; /* thread specific data (when static is no good) */ + NetReceiveBuffer_t* _net_receive_buffer; + NetSendBuffer_t* _net_send_buffer; + int nodecount; /* actual node count */ + int _nodecount_padded; } Memb_list; #endif diff --git a/coreneuron/nrnoc/nrntimeout.c b/coreneuron/nrnoc/nrntimeout.c index faf71103f..628121854 100644 --- a/coreneuron/nrnoc/nrntimeout.c +++ b/coreneuron/nrnoc/nrntimeout.c @@ -36,13 +36,14 @@ THE POSSIBILITY OF SUCH DAMAGE. #include /* if you are using any sampling based profiling tool, -setitimer will conflict with profiler. In that case, -user can disable setitimer which is just safety for +setitimer will conflict with profiler. In that case, +user can disable setitimer which is just safety for deadlock situations */ #ifdef DISABLE_TIMEOUT -void nrn_timeout(int seconds) { } +void nrn_timeout(int seconds) { +} #else @@ -52,45 +53,46 @@ static struct itimerval value; static struct sigaction act, oact; static void timed_out(int sig) { - (void)sig; /* unused */ + (void)sig; /* unused */ #if 0 printf("timed_out told=%g t=%g\n", told, t); #endif - if (nrn_threads->_t == told) { /* nothing has been accomplished since last signal*/ - printf("nrn_timeout t=%g\n", nrn_threads->_t); - if (nrntimeout_call) { - (*nrntimeout_call)(); - } - nrnmpi_abort(0); - } - told = nrn_threads->_t; + if (nrn_threads->_t == told) { /* nothing has been accomplished since last signal*/ + printf("nrn_timeout t=%g\n", nrn_threads->_t); + if (nrntimeout_call) { + (*nrntimeout_call)(); + } + nrn_abort(0); + } + told = nrn_threads->_t; } void nrn_timeout(int seconds) { - if (nrnmpi_myid != 0) { return; } + if (nrnmpi_myid != 0) { + return; + } #if 0 printf("nrn_timeout %d\n", seconds); #endif - if (seconds) { - told = nrn_threads->_t; - act.sa_handler = timed_out; - act.sa_flags = SA_RESTART; - if(sigaction(SIGALRM, &act, &oact)) { - printf("sigaction failed\n"); - nrnmpi_abort(0); - } - }else{ - sigaction(SIGALRM, &oact, (struct sigaction*)0); - } - value.it_interval.tv_sec = seconds; - value.it_interval.tv_usec = 0; - value.it_value.tv_sec = seconds; - value.it_value.tv_usec = 0; - if(setitimer(ITIMER_REAL, &value, (struct itimerval*)0)) { - printf("setitimer failed\n"); - nrnmpi_abort(0); - } - + if (seconds) { + told = nrn_threads->_t; + act.sa_handler = timed_out; + act.sa_flags = SA_RESTART; + if (sigaction(SIGALRM, &act, &oact)) { + printf("sigaction failed\n"); + nrn_abort(0); + } + } else { + sigaction(SIGALRM, &oact, (struct sigaction*)0); + } + value.it_interval.tv_sec = seconds; + value.it_interval.tv_usec = 0; + value.it_value.tv_sec = seconds; + value.it_value.tv_usec = 0; + if (setitimer(ITIMER_REAL, &value, (struct itimerval*)0)) { + printf("setitimer failed\n"); + nrn_abort(0); + } } #endif /* DISABLE_TIMEOUT */ diff --git a/coreneuron/nrnoc/register_mech.c b/coreneuron/nrnoc/register_mech.c index 6c074febd..e30dfc917 100644 --- a/coreneuron/nrnoc/register_mech.c +++ b/coreneuron/nrnoc/register_mech.c @@ -27,14 +27,27 @@ THE POSSIBILITY OF SUCH DAMAGE. 
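/*
 * Illustrative sketch (editorial, not part of the patch): how a mod2c-generated
 * file is expected to drive the registration API that the hunks below implement.
 * Only point_register_mech, nrn_get_mechtype, hoc_register_prop_size and
 * hoc_register_dparam_semantics are taken from the declarations in this patch;
 * the mechanism name, the sizes and the omitted callbacks are made-up placeholders.
 */
void _ExpSynExample_reg(void) {
    /* m[1] is the mechanism name; register_mech() looks it up with nrn_get_mechtype() */
    static const char* m[] = {"7.7.0", "ExpSynExample", 0};
    int mechtype;
    /* a real generated file passes its alloc/current/jacob/state/init callbacks
       instead of the null pointers used here */
    point_register_mech(m, 0, 0, 0, 0, 0, /*nrnpointerindex*/ -1,
                        /*constructor*/ 0, /*destructor*/ 0, /*vectorized*/ 1);
    mechtype = nrn_get_mechtype(m[1]);
    if (mechtype == -1)
        return; /* mechanism not present in this model's .dat files */
    hoc_register_prop_size(mechtype, /*psize*/ 4, /*dpsize*/ 2);
    hoc_register_dparam_semantics(mechtype, 0, "area");
    hoc_register_dparam_semantics(mechtype, 1, "pntproc");
}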
*/ #include +#include #include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/multicore.h" #include "coreneuron/nrnoc/membdef.h" #include "coreneuron/nrnoc/nrnoc_decl.h" #include "coreneuron/nrnmpi/nrnmpi.h" -int secondorder=0; +int secondorder = 0; double t, dt, celsius; +#if defined(PG_ACC_BUGS) + #pragma acc declare copyin(secondorder) + #pragma acc declare copyin(celsius) +#endif +int rev_dt; + +int net_buf_receive_cnt_; +int* net_buf_receive_type_; +NetBufReceive_t* net_buf_receive_; + +int net_buf_send_cnt_; +int* net_buf_send_type_; static int memb_func_size_; static int pointtype = 1; /* starts at 1 since 0 means not point in pnt_map*/ @@ -43,16 +56,17 @@ int n_memb_func; Memb_func* memb_func; Memb_list* memb_list; Point_process** point_process; -char* pnt_map; /* so prop_free can know its a point mech*/ +char* pnt_map; /* so prop_free can know its a point mech*/ typedef void (*Pfrv)(); BAMech** bamech_; -pnt_receive_t* pnt_receive; /* for synaptic events. */ +pnt_receive_t* pnt_receive; /* for synaptic events. */ pnt_receive_t* pnt_receive_init; short* pnt_receive_size; - /* values are type numbers of mechanisms which do net_send call */ +/* values are type numbers of mechanisms which do net_send call */ int nrn_has_net_event_cnt_; int* nrn_has_net_event_; +int* pnttype2presyn; /* inverse of nrn_has_net_event_ */ int* nrn_prop_param_size_; int* nrn_prop_dparam_size_; int* nrn_mech_data_layout_; /* 1 AoS (default), >1 AoSoA, 0 SoA */ @@ -68,193 +82,228 @@ static void ion_write_depend(int type, int etype); bbcore_read_t* nrn_bbcore_read_; void hoc_reg_bbcore_read(int type, bbcore_read_t f) { - if (type == -1) - return; + if (type == -1) + return; - nrn_bbcore_read_[type] = f; + nrn_bbcore_read_[type] = f; } -void add_nrn_has_net_event(int type) { - if (type == -1) - return; +void add_nrn_has_net_event(int type) { + if (type == -1) + return; - ++nrn_has_net_event_cnt_; - nrn_has_net_event_ = (int*)erealloc(nrn_has_net_event_, nrn_has_net_event_cnt_*sizeof(int)); - nrn_has_net_event_[nrn_has_net_event_cnt_ - 1] = type; + ++nrn_has_net_event_cnt_; + nrn_has_net_event_ = (int*)erealloc(nrn_has_net_event_, nrn_has_net_event_cnt_ * sizeof(int)); + nrn_has_net_event_[nrn_has_net_event_cnt_ - 1] = type; } /* values are type numbers of mechanisms which have FOR_NETCONS statement */ -int nrn_fornetcon_cnt_; /* how many models have a FOR_NETCONS statement */ -int* nrn_fornetcon_type_; /* what are the type numbers */ +int nrn_fornetcon_cnt_; /* how many models have a FOR_NETCONS statement */ +int* nrn_fornetcon_type_; /* what are the type numbers */ int* nrn_fornetcon_index_; /* what is the index into the ppvar array */ void add_nrn_fornetcons(int type, int indx) { - int i; + int i; - if (type == -1) - return; + if (type == -1) + return; - i = nrn_fornetcon_cnt_++; - nrn_fornetcon_type_ = (int*)erealloc(nrn_fornetcon_type_, (i+1)*sizeof(int)); - nrn_fornetcon_index_ = (int*)erealloc(nrn_fornetcon_index_, (i+1)*sizeof(int)); - nrn_fornetcon_type_[i] = type; - nrn_fornetcon_index_[i]= indx; + i = nrn_fornetcon_cnt_++; + nrn_fornetcon_type_ = (int*)erealloc(nrn_fornetcon_type_, (i + 1) * sizeof(int)); + nrn_fornetcon_index_ = (int*)erealloc(nrn_fornetcon_index_, (i + 1) * sizeof(int)); + nrn_fornetcon_type_[i] = type; + nrn_fornetcon_index_[i] = indx; } /* array is parallel to memb_func. 
All are 0 except 1 for ARTIFICIAL_CELL */ short* nrn_is_artificial_; short* nrn_artcell_qindex_; -void add_nrn_artcell(int type, int qi){ - if (type == -1) - return; +void add_nrn_artcell(int type, int qi) { + if (type == -1) + return; - nrn_is_artificial_[type] = 1; - nrn_artcell_qindex_[type] = qi; + nrn_is_artificial_[type] = 1; + nrn_artcell_qindex_[type] = qi; } - void alloc_mech(int n) { - memb_func_size_ = n; - n_memb_func = n; - memb_func = (Memb_func*)ecalloc(memb_func_size_, sizeof(Memb_func)); - memb_list = (Memb_list*)ecalloc(memb_func_size_, sizeof(Memb_list)); - point_process = (Point_process**)ecalloc(memb_func_size_, sizeof(Point_process*)); - pnt_map = (char*)ecalloc(memb_func_size_, sizeof(char)); - pnt_receive = (pnt_receive_t*)ecalloc(memb_func_size_, sizeof(pnt_receive_t)); - pnt_receive_init = (pnt_receive_t*)ecalloc(memb_func_size_, sizeof(pnt_receive_t)); - pnt_receive_size = (short*)ecalloc(memb_func_size_, sizeof(short)); - nrn_is_artificial_ = (short*)ecalloc(memb_func_size_, sizeof(short)); - nrn_artcell_qindex_ = (short*)ecalloc(memb_func_size_, sizeof(short)); - nrn_prop_param_size_ = (int*)ecalloc(memb_func_size_, sizeof(int)); - nrn_prop_dparam_size_ = (int*)ecalloc(memb_func_size_, sizeof(int)); - nrn_mech_data_layout_ = (int*)ecalloc(memb_func_size_, sizeof(int)); - {int i; for (i=0; i < memb_func_size_; ++i) { nrn_mech_data_layout_[i] = 1; }} - nrn_dparam_ptr_start_ = (int*)ecalloc(memb_func_size_, sizeof(int)); - nrn_dparam_ptr_end_ = (int*)ecalloc(memb_func_size_, sizeof(int)); - nrn_bbcore_read_ = (bbcore_read_t*)ecalloc(memb_func_size_, sizeof(bbcore_read_t)); - bamech_ = (BAMech**)ecalloc(BEFORE_AFTER_SIZE, sizeof(BAMech*)); + memb_func_size_ = n; + n_memb_func = n; + memb_func = (Memb_func*)ecalloc(memb_func_size_, sizeof(Memb_func)); + memb_list = (Memb_list*)ecalloc(memb_func_size_, sizeof(Memb_list)); + point_process = (Point_process**)ecalloc(memb_func_size_, sizeof(Point_process*)); + pnt_map = (char*)ecalloc(memb_func_size_, sizeof(char)); + pnt_receive = (pnt_receive_t*)ecalloc(memb_func_size_, sizeof(pnt_receive_t)); + pnt_receive_init = (pnt_receive_t*)ecalloc(memb_func_size_, sizeof(pnt_receive_t)); + pnt_receive_size = (short*)ecalloc(memb_func_size_, sizeof(short)); + nrn_is_artificial_ = (short*)ecalloc(memb_func_size_, sizeof(short)); + nrn_artcell_qindex_ = (short*)ecalloc(memb_func_size_, sizeof(short)); + nrn_prop_param_size_ = (int*)ecalloc(memb_func_size_, sizeof(int)); + nrn_prop_dparam_size_ = (int*)ecalloc(memb_func_size_, sizeof(int)); + nrn_mech_data_layout_ = (int*)ecalloc(memb_func_size_, sizeof(int)); + { + int i; + for (i = 0; i < memb_func_size_; ++i) { + nrn_mech_data_layout_[i] = 1; + } + } + nrn_dparam_ptr_start_ = (int*)ecalloc(memb_func_size_, sizeof(int)); + nrn_dparam_ptr_end_ = (int*)ecalloc(memb_func_size_, sizeof(int)); + nrn_bbcore_read_ = (bbcore_read_t*)ecalloc(memb_func_size_, sizeof(bbcore_read_t)); + bamech_ = (BAMech**)ecalloc(BEFORE_AFTER_SIZE, sizeof(BAMech*)); } void initnrn() { - secondorder = DEF_secondorder; /* >0 means crank-nicolson. 2 means currents - adjusted to t+dt/2 */ - t = 0; /* msec */ - dt = DEF_dt; /* msec */ - celsius = DEF_celsius; /* degrees celsius */ + secondorder = DEF_secondorder; /* >0 means crank-nicolson. 
2 means currents + adjusted to t+dt/2 */ + t = 0.; /* msec */ + dt = DEF_dt; /* msec */ + rev_dt = (int)(DEF_rev_dt); /* 1/msec */ + celsius = DEF_celsius; /* degrees celsius */ } /* if vectorized then thread_data_size added to it */ -int register_mech(const char** m, mod_alloc_t alloc, mod_f_t cur, mod_f_t jacob, - mod_f_t stat, mod_f_t initialize, int nrnpointerindex, int vectorized - ) { - int type; /* 0 unused, 1 for cable section */ - (void)nrnpointerindex; /*unused*/ - - type = nrn_get_mechtype(m[1]); - - // No mechanism in the .dat files - if (type == -1) - return type; - - assert(type); +int register_mech(const char** m, + mod_alloc_t alloc, + mod_f_t cur, + mod_f_t jacob, + mod_f_t stat, + mod_f_t initialize, + int nrnpointerindex, + int vectorized) { + int type; /* 0 unused, 1 for cable section */ + (void)nrnpointerindex; /*unused*/ + + type = nrn_get_mechtype(m[1]); + + // No mechanism in the .dat files + if (type == -1) + return type; + + assert(type); #ifdef DEBUG - printf("register_mech %s %d\n", m[1], type); + printf("register_mech %s %d\n", m[1], type); #endif - nrn_dparam_ptr_start_[type] = 0; /* fill in later */ - nrn_dparam_ptr_end_[type] = 0; /* fill in later */ - memb_func[type].sym = (char*)emalloc(strlen(m[1])+1); - strcpy(memb_func[type].sym, m[1]); - memb_func[type].current = cur; - memb_func[type].jacob = jacob; - memb_func[type].alloc = alloc; - memb_func[type].state = stat; - memb_func[type].initialize = initialize; - memb_func[type].destructor = (Pfri)0; + nrn_dparam_ptr_start_[type] = 0; /* fill in later */ + nrn_dparam_ptr_end_[type] = 0; /* fill in later */ + if (memb_func[type].sym) { + assert(strcmp(memb_func[type].sym, m[1]) == 0); + } else { + memb_func[type].sym = (char*)emalloc(strlen(m[1]) + 1); + strcpy(memb_func[type].sym, m[1]); + } + memb_func[type].current = cur; + memb_func[type].jacob = jacob; + memb_func[type].alloc = alloc; + memb_func[type].state = stat; + memb_func[type].initialize = initialize; + memb_func[type].destructor = (Pfri)0; #if VECTORIZE - memb_func[type].vectorized = vectorized ? 1:0; - memb_func[type].thread_size_ = vectorized ? (vectorized - 1) : 0; - memb_func[type].thread_mem_init_ = (void*)0; - memb_func[type].thread_cleanup_ = (void*)0; - memb_func[type].thread_table_check_ = (void*)0; - memb_func[type].is_point = 0; - memb_func[type].setdata_ = (void*)0; - memb_func[type].dparam_semantics = (int*)0; - memb_list[type].nodecount = 0; - memb_list[type]._thread = (ThreadDatum*)0; + memb_func[type].vectorized = vectorized ? 1 : 0; + memb_func[type].thread_size_ = vectorized ? 
(vectorized - 1) : 0; + memb_func[type].thread_mem_init_ = (void*)0; + memb_func[type].thread_cleanup_ = (void*)0; + memb_func[type].thread_table_check_ = (void*)0; + memb_func[type].is_point = 0; + memb_func[type].setdata_ = (void*)0; + memb_func[type].dparam_semantics = (int*)0; + memb_list[type].nodecount = 0; + memb_list[type]._thread = (ThreadDatum*)0; #endif - return type; + return type; } void nrn_writes_conc(int type, int unused) { - static int lastion = EXTRACELL+1; - (void)unused; /* unused */ - if (type == -1) - return; + static int lastion = EXTRACELL + 1; + (void)unused; /* unused */ + if (type == -1) + return; #if 0 printf("%s reordered from %d to %d\n", memb_func[type].sym->name, type, lastion); #endif - if (nrn_is_ion(type)) { - ++lastion; - } + if (nrn_is_ion(type)) { + ++lastion; + } } void _nrn_layout_reg(int type, int layout) { - nrn_mech_data_layout_[type] = layout; + nrn_mech_data_layout_[type] = layout; +} + +void hoc_register_net_receive_buffering(NetBufReceive_t f, int type) { + int i = net_buf_receive_cnt_++; + net_buf_receive_type_ = + (int*)erealloc(net_buf_receive_type_, net_buf_receive_cnt_ * sizeof(int)); + net_buf_receive_ = (NetBufReceive_t*)erealloc(net_buf_receive_, + net_buf_receive_cnt_ * sizeof(NetBufReceive_t)); + net_buf_receive_type_[i] = type; + net_buf_receive_[i] = f; +} + +void hoc_register_net_send_buffering(int type) { + int i = net_buf_send_cnt_++; + net_buf_send_type_ = (int*)erealloc(net_buf_send_type_, net_buf_send_cnt_ * sizeof(int)); + net_buf_send_type_[i] = type; } void hoc_register_prop_size(int type, int psize, int dpsize) { - int pold, dpold; - if (type == -1) - return; - - pold = nrn_prop_param_size_[type]; - dpold = nrn_prop_dparam_size_[type]; - if (psize != pold || dpsize != dpold) { - printf("%s prop sizes differ psize %d %d dpsize %d %d\n", memb_func[type].sym, psize, pold, dpsize, dpold); - exit(1); - } - nrn_prop_param_size_[type] = psize; - nrn_prop_dparam_size_[type] = dpsize; - if (dpsize) { - memb_func[type].dparam_semantics = (int*)ecalloc(dpsize, sizeof(int)); - } + int pold, dpold; + if (type == -1) + return; + + pold = nrn_prop_param_size_[type]; + dpold = nrn_prop_dparam_size_[type]; + if (psize != pold || dpsize != dpold) { + printf("%s prop sizes differ psize %d %d dpsize %d %d\n", memb_func[type].sym, psize, + pold, dpsize, dpold); + printf("Error: %s is different version of MOD file than the one used by NEURON!\n", + memb_func[type].sym); + exit(1); + } + nrn_prop_param_size_[type] = psize; + nrn_prop_dparam_size_[type] = dpsize; + if (dpsize) { + memb_func[type].dparam_semantics = (int*)ecalloc(dpsize, sizeof(int)); + } } void hoc_register_dparam_semantics(int type, int ix, const char* name) { - /* needed for SoA to possibly reorder name_ion and some "pointer" pointers. 
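When the data are permuted into the SoA layout, dparam entries that index into another mechanism's data (ion variables, POINTERs) have to be remapped, so the semantic class of every dparam slot is recorded for that purpose.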
*/ - /* only interested in area, iontype, cvode_ieq, - netsend, pointer, pntproc, bbcorepointer - xx_ion and #xx_ion which will get - a semantics value of -1, -2, -3, - -4, -5, -6, -7, - type, and type+1000 respectively - */ - if (strcmp(name, "area") == 0) { - memb_func[type].dparam_semantics[ix] = -1; - }else if (strcmp(name, "iontype") == 0) { - memb_func[type].dparam_semantics[ix] = -2; - }else if (strcmp(name, "cvodeieq") == 0) { - memb_func[type].dparam_semantics[ix] = -3; - }else if (strcmp(name, "netsend") == 0) { - memb_func[type].dparam_semantics[ix] = -4; - }else if (strcmp(name, "pointer") == 0) { - memb_func[type].dparam_semantics[ix] = -5; - }else if (strcmp(name, "pntproc") == 0) { - memb_func[type].dparam_semantics[ix] = -6; - }else if (strcmp(name, "bbcorepointer") == 0) { - memb_func[type].dparam_semantics[ix] = -7; - }else{ - int etype; - int i = 0; - if (name[0] == '#') { i = 1; } - etype = nrn_get_mechtype(name+i); - memb_func[type].dparam_semantics[ix] = etype + i*1000; -/* note that if style is needed (i==1), then we are writing a concentration */ - if (i) { - ion_write_depend(type, etype); - } - } + /* needed for SoA to possibly reorder name_ion and some "pointer" pointers. */ + /* only interested in area, iontype, cvode_ieq, + netsend, pointer, pntproc, bbcorepointer + xx_ion and #xx_ion which will get + a semantics value of -1, -2, -3, + -4, -5, -6, -7, + type, and type+1000 respectively + */ + if (strcmp(name, "area") == 0) { + memb_func[type].dparam_semantics[ix] = -1; + } else if (strcmp(name, "iontype") == 0) { + memb_func[type].dparam_semantics[ix] = -2; + } else if (strcmp(name, "cvodeieq") == 0) { + memb_func[type].dparam_semantics[ix] = -3; + } else if (strcmp(name, "netsend") == 0) { + memb_func[type].dparam_semantics[ix] = -4; + } else if (strcmp(name, "pointer") == 0) { + memb_func[type].dparam_semantics[ix] = -5; + } else if (strcmp(name, "pntproc") == 0) { + memb_func[type].dparam_semantics[ix] = -6; + } else if (strcmp(name, "bbcorepointer") == 0) { + memb_func[type].dparam_semantics[ix] = -7; + } else { + int etype; + int i = 0; + if (name[0] == '#') { + i = 1; + } + etype = nrn_get_mechtype(name + i); + memb_func[type].dparam_semantics[ix] = etype + i * 1000; + /* note that if style is needed (i==1), then we are writing a concentration */ + if (i) { + ion_write_depend(type, etype); + } + } #if 0 printf("dparam semantics %s ix=%d %s %d\n", memb_func[type].sym, ix, name, memb_func[type].dparam_semantics[ix]); @@ -263,160 +312,181 @@ void hoc_register_dparam_semantics(int type, int ix, const char* name) { /* only ion type ion_write_depend_ are non-NULL */ /* and those are array of integers with first integer being array size */ -/* and remaining size-1 integers containing the mechanism types that write concentrations to that ion */ +/* and remaining size-1 integers containing the mechanism types that write concentrations to that + * ion */ static void ion_write_depend(int type, int etype) { - int size, i; - if (ion_write_depend_size_ < n_memb_func) { - ion_write_depend_ = (int**)erealloc(ion_write_depend_, n_memb_func*sizeof(int*)); - for(i = ion_write_depend_size_; i < n_memb_func; ++i) { - ion_write_depend_[i] = NULL; - } - ion_write_depend_size_ = n_memb_func; - } - size = 2; - if (ion_write_depend_[etype]) { - size = ion_write_depend_[etype][0] + 1; - } - ion_write_depend_[etype] = (int*)erealloc(ion_write_depend_[etype], size*sizeof(int)); - ion_write_depend_[etype][0] = size; - ion_write_depend_[etype][size-1] = type; + int size, i; + if 
(ion_write_depend_size_ < n_memb_func) { + ion_write_depend_ = (int**)erealloc(ion_write_depend_, n_memb_func * sizeof(int*)); + for (i = ion_write_depend_size_; i < n_memb_func; ++i) { + ion_write_depend_[i] = NULL; + } + ion_write_depend_size_ = n_memb_func; + } + size = 2; + if (ion_write_depend_[etype]) { + size = ion_write_depend_[etype][0] + 1; + } + ion_write_depend_[etype] = (int*)erealloc(ion_write_depend_[etype], size * sizeof(int)); + ion_write_depend_[etype][0] = size; + ion_write_depend_[etype][size - 1] = type; } static int depend_append(int idep, int* dependencies, int deptype, int type) { - /* append only if not already in dependencies and != type*/ - int add, i; - add = 1; - if (deptype == type) { return idep; } - for (i=0; i < idep; ++i) { - if (deptype == dependencies[i]) { - add = 0; - break; - } - } - if (add) { - dependencies[idep++] = deptype; - } - return idep; + /* append only if not already in dependencies and != type*/ + int add, i; + add = 1; + if (deptype == type) { + return idep; + } + for (i = 0; i < idep; ++i) { + if (deptype == dependencies[i]) { + add = 0; + break; + } + } + if (add) { + dependencies[idep++] = deptype; + } + return idep; } /* return list of types that this type depends on (10 should be more than enough) */ /* dependencies must be an array that is large enough to hold that array */ /* number of dependencies is returned */ int nrn_mech_depend(int type, int* dependencies) { - int i, dpsize, idep, deptype; - int* ds; - dpsize = nrn_prop_dparam_size_[type]; - ds = memb_func[type].dparam_semantics; - idep = 0; - if (ds) for (i=0; i < dpsize; ++i) { - if (ds[i] > 0 && ds[i] < 1000) { - int idepnew; - int* iwd; - deptype = ds[i]; - idepnew = depend_append(idep, dependencies, deptype, type); - iwd = ion_write_depend_ ? ion_write_depend_[deptype] : 0; - if (idepnew > idep && iwd) { - int size, j; - size = iwd[0]; - for (j=1; j < size; ++j) { - idepnew=depend_append(idepnew, dependencies, iwd[j], type); - } - } - idep = idepnew; - } - } - return idep; + int i, dpsize, idep, deptype; + int* ds; + dpsize = nrn_prop_dparam_size_[type]; + ds = memb_func[type].dparam_semantics; + idep = 0; + if (ds) + for (i = 0; i < dpsize; ++i) { + if (ds[i] > 0 && ds[i] < 1000) { + int idepnew; + int* iwd; + deptype = ds[i]; + idepnew = depend_append(idep, dependencies, deptype, type); + iwd = ion_write_depend_ ? 
ion_write_depend_[deptype] : 0; + if (idepnew > idep && iwd) { + int size, j; + size = iwd[0]; + for (j = 1; j < size; ++j) { + idepnew = depend_append(idepnew, dependencies, iwd[j], type); + } + } + idep = idepnew; + } + } + return idep; } void register_destructor(Pfri d) { - memb_func[n_memb_func - 1].destructor = d; + memb_func[n_memb_func - 1].destructor = d; } int point_reg_helper(Symbol* s2) { - int type; - type = nrn_get_mechtype(s2); + int type; + type = nrn_get_mechtype(s2); - // No mechanism in the .dat files - if (type == -1) - return type; + // No mechanism in the .dat files + if (type == -1) + return type; - pnt_map[type] = pointtype; - memb_func[type].is_point = 1; + pnt_map[type] = pointtype; + memb_func[type].is_point = 1; - return pointtype++; + return pointtype++; } int point_register_mech(const char** m, - mod_alloc_t alloc, mod_f_t cur, mod_f_t jacob, - mod_f_t stat, mod_f_t initialize, int nrnpointerindex, - void*(*constructor)(), void(*destructor)(), - int vectorized -){ - Symbol* s; - (void)constructor; (void)destructor; /* unused */ - s = (char*)m[1]; - register_mech(m, alloc, cur, jacob, stat, initialize, nrnpointerindex, vectorized); - return point_reg_helper(s); + mod_alloc_t alloc, + mod_f_t cur, + mod_f_t jacob, + mod_f_t stat, + mod_f_t initialize, + int nrnpointerindex, + void* (*constructor)(), + void (*destructor)(), + int vectorized) { + Symbol* s; + (void)constructor; + (void)destructor; /* unused */ + s = (char*)m[1]; + register_mech(m, alloc, cur, jacob, stat, initialize, nrnpointerindex, vectorized); + return point_reg_helper(s); } -void _modl_cleanup(){} +void _modl_cleanup() { +} int state_discon_allowed_; int state_discon_flag_ = 0; void state_discontinuity(int i, double* pd, double d) { - (void)i; /* unused */ - if (state_discon_allowed_ && state_discon_flag_ == 0) { - *pd = d; -/*printf("state_discontinuity t=%g pd=%lx d=%g\n", t, (long)pd, d);*/ - } + (void)i; /* unused */ + if (state_discon_allowed_ && state_discon_flag_ == 0) { + *pd = d; + /*printf("state_discontinuity t=%g pd=%lx d=%g\n", t, (long)pd, d);*/ + } } void hoc_reg_ba(int mt, mod_f_t f, int type) { - BAMech* bam; - if (type == -1) - return; - - switch (type) { /* see bablk in src/nmodl/nocpout.c */ - case 11: type = BEFORE_BREAKPOINT; break; - case 22: type = AFTER_SOLVE; break; - case 13: type = BEFORE_INITIAL; break; - case 23: type = AFTER_INITIAL; break; - case 14: type = BEFORE_STEP; break; - default: -printf("before-after processing type %d for %s not implemented\n", type, memb_func[mt].sym); - nrn_exit(1); - } - bam = (BAMech*)emalloc(sizeof(BAMech)); - bam->f = f; - bam->type = mt; - bam->next = bamech_[type]; - bamech_[type] = bam; + BAMech* bam; + if (type == -1) + return; + + switch (type) { /* see bablk in src/nmodl/nocpout.c */ + case 11: + type = BEFORE_BREAKPOINT; + break; + case 22: + type = AFTER_SOLVE; + break; + case 13: + type = BEFORE_INITIAL; + break; + case 23: + type = AFTER_INITIAL; + break; + case 14: + type = BEFORE_STEP; + break; + default: + printf("before-after processing type %d for %s not implemented\n", type, + memb_func[mt].sym); + nrn_exit(1); + } + bam = (BAMech*)emalloc(sizeof(BAMech)); + bam->f = f; + bam->type = mt; + bam->next = bamech_[type]; + bamech_[type] = bam; } -void _nrn_thread_reg0(int i, void(*f)(ThreadDatum*)) { - if (i == -1) - return; +void _nrn_thread_reg0(int i, void (*f)(ThreadDatum*)) { + if (i == -1) + return; - memb_func[i].thread_cleanup_ = f; + memb_func[i].thread_cleanup_ = f; } -void _nrn_thread_reg1(int i, 
void(*f)(ThreadDatum*)) { - if (i == -1) - return; +void _nrn_thread_reg1(int i, void (*f)(ThreadDatum*)) { + if (i == -1) + return; - memb_func[i].thread_mem_init_ = f; + memb_func[i].thread_mem_init_ = f; } -void _nrn_thread_table_reg(int i, void(*f)(int, int, double*, Datum*, ThreadDatum*, void*, int)) { - if (i == -1) - return; +void _nrn_thread_table_reg(int i, void (*f)(int, int, double*, Datum*, ThreadDatum*, void*, int)) { + if (i == -1) + return; - memb_func[i].thread_table_check_ = f; + memb_func[i].thread_table_check_ = f; } -void _nrn_setdata_reg(int i, void(*call)(double*, Datum*)) { - if (i == -1) - return; +void _nrn_setdata_reg(int i, void (*call)(double*, Datum*)) { + if (i == -1) + return; - memb_func[i].setdata_ = call; + memb_func[i].setdata_ = call; } diff --git a/coreneuron/nrnoc/solve_core.c b/coreneuron/nrnoc/solve_core.c index ac9b44442..66a98cd12 100644 --- a/coreneuron/nrnoc/solve_core.c +++ b/coreneuron/nrnoc/solve_core.c @@ -28,40 +28,81 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/nrnoc/nrnoc_decl.h" -static void triang(NrnThread*), bksub(NrnThread*); +int use_solve_interleave; + +static void triang(NrnThread *), bksub(NrnThread *); /* solve the matrix equation */ void nrn_solve_minimal(NrnThread* _nt) { - triang(_nt); - bksub(_nt); + if (use_solve_interleave) { + solve_interleaved(_nt->id); + } else { + triang(_nt); + bksub(_nt); + } } +/** TODO loops are executed seq in OpenACC just for debugging, remove it! */ + /* triangularization of the matrix equations */ static void triang(NrnThread* _nt) { - double p; - int i, i2, i3; - i2 = _nt->ncell; - i3 = _nt->end; - for (i = i3 - 1; i >= i2; --i) { - p = VEC_A(i) / VEC_D(i); - VEC_D(_nt->_v_parent_index[i]) -= p * VEC_B(i); - VEC_RHS(_nt->_v_parent_index[i]) -= p * VEC_RHS(i); - } + double p; + int i, i2, i3; + i2 = _nt->ncell; + i3 = _nt->end; + + double* vec_a = &(VEC_A(0)); + double* vec_b = &(VEC_B(0)); + double* vec_d = &(VEC_D(0)); + double* vec_rhs = &(VEC_RHS(0)); + int* parent_index = _nt->_v_parent_index; +#if defined(_OPENACC) + int stream_id = _nt->stream_id; +#endif + +/** @todo: just for benchmarking, otherwise produces wrong results */ +#pragma acc parallel loop seq present( \ + vec_a[0 : i3], vec_b[0 : i3], vec_d[0 : i3], vec_rhs[0 : i3], parent_index[0 : i3]) \ + async(stream_id) if (_nt->compute_gpu) + for (i = i3 - 1; i >= i2; --i) { + p = vec_a[i] / vec_d[i]; + vec_d[parent_index[i]] -= p * vec_b[i]; + vec_rhs[parent_index[i]] -= p * vec_rhs[i]; + } } /* back substitution to finish solving the matrix equations */ static void bksub(NrnThread* _nt) { - int i, i1, i2, i3; - i1 = 0; - i2 = i1 + _nt->ncell; - i3 = _nt->end; - for (i = i1; i < i2; ++i) { - VEC_RHS(i) /= VEC_D(i); - } - for (i = i2; i < i3; ++i) { - VEC_RHS(i) -= VEC_B(i) * VEC_RHS(_nt->_v_parent_index[i]); - VEC_RHS(i) /= VEC_D(i); - } -} + int i, i1, i2, i3; + i1 = 0; + i2 = i1 + _nt->ncell; + i3 = _nt->end; + double* vec_b = &(VEC_B(0)); + double* vec_d = &(VEC_D(0)); + double* vec_rhs = &(VEC_RHS(0)); + int* parent_index = _nt->_v_parent_index; +#if defined(_OPENACC) + int stream_id = _nt->stream_id; +#endif + +/** @todo: just for benchmarking, otherwise produces wrong results */ + #pragma acc parallel loop seq present(vec_d[0 : i2], vec_rhs[0 : i2]) \ + async(stream_id) if (_nt->compute_gpu) + for (i = i1; i < i2; ++i) { + vec_rhs[i] /= vec_d[i]; + } + +/** @todo: just for benchmarking, otherwise produces wrong results */ + #pragma acc parallel 
loop seq present( \ + vec_b[0 : i3], vec_d[0 : i3], vec_rhs[0 : i3], parent_index[0 : i3]) \ + async(stream_id) if (_nt->compute_gpu) + for (i = i2; i < i3; ++i) { + vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]]; + vec_rhs[i] /= vec_d[i]; + } + + #pragma acc wait(stream_id) +} diff --git a/coreneuron/nrnoc/treeset_core.c b/coreneuron/nrnoc/treeset_core.c index a11483ca6..dea111156 100644 --- a/coreneuron/nrnoc/treeset_core.c +++ b/coreneuron/nrnoc/treeset_core.c @@ -29,6 +29,7 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "coreneuron/nrnconf.h" #include "coreneuron/nrnoc/multicore.h" #include "coreneuron/nrnoc/nrnoc_decl.h" +#include "coreneuron/nrniv/nrn_acc_manager.h" /* Fixed step method with threads and cache efficiency. No extracellular, @@ -36,39 +37,61 @@ sparse matrix, multisplit, or legacy features. */ static void nrn_rhs(NrnThread* _nt) { - int i, i1, i2, i3; - NrnThreadMembList* tml; - - i1 = 0; - i2 = i1 + _nt->ncell; - i3 = _nt->end; - - for (i = i1; i < i3; ++i) { - VEC_RHS(i) = 0.; - VEC_D(i) = 0.; - } - - nrn_ba(_nt, BEFORE_BREAKPOINT); - /* note that CAP has no current */ - for (tml = _nt->tml; tml; tml = tml->next) if (memb_func[tml->index].current) { - mod_f_t s = memb_func[tml->index].current; - (*s)(_nt, tml->ml, tml->index); + int i, i1, i2, i3; + NrnThreadMembList* tml; +#if defined(_OPENACC) + int stream_id = _nt->stream_id; +#endif + i1 = 0; + i2 = i1 + _nt->ncell; + i3 = _nt->end; + + double* vec_rhs = &(VEC_RHS(0)); + double* vec_d = &(VEC_D(0)); + double* vec_a = &(VEC_A(0)); + double* vec_b = &(VEC_B(0)); + double* vec_v = &(VEC_V(0)); + int* parent_index = _nt->_v_parent_index; + + #pragma acc parallel loop present(vec_rhs[0 : i3], \ + vec_d[0 : i3]) if (_nt->compute_gpu) async(stream_id) + for (i = i1; i < i3; ++i) { + vec_rhs[i] = 0.; + vec_d[i] = 0.; + } + + nrn_ba(_nt, BEFORE_BREAKPOINT); + /* note that CAP has no current */ + for (tml = _nt->tml; tml; tml = tml->next) + if (memb_func[tml->index].current) { + mod_f_t s = memb_func[tml->index].current; + (*s)(_nt, tml->ml, tml->index); #ifdef DEBUG - if (errno) { -hoc_warning("errno set during calculation of currents", (char*)0); - } + if (errno) { + hoc_warning("errno set during calculation of currents", (char*)0); + } #endif - } - /* now the internal axial currents. - The extracellular mechanism contribution is already done. - rhs += ai_j*(vi_j - vi) - */ - for (i = i2; i < i3; ++i) { - double dv = VEC_V(_nt->_v_parent_index[i]) - VEC_V(i); - /* our connection coefficients are negative so */ - VEC_RHS(i) -= VEC_B(i)*dv; - VEC_RHS(_nt->_v_parent_index[i]) += VEC_A(i)*dv; - } + } + +/* now the internal axial currents. +The extracellular mechanism contribution is already done. 
+ rhs += ai_j*(vi_j - vi) +*/ + #pragma acc parallel loop present( \ + vec_rhs[0 : i3], \ + vec_d[0 : i3], \ + vec_a[0 : i3], \ + vec_b[0 : i3], \ + vec_v[0 : i3], \ + parent_index[0 : i3]) if (_nt->compute_gpu) async(stream_id) + for (i = i2; i < i3; ++i) { + double dv = vec_v[parent_index[i]] - vec_v[i]; +/* our connection coefficients are negative so */ + #pragma acc atomic update + vec_rhs[i] -= vec_b[i] * dv; + #pragma acc atomic update + vec_rhs[parent_index[i]] += vec_a[i] * dv; + } } /* calculate left hand side of @@ -80,42 +103,59 @@ This is a common operation for fixed step, cvode, and daspk methods */ static void nrn_lhs(NrnThread* _nt) { - int i, i1, i2, i3; - NrnThreadMembList* tml; + int i, i1, i2, i3; + NrnThreadMembList* tml; +#if defined(_OPENACC) + int stream_id = _nt->stream_id; +#endif - i1 = 0; - i2 = i1 + _nt->ncell; - i3 = _nt->end; + i1 = 0; + i2 = i1 + _nt->ncell; + i3 = _nt->end; - /* note that CAP has no jacob */ - for (tml = _nt->tml; tml; tml = tml->next) if (memb_func[tml->index].jacob) { - mod_f_t s = memb_func[tml->index].jacob; - (*s)(_nt, tml->ml, tml->index); + /* note that CAP has no jacob */ + for (tml = _nt->tml; tml; tml = tml->next) + if (memb_func[tml->index].jacob) { + mod_f_t s = memb_func[tml->index].jacob; + (*s)(_nt, tml->ml, tml->index); #ifdef DEBUG - if (errno) { -hoc_warning("errno set during calculation of jacobian", (char*)0); - } + if (errno) { + hoc_warning("errno set during calculation of jacobian", (char*)0); + } #endif - } -/* now the cap current can be computed because any change to cm by another model -has taken effect -*/ - /* note, the first is CAP */ - if (_nt->tml) { - assert(_nt->tml->index == CAP); - nrn_cap_jacob(_nt, _nt->tml->ml); - } - - /* now add the axial currents */ - for (i=i2; i < i3; ++i) { - VEC_D(i) -= VEC_B(i); - VEC_D(_nt->_v_parent_index[i]) -= VEC_A(i); } + /* now the cap current can be computed because any change to cm by another model + has taken effect + */ + /* note, the first is CAP */ + if (_nt->tml) { + assert(_nt->tml->index == CAP); + nrn_jacob_capacitance(_nt, _nt->tml->ml, _nt->tml->index); + } + + double* vec_d = &(VEC_D(0)); + double* vec_a = &(VEC_A(0)); + double* vec_b = &(VEC_B(0)); + int* parent_index = _nt->_v_parent_index; + +/* now add the axial currents */ + #pragma acc parallel loop present( \ + vec_d[0 : i3], vec_a[0 : i3], vec_b[0 : i3], parent_index[0 : i3]) if (_nt->compute_gpu) \ + async(stream_id) + for (i = i2; i < i3; ++i) { + #pragma acc atomic update + vec_d[i] -= vec_b[i]; + #pragma acc atomic update + vec_d[parent_index[i]] -= vec_a[i]; + } } /* for the fixed step method */ -void* setup_tree_matrix_minimal(NrnThread* _nt){ - nrn_rhs(_nt); - nrn_lhs(_nt); - return (void*)0; +void* setup_tree_matrix_minimal(NrnThread* _nt) { + nrn_rhs(_nt); + nrn_lhs(_nt); + + // update_matrix_from_gpu(_nt); + + return (void*)0; } diff --git a/coreneuron/nrnomp/nrnomp.c b/coreneuron/nrnomp/nrnomp.c index b0a38f9d7..d8ed1c88a 100644 --- a/coreneuron/nrnomp/nrnomp.c +++ b/coreneuron/nrnomp/nrnomp.c @@ -31,14 +31,12 @@ THE POSSIBILITY OF SUCH DAMAGE. 
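
The two parallel loops above (in nrn_rhs and, further down, nrn_lhs) accumulate axial contributions over a tree: several compartments can share one parent, and a compartment can itself be the parent of another, so every update of the rhs/d vectors inside the unordered loop has to be atomic. A minimal standalone sketch of that pattern follows; the function and parameter names are illustrative only, and copyin/copy clauses are used here so it runs on its own, whereas the patch can use present() because nrn_acc_manager has already placed the arrays on the device.

    /* Accumulate child contributions into parent rows in parallel.
     * Compiled without OpenACC the pragmas are ignored and the loop runs
     * serially; with OpenACC, the atomics prevent races on shared rows. */
    void accumulate_axial(int ncell, int end, const int* parent,
                          const double* a, const double* b,
                          const double* v, double* rhs, double* d) {
        int i;
    #pragma acc parallel loop copyin(parent[0:end], a[0:end], b[0:end], v[0:end]) copy(rhs[0:end], d[0:end])
        for (i = ncell; i < end; ++i) {
            double dv = v[parent[i]] - v[i];
    #pragma acc atomic update
            rhs[i] -= b[i] * dv;          /* row i may be someone's parent */
    #pragma acc atomic update
            rhs[parent[i]] += a[i] * dv;  /* parent row shared by siblings */
    #pragma acc atomic update
            d[i] -= b[i];
    #pragma acc atomic update
            d[parent[i]] -= a[i];
        }
    }
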
#if defined(_OPENMP) #include -#endif // _OPENMP +#endif // _OPENMP -int nrnomp_get_numthreads() -{ +int nrnomp_get_numthreads() { #if defined(_OPENMP) - return (omp_get_max_threads()); + return (omp_get_max_threads()); #else - return 1; + return 1; #endif } - diff --git a/coreneuron/scopmath_core/abort.c b/coreneuron/scopmath_core/abort.c new file mode 100644 index 000000000..906583679 --- /dev/null +++ b/coreneuron/scopmath_core/abort.c @@ -0,0 +1,94 @@ +#include "coreneuron/nrnconf.h" +/****************************************************************************** + * + * File: abort.c + * + * Copyright (c) 1984, 1985, 1986, 1987, 1988, 1989, 1990 + * Duke University + * + ******************************************************************************/ + +#ifndef LINT +static char RCSid[] = "abort.c,v 1.2 1997/08/30 14:32:00 hines Exp"; +#endif + +/*----------------------------------------------------------------------------- + * + * ABORT_RUN() + * + * Prints out an error message and returns to the main menu if a solver + * routine returns a nonzero error code. + * + * Calling sequence: abort_run(code) + * + * Argument: code int flag for error + * + * Returns: + * + * Functions called: abs(), cls(), cursrpos(), puts(), gets() + * + * Files accessed: + *---------------------------------------------------------------------------*/ + +#include +#include +#include "errcodes.h" + +int abort_run(int code) { + switch ((code >= 0) ? code : -code) { + case EXCEED_ITERS: + puts("Convergence not achieved in maximum number of iterations"); + break; + case SINGULAR: + puts("The matrix in the solution method is singular or ill-conditioned"); + break; + case PRECISION: + puts( + "The increment in the independent variable is less than machine " + "roundoff error"); + break; + case CORR_FAIL: + puts("The corrector failed to satisfy the error check"); + break; + case DIVERGED: + puts("The corrector iteration diverged"); + break; + case INCONSISTENT: + puts("Inconsistent boundary conditions"); + puts("Convergence not acheived in maximum number of iterations"); + break; + case BAD_START: + puts("Poor starting estimate for initial conditions"); + puts("The matrix in the solution method is singular or ill-conditioned"); + break; + case NODATA: + puts("No data found in data file"); + break; + case NO_SOLN: + puts("No solution was obtained for the coefficients"); + break; + case LOWMEM: + puts("Insufficient memory to run the model"); + break; + case DIVCHECK: + puts("Attempt to divide by zero"); + break; + case NOFORCE: + puts( + "Could not open forcing function file\nThe model cannot be run " + "without the forcing function"); + break; + case NEG_ARG: + puts("Cannot compute factorial of negative argument"); + break; + case RANGE: + puts( + "Value of variable is outside the range of the forcing function data " + "table"); + break; + default: + puts("Origin of error is unknown"); + } + hoc_execerror("scopmath library error", (char*)0); + return 0; +} diff --git a/coreneuron/scopmath_core/crout_thread.c b/coreneuron/scopmath_core/crout_thread.c new file mode 100644 index 000000000..d8ab7bff5 --- /dev/null +++ b/coreneuron/scopmath_core/crout_thread.c @@ -0,0 +1,235 @@ +#include "coreneuron/mech/cfile/scoplib.h" +#include "coreneuron/scopmath_core/newton_struct.h" +/****************************************************************************** + * + * File: crout.c + * + * Copyright (c) 1987, 1988, 1989, 1990 + * Duke University + * + ******************************************************************************/ 
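
The nrn_crout_thread/nrn_scopmath_solve_thread pair added in this file is the classic scopmath Crout LU factorization with scaled partial pivoting, rewritten so that one call works on a single mechanism instance inside an interleaved (structure-of-arrays) data block: the ix()/_STRIDE macros turn a logical index into the strided offset for that instance. A de-strided reference sketch of the same algorithm is given below to ease reading the strided code; the names (crout, solve, N) and the fixed dimension are illustrative only.

    #include <math.h>

    #define N 3
    #define ROUNDOFF 1e-20

    /* Dense Crout LU with row pivoting recorded in perm[].  After return,
     * the lower triangle of a (diagonal included) holds the factors used to
     * transform the right-hand side and the strict upper triangle holds the
     * unit-diagonal back-substitution factors, exactly as in crout.c. */
    static int crout(int n, double a[N][N], int perm[N]) {
        int i, j, k, r, pivot, save_i = 0;
        double sum, equil_1, equil_2, rowmax[N];

        for (i = 0; i < n; i++) {                /* row maxima for scaling */
            perm[i] = i;
            k = 0;
            for (j = 1; j < n; j++)
                if (fabs(a[i][j]) > fabs(a[i][k]))
                    k = j;
            rowmax[i] = a[i][k];
        }
        for (r = 0; r < n; r++) {
            for (i = r; i < n; i++) {            /* column r of L */
                sum = 0.0;
                for (k = 0; k < r; k++)
                    sum += a[perm[i]][k] * a[perm[k]][r];
                a[perm[i]][r] -= sum;
            }
            pivot = perm[r];                     /* scaled partial pivoting */
            equil_1 = fabs(a[pivot][r] / rowmax[pivot]);
            for (i = r + 1; i < n; i++) {
                equil_2 = fabs(a[perm[i]][r] / rowmax[perm[i]]);
                if (equil_2 > equil_1) {
                    pivot = perm[i];
                    save_i = i;
                    equil_1 = equil_2;
                }
            }
            if (pivot != perm[r]) {              /* swap rows via perm only */
                perm[save_i] = perm[r];
                perm[r] = pivot;
            }
            if (fabs(a[pivot][r]) < ROUNDOFF)
                return 2;                        /* SINGULAR */
            for (j = r + 1; j < n; j++) {        /* row r of U, unit diagonal */
                sum = 0.0;
                for (k = 0; k < r; k++)
                    sum += a[pivot][k] * a[perm[k]][j];
                a[pivot][j] = (a[pivot][j] - sum) / a[pivot][r];
            }
        }
        return 0;                                /* SUCCESS */
    }

    /* Forward then back substitution against the factored matrix. */
    static void solve(int n, double a[N][N], const double b[N],
                      const int perm[N], double x[N]) {
        int i, j;
        double sum;
        for (i = 0; i < n; i++) {
            sum = 0.0;
            for (j = 0; j < i; j++)
                sum += a[perm[i]][j] * x[j];
            x[i] = (b[perm[i]] - sum) / a[perm[i]][i];
        }
        for (i = n - 1; i >= 0; i--) {
            sum = 0.0;
            for (j = i + 1; j < n; j++)
                sum += a[perm[i]][j] * x[j];
            x[i] -= sum;
        }
    }

In the strided version every column index goes through ix(), so many mechanism instances stored side by side can be factored independently, and the acc routine seq annotations allow the same routines to run per instance on the GPU.
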
+ +#ifndef LINT +static char RCSid[] = "crout.c,v 1.2 1999/01/04 12:46:43 hines Exp"; +#endif + +/*--------------------------------------------------------------*/ +/* */ +/* CROUT */ +/* */ +/* Performs an LU triangular factorization of a real matrix */ +/* by the Crout algorithm using partial pivoting. Rows are */ +/* not normalized; implicit equilibration is used. ROUNDOFF */ +/* is the minimal value for a pivot element without its being*/ +/* considered too close to zero (currently set to 1.0E-20). */ +/* */ +/* Returns: 0 if no error; 2 if matrix is singular or ill- */ +/* conditioned */ +/* */ +/* Calling sequence: crout(n, a, perm) */ +/* */ +/* Arguments: */ +/* */ +/* Input: n, integer, number of rows of the matrix */ +/* */ +/* a, double precision matrix to be factored */ +/* */ +/* Output: */ +/* */ +/* a, factors required to transform the constant */ +/* vector in the set of simultaneous equations */ +/* are stored in the lower triangle; factors for */ +/* back substitution are stored in the upper */ +/* triangle. */ +/* */ +/* perm, integer, permutation vector to store row */ +/* interchanges */ +/* */ +/* Functions called: makevector(), freevector() */ +/* */ +/*--------------------------------------------------------------*/ + +#include +#include "coreneuron/scopmath_core/errcodes.h" + +#define ix(arg) ((arg)*_STRIDE) + +/* having a differnt permutation per instance may not be a good idea */ +int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) { + int i, j, k, r, pivot, irow, save_i = 0, krow; + double sum, *rowmax, equil_1, equil_2; + + /* Initialize permutation and rowmax vectors */ + + rowmax = ns->rowmax; + for (i = 0; i < n; i++) { + perm[ix(i)] = i; + k = 0; + for (j = 1; j < n; j++) + if (fabs(a[i][ix(j)]) > fabs(a[i][ix(k)])) + k = j; + rowmax[ix(i)] = a[i][ix(k)]; + } + + /* Loop over rows and columns r */ + + for (r = 0; r < n; r++) { + /* + * Operate on rth column. This produces the lower triangular matrix + * of terms needed to transform the constant vector. + */ + + for (i = r; i < n; i++) { + sum = 0.0; + irow = perm[ix(i)]; + for (k = 0; k < r; k++) { + krow = perm[ix(k)]; + sum += a[irow][ix(k)] * a[krow][ix(r)]; + } + a[irow][ix(r)] -= sum; + } + + /* Find row containing the pivot in the rth column */ + + pivot = perm[ix(r)]; + equil_1 = fabs(a[pivot][ix(r)] / rowmax[ix(pivot)]); + for (i = r + 1; i < n; i++) { + irow = perm[ix(i)]; + equil_2 = fabs(a[irow][ix(r)] / rowmax[ix(irow)]); + if (equil_2 > equil_1) { + /* make irow the new pivot row */ + + pivot = irow; + save_i = i; + equil_1 = equil_2; + } + } + + /* Interchange entries in permutation vector if necessary */ + + if (pivot != perm[ix(r)]) { + perm[ix(save_i)] = perm[ix(r)]; + perm[ix(r)] = pivot; + } + + /* Check that pivot element is not too small */ + + if (fabs(a[pivot][ix(r)]) < ROUNDOFF) + return (SINGULAR); + + /* + * Operate on row in rth position. This produces the upper + * triangular matrix whose diagonal elements are assumed to be unity. + * This matrix is used in the back substitution algorithm. 
+ */ + + for (j = r + 1; j < n; j++) { + sum = 0.0; + for (k = 0; k < r; k++) { + krow = perm[ix(k)]; + sum += a[pivot][ix(k)] * a[krow][ix(j)]; + } + a[pivot][ix(j)] = (a[pivot][ix(j)] - sum) / a[pivot][ix(r)]; + } + } + return (SUCCESS); +} + +/*--------------------------------------------------------------*/ +/* */ +/* SOLVE() */ +/* */ +/* Performs forward substitution algorithm to transform the */ +/* constant vector in the linear simultaneous equations to */ +/* be consistent with the factored matrix. Then performs */ +/* back substitution to find the solution to the simultane- */ +/* ous linear equations. */ +/* */ +/* Returns: no return variable */ +/* */ +/* Calling sequence: solve(n, a, b, perm, p, y) */ +/* */ +/* Arguments: */ +/* */ +/* Input: n, integer, number of rows of the matrix */ +/* */ +/* a, double precision matrix containing the */ +/* factored matrix of coefficients of the linear */ +/* equations. */ +/* */ +/* b, vector of function values */ +/* */ +/* perm, integer, permutation vector to store row */ +/* interchanges */ +/* */ +/* Output: */ +/* */ +/* p[y[i]] contains the solution vector */ +/* */ +/*--------------------------------------------------------------*/ +void nrn_scopmath_solve_thread(int n, + double** a, + double* b, + int* perm, + double* p, + int* y, + _threadargsproto_) +#define y_(arg) _p[y[arg] * _STRIDE] +#define b_(arg) b[ix(arg)] +{ + int i, j, pivot; + double sum; + + /* Perform forward substitution with pivoting */ + // if (y) { // pgacc bug. NULL on cpu but not on GPU + if (0) { + for (i = 0; i < n; i++) { + pivot = perm[ix(i)]; + sum = 0.0; + for (j = 0; j < i; j++) + sum += a[pivot][ix(j)] * (y_(j)); + y_(i) = (b_(pivot) - sum) / a[pivot][ix(i)]; + } + + /* + * Note that the y vector is already in the correct order for back + * substitution. Perform back substitution, pivoting the matrix but not + * the y vector. There is no need to divide by the diagonal element as + * this is assumed to be unity. + */ + + for (i = n - 1; i >= 0; i--) { + pivot = perm[ix(i)]; + sum = 0.0; + for (j = i + 1; j < n; j++) + sum += a[pivot][ix(j)] * (y_(j)); + y_(i) -= sum; + } + } else { + for (i = 0; i < n; i++) { + pivot = perm[ix(i)]; + sum = 0.0; + if (i > 0) { // pgacc bug. with i==0 the following loop executes once + for (j = 0; j < i; j++) { + sum += a[pivot][ix(j)] * (p[ix(j)]); + } + } + p[ix(i)] = (b_(pivot) - sum) / a[pivot][ix(i)]; + } + + /* + * Note that the y vector is already in the correct order for back + * substitution. Perform back substitution, pivoting the matrix but not + * the y vector. There is no need to divide by the diagonal element as + * this is assumed to be unity. 
+ */ + + for (i = n - 1; i >= 0; i--) { + pivot = perm[ix(i)]; + sum = 0.0; + for (j = i + 1; j < n; j++) + sum += a[pivot][ix(j)] * (p[ix(j)]); + p[ix(i)] -= sum; + } + } +} diff --git a/coreneuron/scopmath_core/dimplic.c b/coreneuron/scopmath_core/dimplic.c new file mode 100644 index 000000000..922fa1c2a --- /dev/null +++ b/coreneuron/scopmath_core/dimplic.c @@ -0,0 +1,28 @@ +/* + hopefully a temporary expedient to work around the inability to + pass function pointers as arguments +*/ + +#include "coreneuron/mech/cfile/scoplib.h" +#include "coreneuron/mech/mod2c_core_thread.h" +#include "_kinderiv.h" + +int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) { + difun(fun); + return 0; +} + +int nrn_derivimplic_steer(int fun, _threadargsproto_) { + switch (fun) { _NRN_DERIVIMPLIC_CASES } + return 0; +} + +int nrn_newton_steer(int fun, _threadargsproto_) { + switch (fun) { _NRN_DERIVIMPLIC_CB_CASES } + return 0; +} + +int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) { + switch (fun) { _NRN_KINETIC_CASES } + return 0; +} diff --git a/coreneuron/scopmath_core/errcodes.h b/coreneuron/scopmath_core/errcodes.h new file mode 100644 index 000000000..fa6f76e58 --- /dev/null +++ b/coreneuron/scopmath_core/errcodes.h @@ -0,0 +1,45 @@ +/****************************************************************************** + * + * File: errcodes.h + * + * Copyright (c) 1984, 1985, 1986, 1987, 1988, 1989, 1990 + * Duke University + * + * errcodes.h,v 1.1.1.1 1994/10/12 17:22:18 hines Exp + * + ******************************************************************************/ + +extern int abort_run(int); + +#define ROUNDOFF 1.e-20 +#define ZERO 1.e-8 +#define STEP 1.e-6 +#define CONVERGE 1.e-6 +#define MAXCHANGE 0.05 +#define INITSIMPLEX 0.25 +#define MAXITERS 50 +#define MAXSMPLXITERS 100 +#define MAXSTEPS 20 +#define MAXHALVE 15 +#define MAXORDER 6 +#define MAXTERMS 3 +#define MAXFAIL 10 +#define MAX_JAC_ITERS 20 +#define MAX_GOOD_ORDER 2 +#define MAX_GOOD_STEPS 3 + +#define SUCCESS 0 +#define EXCEED_ITERS 1 +#define SINGULAR 2 +#define PRECISION 3 +#define CORR_FAIL 4 +#define INCONSISTENT 5 +#define BAD_START 6 +#define NODATA 7 +#define NO_SOLN 8 +#define LOWMEM 9 +#define DIVCHECK 10 +#define NOFORCE 11 +#define DIVERGED 12 +#define NEG_ARG 13 +#define RANGE 14 diff --git a/coreneuron/scopmath_core/newton_struct.h b/coreneuron/scopmath_core/newton_struct.h new file mode 100644 index 000000000..4f157d39b --- /dev/null +++ b/coreneuron/scopmath_core/newton_struct.h @@ -0,0 +1,60 @@ +#ifndef newton_struct_h +#define newton_struct_h + +#include "coreneuron/mech/mod2c_core_thread.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* avoid incessant alloc/free memory */ +typedef struct NewtonSpace { + int n; + int n_instance; + double* delta_x; + double** jacobian; + int* perm; + double* high_value; + double* low_value; + double* rowmax; +} NewtonSpace; + +#pragma acc routine seq +extern int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_); + +#pragma acc routine seq +extern void nrn_scopmath_solve_thread(int n, + double** a, + double* value, + int* perm, + double* delta_x, + int* s, + _threadargsproto_); + +#pragma acc routine seq +extern int nrn_newton_thread(NewtonSpace* ns, + int n, + int* s, + NEWTFUN pfunc, + double* value, + _threadargsproto_); + +#pragma acc routine seq +extern void nrn_buildjacobian_thread(NewtonSpace* ns, + int n, + int* s, + NEWTFUN pfunc, + double* value, + double** jacobian, + 
_threadargsproto_); + +extern NewtonSpace* nrn_cons_newtonspace(int n, int n_instance); +extern void nrn_destroy_newtonspace(NewtonSpace* ns); + +void nrn_newtonspace_copyto_device(NewtonSpace* ns); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/coreneuron/scopmath_core/newton_thread.c b/coreneuron/scopmath_core/newton_thread.c new file mode 100644 index 000000000..2c420a892 --- /dev/null +++ b/coreneuron/scopmath_core/newton_thread.c @@ -0,0 +1,246 @@ +#include "coreneuron/mech/cfile/scoplib.h" +#include "coreneuron/scopmath_core/newton_struct.h" +/****************************************************************************** + * + * File: newton.c + * + * Copyright (c) 1987, 1988, 1989, 1990 + * Duke University + * + ******************************************************************************/ + +#ifndef LINT +static char RCSid[] = "newton.c,v 1.3 1999/01/04 12:46:48 hines Exp"; +#endif + +/*------------------------------------------------------------*/ +/* */ +/* NEWTON */ +/* */ +/* Iteratively solves simultaneous nonlinear equations by */ +/* Newton's method, using a Jacobian matrix computed by */ +/* finite differences. */ +/* */ +/* Returns: 0 if no error; 2 if matrix is singular or ill- */ +/* conditioned; 1 if maximum */ +/* iterations exceeded */ +/* */ +/* Calling sequence: newton(n, x, p, pfunc, value) */ +/* */ +/* Arguments: */ +/* */ +/* Input: n, integer, number of variables to solve for. */ +/* */ +/* x, pointer to array of the solution */ +/* vector elements possibly indexed by index */ +/* */ +/* p, array of parameter values */ +/* */ +/* pfunc, pointer to function which computes the */ +/* deviation from zero of each equation in the */ +/* model. */ +/* */ +/* value, pointer to array to array of */ +/* the function values. */ +/* */ +/* Output: x contains the solution value or the most */ +/* recent iteration's result in the event of */ +/* an error. 
*/ +/* */ +/* Functions called: makevector, freevector, makematrix, */ +/* freematrix */ +/* buildjacobian, crout, solve */ +/* */ +/*------------------------------------------------------------*/ + +#include +#include +#include "coreneuron/mech/mod2c_core_thread.h" +#include "coreneuron/scopmath_core/errcodes.h" + +#define ix(arg) ((arg)*_STRIDE) +#define s_(arg) _p[s[arg] * _STRIDE] + +#pragma acc routine seq +int nrn_newton_thread(NewtonSpace* ns, + int n, + int* s, + NEWTFUN pfunc, + double* value, + _threadargsproto_) { + int i, count = 0, error = 0, *perm; + double **jacobian, *delta_x, change = 1.0, max_dev, temp; + int done = 0; + /* + * Create arrays for Jacobian, variable increments, function values, and + * permutation vector + */ + delta_x = ns->delta_x; + jacobian = ns->jacobian; + perm = ns->perm; + /* Iteration loop */ + while (!done) { + if (count++ >= MAXITERS) { + error = EXCEED_ITERS; + done = 2; + } + if (!done && change > MAXCHANGE) { + /* + * Recalculate Jacobian matrix if solution has changed by more + * than MAXCHANGE + */ + + nrn_buildjacobian_thread(ns, n, s, pfunc, value, jacobian, _threadargs_); + for (i = 0; i < n; i++) + value[ix(i)] = -value[ix(i)]; /* Required correction to + * function values */ + error = nrn_crout_thread(ns, n, jacobian, perm, _threadargs_); + if (error != SUCCESS) { + done = 2; + } + } + + if (!done) { + nrn_scopmath_solve_thread(n, jacobian, value, perm, delta_x, (int*)0, _threadargs_); + + /* Update solution vector and compute norms of delta_x and value */ + + change = 0.0; + if (s) { + for (i = 0; i < n; i++) { + if (fabs(s_(i)) > ZERO && (temp = fabs(delta_x[ix(i)] / (s_(i)))) > change) + change = temp; + s_(i) += delta_x[ix(i)]; + } + } else { + for (i = 0; i < n; i++) { + if (fabs(s_(i)) > ZERO && (temp = fabs(delta_x[ix(i)] / (s_(i)))) > change) + change = temp; + s_(i) += delta_x[ix(i)]; + } + } + newtfun(pfunc); /* Evaluate function values with new solution */ + max_dev = 0.0; + for (i = 0; i < n; i++) { + value[ix(i)] = -value[ix(i)]; /* Required correction to function + * values */ + if ((temp = fabs(value[ix(i)])) > max_dev) + max_dev = temp; + } + + /* Check for convergence or maximum iterations */ + + if (change <= CONVERGE && max_dev <= ZERO) { + // break; + done = 1; + } + } + } /* end of while loop */ + + return (error); +} + +/*------------------------------------------------------------*/ +/* */ +/* BUILDJACOBIAN */ +/* */ +/* Creates the Jacobian matrix by computing partial deriv- */ +/* atives by finite central differences. If the column */ +/* variable is nonzero, an increment of 2% of the variable */ +/* is used. STEP is the minimum increment allowed; it is */ +/* currently set to 1.0E-6. */ +/* */ +/* Returns: no return variable */ +/* */ +/* Calling sequence: */ +/* buildjacobian(n, index, x, pfunc, value, jacobian) */ +/* */ +/* Arguments: */ +/* */ +/* Input: n, integer, number of variables */ +/* */ +/* x, pointer to array of addresses of the solution */ +/* vector elements */ +/* */ +/* p, array of parameter values */ +/* */ +/* pfunc, pointer to function which computes the */ +/* deviation from zero of each equation */ +/* in the model. */ +/* */ +/* value, pointer to array of addresses of function */ +/* values */ +/* */ +/* Output: jacobian, double, computed jacobian matrix */ +/* */ +/* Functions called: user-supplied function with argument */ +/* (p) to compute vector of function */ +/* values for each equation. 
*/ +/* makevector(), freevector() */ +/* */ +/*------------------------------------------------------------*/ + +#define max(x, y) (fabs(x) > y ? x : y) + +void nrn_buildjacobian_thread(NewtonSpace* ns, + int n, + int* index, + NEWTFUN pfunc, + double* value, + double** jacobian, + _threadargsproto_) { +#define x_(arg) _p[(arg)*_STRIDE] + int i, j; + double increment, *high_value, *low_value; + + high_value = ns->high_value; + low_value = ns->low_value; + + /* Compute partial derivatives by central finite differences */ + + for (j = 0; j < n; j++) { + increment = max(fabs(0.02 * (x_(index[j]))), STEP); + x_(index[j]) += increment; + newtfun(pfunc); + for (i = 0; i < n; i++) + high_value[ix(i)] = value[ix(i)]; + x_(index[j]) -= 2.0 * increment; + newtfun(pfunc); + for (i = 0; i < n; i++) { + low_value[ix(i)] = value[ix(i)]; + + /* Insert partials into jth column of Jacobian matrix */ + + jacobian[i][ix(j)] = (high_value[ix(i)] - low_value[ix(i)]) / (2.0 * increment); + } + + /* Restore original variable and function values. */ + + x_(index[j]) += increment; + newtfun(pfunc); + } +} + +NewtonSpace* nrn_cons_newtonspace(int n, int n_instance) { + NewtonSpace* ns = (NewtonSpace*)emalloc(sizeof(NewtonSpace)); + ns->n = n; + ns->n_instance = n_instance; + ns->delta_x = makevector(n * n_instance * sizeof(double)); + ns->jacobian = makematrix(n, n * n_instance); + ns->perm = (int*)emalloc((unsigned)(n * n_instance * sizeof(int))); + ns->high_value = makevector(n * n_instance * sizeof(double)); + ns->low_value = makevector(n * n_instance * sizeof(double)); + ns->rowmax = makevector(n * n_instance * sizeof(double)); + nrn_newtonspace_copyto_device(ns); + return ns; +} + +void nrn_destroy_newtonspace(NewtonSpace* ns) { + free((char*)ns->perm); + freevector(ns->delta_x); + freematrix(ns->jacobian); + freevector(ns->high_value); + freevector(ns->low_value); + freevector(ns->rowmax); + free((char*)ns); +} diff --git a/coreneuron/scopmath_core/sparse_thread.c b/coreneuron/scopmath_core/sparse_thread.c new file mode 100644 index 000000000..ebfd56e05 --- /dev/null +++ b/coreneuron/scopmath_core/sparse_thread.c @@ -0,0 +1,853 @@ +#include "coreneuron/mech/cfile/scoplib.h" +#include "coreneuron/nrniv/nrn_assert.h" + +/****************************************************************************** + * + * File: sparse.c + * + * Copyright (c) 1989, 1990 + * Duke University + * + ******************************************************************************/ + +#ifndef LINT +static char RCSid[] = "sparse.c,v 1.7 1998/03/12 13:17:17 hines Exp"; +#endif + +#include +#include "coreneuron/mech/mod2c_core_thread.h" /* _threadargs, _STRIDE, etc. */ +#include "coreneuron/scopmath_core/errcodes.h" + +/* Aug 2016 coreneuron : very different prototype and memory organization */ +/* Jan 2008 thread safe */ +/* 4/23/93 converted to object so many models can use it */ +/*----------------------------------------------------------------------------- + * + * sparse() + * + * Abstract: + * This is an experimental numerical method for SCoP-3 which integrates kinetic + * rate equations. It is intended to be used only by models generated by MODL, + * and its identity is meant to be concealed from the user. 
+ * + * + * Calling sequence: + * sparse(n, s, d, t, dt, fun, prhs, linflag) + * + * Arguments: + * n number of state variables + * s array of pointers to the state variables + * d array of pointers to the derivatives of states + * t pointer to the independent variable + * dt the time step + * fun pointer to the function corresponding to the + * kinetic block equations + * prhs pointer to right hand side vector (answer on return) + * does not have to be allocated by caller. + * linflag solve as linear equations + * when nonlinear, all states are forced >= 0 + * + * + * Returns: nothing + * + * Functions called: IGNORE(), printf(), create_coef_list(), fabs() + * + * Files accessed: none + * +*/ + +#if LINT +#define IGNORE(arg) \ + { \ + if (arg) \ + ; \ + } +#else +#define IGNORE(arg) arg +#endif + +#if __TURBOC__ || VMS +#define Free(arg) myfree((void*)arg) +#else +#define Free(arg) myfree((char*)arg) +#endif +#if 0 +extern void nrn_malloc_lock(); +extern void nrn_malloc_unlock(); +#else +#define nrn_malloc_lock() /**/ +#define nrn_malloc_unlock() /**/ +#endif + +#include +#include +#include +#include + +/* note: solution order refers to the following + diag[varord[row]]->row = row = diag[varord[row]]->col + rowst[varord[row]]->row = row + varord[el->row] < varord[el->c_right->row] + varord[el->col] < varord[el->r_down->col] +*/ + +static int matsol(SparseObj* so, int _iml); +static void subrow(SparseObj* so, Elm* pivot, Elm* rowsub, int _iml); +static void bksub(SparseObj* so, int _iml); +static void prmat(SparseObj* so); +static void initeqn(SparseObj* so, unsigned maxeqn); +static void free_elm(SparseObj* so); +static Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new); +#pragma acc routine seq +double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml); +void* nrn_cons_sparseobj(SPFUN, int, Memb_list*, _threadargsproto_); +static void create_coef_list(SparseObj* so, int n, SPFUN fun, _threadargsproto_); +static void init_coef_list(SparseObj* so, int _iml); +static void init_minorder(SparseObj* so); +static void increase_order(SparseObj* so, unsigned row); +static void reduce_order(SparseObj* so, unsigned row); +static void spar_minorder(SparseObj* so); +static void get_next_pivot(SparseObj* so, unsigned i); +static Item* newitem(); +static List* newlist(); +static void freelist(List* list); +static void linkitem(Item* item, Item* i); +static void insert(SparseObj* so, Item* item); +static void delete (Item* item); +static void* myemalloc(unsigned n); +static void myfree(void*); +static void check_assert(); +static void re_link(SparseObj* so, unsigned i); +static SparseObj* create_sparseobj(); +void _nrn_destroy_sparseobj_thread(SparseObj* so); + +#if defined(_OPENACC) +#undef emalloc +#undef ecalloc +#define emalloc(arg) malloc(arg) +#define ecalloc(arg1, arg2) malloc((arg1) * (arg2)) +#endif + +static Elm* nrn_pool_alloc(void* arg) { + return emalloc(sizeof(Elm)); +} + +/* sparse matrix dynamic allocation: +create_coef_list makes a list for fast setup, does minimum ordering and +ensures all elements needed are present */ +/* this could easily be made recursive but it isn't right now */ + +void* nrn_cons_sparseobj(SPFUN fun, int n, Memb_list* ml, _threadargsproto_) { + SparseObj* so; + // fill in the unset _threadargsproto_ assuming _iml = 0; + _iml = 0; /* from _threadargsproto_ */ + _p = ml->data; + _ppvar = ml->pdata; + _v = _nt->_actual_v[ml->nodeindices[_iml]]; + + so = create_sparseobj(); + so->_cntml_padded = _cntml_padded; + create_coef_list(so, n, fun, 
_threadargs_); + nrn_sparseobj_copyto_device(so); + return so; +} + +int sparse_thread(SparseObj* so, + int n, + int* s, + int* d, + double* t, + double dt, + SPFUN fun, + int linflag, + _threadargsproto_) { +#define ix(arg) ((arg)*_STRIDE) +#define s_(arg) _p[ix(s[arg])] +#define d_(arg) _p[ix(d[arg])] + + int i, j, ierr; + double err; + + for (i = 0; i < n; i++) { /*save old state*/ + d_(i) = s_(i); + } + for (err = 1, j = 0; err > CONVERGE; j++) { + init_coef_list(so, _iml); + spfun(fun, so, so->rhs); + if ((ierr = matsol(so, _iml))) { + return ierr; + } + for (err = 0., i = 1; i <= n; i++) { /* why oh why did I write it from 1 */ + s_(i - 1) += so->rhs[ix(i)]; +#if 1 /* stability of nonlinear kinetic schemes sometimes requires this */ + if (!linflag && s_(i - 1) < 0.) { + s_(i - 1) = 0.; + } +#endif + err += fabs(so->rhs[ix(i)]); + } + if (j > MAXSTEPS) { + return EXCEED_ITERS; + } + if (linflag) + break; + } + init_coef_list(so, _iml); + spfun(fun, so, so->rhs); + for (i = 0; i < n; i++) { /*restore Dstate at t+dt*/ + d_(i) = (s_(i) - d_(i)) / dt; + } + return SUCCESS; +} + +/* for solving ax=b */ +int _cvode_sparse_thread(void** v, int n, int* x, SPFUN fun, _threadargsproto_) +#define x_(arg) _p[x[arg] * _STRIDE] +{ + int i, j, ierr; + SparseObj* so; + + so = (SparseObj*)(*v); + if (!so) { + so = create_sparseobj(); + *v = (void*)so; + } + if (so->oldfun != fun) { + so->oldfun = fun; + create_coef_list(so, n, fun, _threadargs_); /* calls fun twice */ + } + init_coef_list(so, _iml); + spfun(fun, so, so->rhs); + if ((ierr = matsol(so, _iml))) { + return ierr; + } + for (i = 1; i <= n; i++) { /* why oh why did I write it from 1 */ + x_(i - 1) = so->rhs[i]; + } + return SUCCESS; +} + +static int matsol(SparseObj* so, int _iml) { + register Elm *pivot, *el; + unsigned i; + + /* Upper triangularization */ + so->numop = 0; + for (i = 1; i <= so->neqn; i++) { + if (fabs((pivot = so->diag[i])->value[_iml]) <= ROUNDOFF) { + return SINGULAR; + } + /* Eliminate all elements in pivot column */ + for (el = pivot->r_down; el; el = el->r_down) { + subrow(so, pivot, el, _iml); + } + } + bksub(so, _iml); + return (SUCCESS); +} + +static void subrow(SparseObj* so, Elm* pivot, Elm* rowsub, int _iml) { + double r; + register Elm* el; + + int _cntml_padded = so->_cntml_padded; + r = rowsub->value[_iml] / pivot->value[_iml]; + so->rhs[ix(rowsub->row)] -= so->rhs[ix(pivot->row)] * r; + so->numop++; + for (el = pivot->c_right; el; el = el->c_right) { + for (rowsub = rowsub->c_right; rowsub->col != el->col; rowsub = rowsub->c_right) { + ; + } + rowsub->value[_iml] -= el->value[_iml] * r; + so->numop++; + } +} + +static void bksub(SparseObj* so, int _iml) { + unsigned i; + Elm* el; + + int _cntml_padded = so->_cntml_padded; + for (i = so->neqn; i >= 1; i--) { + for (el = so->diag[i]->c_right; el; el = el->c_right) { + so->rhs[ix(el->row)] -= el->value[_iml] * so->rhs[ix(el->col)]; + so->numop++; + } + so->rhs[ix(so->diag[i]->row)] /= so->diag[i]->value[_iml]; + so->numop++; + } +} + +static void prmat(SparseObj* so) { + unsigned i, j; + Elm* el; + + IGNORE(printf("\n ")); + for (i = 10; i <= so->neqn; i += 10) + IGNORE(printf(" %1d", (i % 100) / 10)); + IGNORE(printf("\n ")); + for (i = 1; i <= so->neqn; i++) + IGNORE(printf("%1d", i % 10)); + IGNORE(printf("\n\n")); + for (i = 1; i <= so->neqn; i++) { + IGNORE(printf("%3d %3d ", so->diag[i]->row, i)); + j = 0; + for (el = so->rowst[i]; el; el = el->c_right) { + for (j++; j < so->varord[el->col]; j++) + IGNORE(printf(" ")); + IGNORE(printf("*")); + } + 
IGNORE(printf("\n")); + } + IGNORE(fflush(stdin)); +} + +static void initeqn(SparseObj* so, unsigned maxeqn) /* reallocate space for matrix */ +{ + register unsigned i, nn; + + if (maxeqn == so->neqn) + return; + free_elm(so); + so->neqn = maxeqn; + if (so->rowst) + Free(so->rowst); + if (so->diag) + Free(so->diag); + if (so->varord) + Free(so->varord); + if (so->rhs) + Free(so->rhs); + if (so->ngetcall) + free(so->ngetcall); + so->elmpool = NULL; + so->rowst = so->diag = (Elm**)0; + so->varord = (unsigned*)0; + so->rowst = (Elm**)myemalloc((maxeqn + 1) * sizeof(Elm*)); + so->diag = (Elm**)myemalloc((maxeqn + 1) * sizeof(Elm*)); + so->varord = (unsigned*)myemalloc((maxeqn + 1) * sizeof(unsigned)); + so->rhs = (double*)myemalloc((maxeqn + 1) * so->_cntml_padded * sizeof(double)); + so->ngetcall = (unsigned*)ecalloc(so->_cntml_padded, sizeof(unsigned)); + for (i = 1; i <= maxeqn; i++) { + so->varord[i] = i; + so->diag[i] = (Elm*)nrn_pool_alloc(so->elmpool); + so->diag[i]->value = (double*)ecalloc(so->_cntml_padded, sizeof(double)); + so->rowst[i] = so->diag[i]; + so->diag[i]->row = i; + so->diag[i]->col = i; + so->diag[i]->r_down = so->diag[i]->r_up = ELM0; + so->diag[i]->c_right = so->diag[i]->c_left = ELM0; + } + nn = so->neqn * so->_cntml_padded; + for (i = 0; i < nn; ++i) { + so->rhs[i] = 0.; + } +} + +static void free_elm(SparseObj* so) { + unsigned i; + Elm *el, *elnext; + + /* free all elements */ + for (i = 1; i <= so->neqn; i++) { + so->rowst[i] = ELM0; + so->diag[i] = ELM0; + } +} + +/* see check_assert in minorder for info about how this matrix is supposed +to look. If new is nonzero and an element would otherwise be created, new +is used instead. This is because linking an element is highly nontrivial +The biggest difference is that elements are no longer removed and this +saves much time allocating and freeing during the solve phase +*/ + +static Elm* getelm(SparseObj* so, unsigned row, unsigned col, Elm* new) +/* return pointer to row col element maintaining order in rows */ +{ + register Elm *el, *elnext; + unsigned vrow, vcol; + + vrow = so->varord[row]; + vcol = so->varord[col]; + + if (vrow == vcol) { + return so->diag[vrow]; /* a common case */ + } + if (vrow > vcol) { /* in the lower triangle */ + /* search downward from diag[vcol] */ + for (el = so->diag[vcol];; el = elnext) { + elnext = el->r_down; + if (!elnext) { + break; + } else if (elnext->row == row) { /* found it */ + return elnext; + } else if (so->varord[elnext->row] > vrow) { + break; + } + } + /* insert below el */ + if (!new) { + new = (Elm*)nrn_pool_alloc(so->elmpool); + new->value = (double*)ecalloc(so->_cntml_padded, sizeof(double)); + increase_order(so, row); + } + new->r_down = el->r_down; + el->r_down = new; + new->r_up = el; + if (new->r_down) { + new->r_down->r_up = new; + } + /* search leftward from diag[vrow] */ + for (el = so->diag[vrow];; el = elnext) { + elnext = el->c_left; + if (!elnext) { + break; + } else if (so->varord[elnext->col] < vcol) { + break; + } + } + /* insert to left of el */ + new->c_left = el->c_left; + el->c_left = new; + new->c_right = el; + if (new->c_left) { + new->c_left->c_right = new; + } else { + so->rowst[vrow] = new; + } + } else { /* in the upper triangle */ + /* search upward from diag[vcol] */ + for (el = so->diag[vcol];; el = elnext) { + elnext = el->r_up; + if (!elnext) { + break; + } else if (elnext->row == row) { /* found it */ + return elnext; + } else if (so->varord[elnext->row] < vrow) { + break; + } + } + /* insert above el */ + if (!new) { + new = 
(Elm*)nrn_pool_alloc(so->elmpool); + new->value = (double*)ecalloc(so->_cntml_padded, sizeof(double)); + increase_order(so, row); + } + new->r_up = el->r_up; + el->r_up = new; + new->r_down = el; + if (new->r_up) { + new->r_up->r_down = new; + } + /* search right from diag[vrow] */ + for (el = so->diag[vrow];; el = elnext) { + elnext = el->c_right; + if (!elnext) { + break; + } else if (so->varord[elnext->col] > vcol) { + break; + } + } + /* insert to right of el */ + new->c_right = el->c_right; + el->c_right = new; + new->c_left = el; + if (new->c_right) { + new->c_right->c_left = new; + } + } + new->row = row; + new->col = col; + return new; +} + +double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml) { + Elm* el; + if (!so->phase) { + return so->coef_list[so->ngetcall[_iml]++]; + } + el = getelm(so, (unsigned)row, (unsigned)col, ELM0); + if (so->phase == 1) { + so->ngetcall[_iml]++; + } else { + so->coef_list[so->ngetcall[_iml]++] = el->value; + } + return el->value; +} + +static void create_coef_list(SparseObj* so, int n, SPFUN fun, _threadargsproto_) { + initeqn(so, (unsigned)n); + so->phase = 1; + so->ngetcall[0] = 0; + spfun(fun, so, so->rhs); + if (so->coef_list) { + free(so->coef_list); + } + so->coef_list_size = so->ngetcall[0]; + so->coef_list = (double**)myemalloc(so->ngetcall[0] * sizeof(double*)); + spar_minorder(so); + so->phase = 2; + so->ngetcall[0] = 0; + spfun(fun, so, so->rhs); + so->phase = 0; +} + +static void init_coef_list(SparseObj* so, int _iml) { + unsigned i, icnt; + Elm* el; + + so->ngetcall[_iml] = 0; + for (i = 1; i <= so->neqn; i++) { + for (el = so->rowst[i]; el; el = el->c_right) { + el->value[_iml] = 0.; + } + } +} + +static void init_minorder(SparseObj* so) { + /* matrix has been set up. Construct the orderlist and orderfind + vector. + */ + unsigned i, j; + Elm* el; + + so->do_flag = 1; + if (so->roworder) { + for (i = 1; i <= so->nroworder; ++i) { + Free(so->roworder[i]); + } + Free(so->roworder); + } + so->roworder = (Item**)myemalloc((so->neqn + 1) * sizeof(Item*)); + so->nroworder = so->neqn; + if (so->orderlist) + freelist(so->orderlist); + so->orderlist = newlist(); + for (i = 1; i <= so->neqn; i++) { + so->roworder[i] = newitem(); + } + for (i = 1; i <= so->neqn; i++) { + for (j = 0, el = so->rowst[i]; el; el = el->c_right) { + j++; + } + so->roworder[so->diag[i]->row]->elm = so->diag[i]; + so->roworder[so->diag[i]->row]->norder = j; + insert(so, so->roworder[so->diag[i]->row]); + } +} + +static void increase_order(SparseObj* so, unsigned row) { + /* order of row increases by 1. Maintain the orderlist. */ + Item* order; + + if (!so->do_flag) + return; + order = so->roworder[row]; + delete (order); + order->norder++; + insert(so, order); +} + +static void reduce_order(SparseObj* so, unsigned row) { + /* order of row decreases by 1. Maintain the orderlist. */ + Item* order; + + if (!so->do_flag) + return; + order = so->roworder[row]; + delete (order); + order->norder--; + insert(so, order); +} + +static void spar_minorder(SparseObj* so) { /* Minimum ordering algorithm to determine the order + that the matrix should be solved. Also make sure + all needed elements are present. + This does not mess up the matrix + */ + unsigned i; + + check_assert(so); + init_minorder(so); + for (i = 1; i <= so->neqn; i++) { + get_next_pivot(so, i); + } + so->do_flag = 0; + check_assert(so); +} + +static void get_next_pivot(SparseObj* so, unsigned i) { + /* get varord[i], etc. from the head of the orderlist. 
*/ + Item* order; + Elm *pivot, *el; + unsigned j; + + order = so->orderlist->next; + assert(order != so->orderlist); + + if ((j = so->varord[order->elm->row]) != i) { + /* push order lists down by 1 and put new diag in empty slot */ + assert(j > i); + el = so->rowst[j]; + for (; j > i; j--) { + so->diag[j] = so->diag[j - 1]; + so->rowst[j] = so->rowst[j - 1]; + so->varord[so->diag[j]->row] = j; + } + so->diag[i] = order->elm; + so->rowst[i] = el; + so->varord[so->diag[i]->row] = i; + /* at this point row links are out of order for diag[i]->col + and col links are out of order for diag[i]->row */ + re_link(so, i); + } + + /* now make sure all needed elements exist */ + for (el = so->diag[i]->r_down; el; el = el->r_down) { + for (pivot = so->diag[i]->c_right; pivot; pivot = pivot->c_right) { + IGNORE(getelm(so, el->row, pivot->col, ELM0)); + } + reduce_order(so, el->row); + } + +#if 0 +{int j; Item *or; + printf("%d ", i); + for (or = so->orderlist->next, j=0; j<5 && or != so->orderlist; j++, or=or->next) { + printf("(%d, %d) ", or->elm->row, or->norder); + } + printf("\n"); +} +#endif + delete (order); +} + +/* The following routines support the concept of a list. +modified from modl +*/ + +/* Implementation + The list is a doubly linked list. A special item with element 0 is + always at the tail of the list and is denoted as the List pointer itself. + list->next point to the first item in the list and + list->prev points to the last item in the list. + i.e. the list is circular + Note that in an empty list next and prev points to itself. + +It is intended that this implementation be hidden from the user via the +following function calls. +*/ + +static Item* newitem() { + Item* i; + i = (Item*)myemalloc(sizeof(Item)); + i->prev = ITEM0; + i->next = ITEM0; + i->norder = 0; + i->elm = (Elm*)0; + return i; +} + +static List* newlist() { + Item* i; + i = newitem(); + i->prev = i; + i->next = i; + return (List*)i; +} + +static void freelist(List* list) /*free the list but not the elements*/ +{ + Item *i1, *i2; + for (i1 = list->next; i1 != list; i1 = i2) { + i2 = i1->next; + Free(i1); + } + Free(list); +} + +static void linkitem(Item* item, Item* i) /*link i before item*/ +{ + i->prev = item->prev; + i->next = item; + item->prev = i; + i->prev->next = i; +} + +static void insert(SparseObj* so, Item* item) { + Item* i; + + for (i = so->orderlist->next; i != so->orderlist; i = i->next) { + if (i->norder >= item->norder) { + break; + } + } + linkitem(i, item); +} + +static void delete (Item* item) { + item->next->prev = item->prev; + item->prev->next = item->next; + item->prev = ITEM0; + item->next = ITEM0; +} + +static void* myemalloc(unsigned n) { /* check return from malloc */ + void* p; + nrn_malloc_lock(); + p = malloc(n); + nrn_malloc_unlock(); + if (p == (void*)0) { + abort_run(LOWMEM); + } + return (void*)p; +} + +void myfree(void* ptr) { + nrn_malloc_lock(); + free(ptr); + nrn_malloc_unlock(); +} + +static void check_assert(SparseObj* so) { + /* check that all links are consistent */ + unsigned i; + Elm* el; + + for (i = 1; i <= so->neqn; i++) { + assert(so->diag[i]); + assert(so->diag[i]->row == so->diag[i]->col); + assert(so->varord[so->diag[i]->row] == i); + assert(so->rowst[i]->row == so->diag[i]->row); + for (el = so->rowst[i]; el; el = el->c_right) { + if (el == so->rowst[i]) { + assert(el->c_left == ELM0); + } else { + assert(el->c_left->c_right == el); + assert(so->varord[el->c_left->col] < so->varord[el->col]); + } + } + for (el = so->diag[i]->r_down; el; el = el->r_down) { + 
assert(el->r_up->r_down == el); + assert(so->varord[el->r_up->row] < so->varord[el->row]); + } + for (el = so->diag[i]->r_up; el; el = el->r_up) { + assert(el->r_down->r_up == el); + assert(so->varord[el->r_down->row] > so->varord[el->row]); + } + } +} + +/* at this point row links are out of order for diag[i]->col + and col links are out of order for diag[i]->row */ +static void re_link(SparseObj* so, unsigned i) { + Elm *el, *dright, *dleft, *dup, *ddown, *elnext; + + for (el = so->rowst[i]; el; el = el->c_right) { + /* repair hole */ + if (el->r_up) + el->r_up->r_down = el->r_down; + if (el->r_down) + el->r_down->r_up = el->r_up; + } + + for (el = so->diag[i]->r_down; el; el = el->r_down) { + /* repair hole */ + if (el->c_right) + el->c_right->c_left = el->c_left; + if (el->c_left) + el->c_left->c_right = el->c_right; + else + so->rowst[so->varord[el->row]] = el->c_right; + } + + for (el = so->diag[i]->r_up; el; el = el->r_up) { + /* repair hole */ + if (el->c_right) + el->c_right->c_left = el->c_left; + if (el->c_left) + el->c_left->c_right = el->c_right; + else + so->rowst[so->varord[el->row]] = el->c_right; + } + + /* matrix is consistent except that diagonal row elements are unlinked from + their columns and the diagonal column elements are unlinked from their + rows. + For simplicity discard all knowledge of links and use getelm to relink + */ + so->rowst[i] = so->diag[i]; + dright = so->diag[i]->c_right; + dleft = so->diag[i]->c_left; + dup = so->diag[i]->r_up; + ddown = so->diag[i]->r_down; + so->diag[i]->c_right = so->diag[i]->c_left = ELM0; + so->diag[i]->r_up = so->diag[i]->r_down = ELM0; + for (el = dright; el; el = elnext) { + elnext = el->c_right; + IGNORE(getelm(so, el->row, el->col, el)); + } + for (el = dleft; el; el = elnext) { + elnext = el->c_left; + IGNORE(getelm(so, el->row, el->col, el)); + } + for (el = dup; el; el = elnext) { + elnext = el->r_up; + IGNORE(getelm(so, el->row, el->col, el)); + } + for (el = ddown; el; el = elnext) { + elnext = el->r_down; + IGNORE(getelm(so, el->row, el->col, el)); + } +} + +static SparseObj* create_sparseobj() { + SparseObj* so; + + so = myemalloc(sizeof(SparseObj)); + nrn_malloc_lock(); + nrn_malloc_unlock(); + so->rowst = 0; + so->diag = 0; + so->neqn = 0; + so->_cntml_padded = 0; + so->varord = 0; + so->rhs = 0; + so->oldfun = 0; + so->ngetcall = 0; + so->phase = 0; + so->coef_list = 0; + so->roworder = 0; + so->nroworder = 0; + so->orderlist = 0; + so->do_flag = 0; + + return so; +} + +void _nrn_destroy_sparseobj_thread(SparseObj* so) { + int i; + if (!so) { + return; + } + if (so->rowst) + Free(so->rowst); + if (so->diag) + Free(so->diag); + if (so->varord) + Free(so->varord); + if (so->rhs) + Free(so->rhs); + if (so->coef_list) + Free(so->coef_list); + if (so->roworder) { + for (i = 1; i <= so->nroworder; ++i) { + Free(so->roworder[i]); + } + Free(so->roworder); + } + if (so->orderlist) + freelist(so->orderlist); + Free(so); +} diff --git a/coreneuron/scopmath_core/ssimplic_thread.c b/coreneuron/scopmath_core/ssimplic_thread.c new file mode 100644 index 000000000..555fdeccb --- /dev/null +++ b/coreneuron/scopmath_core/ssimplic_thread.c @@ -0,0 +1,79 @@ +#include "coreneuron/mech/cfile/scoplib.h" +#include "coreneuron/mech/mod2c_core_thread.h" +#include "coreneuron/scopmath_core/errcodes.h" +#define s_(arg) _p[s[arg] * _STRIDE] + +#pragma acc routine seq +static int check_state(int, int*, _threadargsproto_); + +int _ss_sparse_thread(SparseObj* v, + int n, + int* s, + int* d, + double* t, + double dt, + SPFUN fun, + int 
linflag, + _threadargsproto_) { + int err, i; + double ss_dt; + + ss_dt = 1e9; + _modl_set_dt_thread(ss_dt, _nt); + + if (linflag) { /*iterate linear solution*/ + err = sparse_thread(v, n, s, d, t, ss_dt, fun, 0, _threadargs_); + } else { +#define NIT 7 + i = NIT; + err = 0; + while (i) { + err = sparse_thread(v, n, s, d, t, ss_dt, fun, 1, _threadargs_); + if (!err) { + if (check_state(n, s, _threadargs_)) { + err = sparse_thread(v, n, s, d, t, ss_dt, fun, 0, _threadargs_); + } + } + --i; + if (!err) { + i = 0; + } + } + } + + _modl_set_dt_thread(dt, _nt); + return err; +} + +int _ss_derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) { + int err, i; + double dtsav; + + dtsav = _modl_get_dt_thread(_nt); + _modl_set_dt_thread(1e-9, _nt); + + err = derivimplicit_thread(n, slist, dlist, fun, _threadargs_); + + _modl_set_dt_thread(dtsav, _nt); + return err; +} + +static int check_state(int n, int* s, _threadargsproto_) { + int i, flag; + + flag = 1; + for (i = 0; i < n; i++) { + if (s_(i) < -1e-6) { + s_(i) = 0.; + flag = 0; + } + } + return flag; +} + +void _modl_set_dt_thread(double dt, NrnThread* nt) { + nt->_dt = dt; +} +double _modl_get_dt_thread(NrnThread* nt) { + return nt->_dt; +} diff --git a/coreneuron/utils/endianness.h b/coreneuron/utils/endianness.h index 203ae5a50..dde1b8878 100644 --- a/coreneuron/utils/endianness.h +++ b/coreneuron/utils/endianness.h @@ -33,23 +33,25 @@ THE POSSIBILITY OF SUCH DAMAGE. namespace endian { - enum endianness { - little_endian,big_endian,mixed_endian - }; + enum endianness { little_endian, big_endian, mixed_endian }; static const union { unsigned char bytes[4]; uint32_t value; - } endian_check = {{0,1,2,3}}; + } endian_check = {{0, 1, 2, 3}}; - static const enum endianness native_endian= - endian_check.value==0x03020100ul?little_endian: - endian_check.value==0x00010203ul?big_endian: - mixed_endian; + static const enum endianness native_endian = + endian_check.value == 0x03020100ul + ? little_endian + : endian_check.value == 0x00010203ul ? big_endian : mixed_endian; - static inline bool is_little_endian() { return native_endian==little_endian; } - static inline bool is_big_endian() { return native_endian==big_endian; } + static inline bool is_little_endian() { + return native_endian == little_endian; + } + static inline bool is_big_endian() { + return native_endian == big_endian; + } -} // namespace endian +} // namespace endian -#endif // ifndef endianness_h +#endif // ifndef endianness_h diff --git a/coreneuron/utils/memory_utils.cpp b/coreneuron/utils/memory_utils.cpp index bd91c664d..63b372643 100644 --- a/coreneuron/utils/memory_utils.cpp +++ b/coreneuron/utils/memory_utils.cpp @@ -27,7 +27,7 @@ THE POSSIBILITY OF SUCH DAMAGE. */ /** - * @file memory_utils.c + * @file memory_utils.cpp * @date 25th Oct 2014 * * @brief Provides functionality to report current memory usage @@ -51,51 +51,47 @@ THE POSSIBILITY OF SUCH DAMAGE. 
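
The reworked endianness.h above keeps the same four-byte union probe, only reformatted. As a standalone illustration, the probe can be exercised like this (a complete toy program; the header itself only exposes the is_little_endian()/is_big_endian() helpers):

    #include <stdint.h>
    #include <stdio.h>

    /* Bytes {0,1,2,3} read back as one 32-bit word: 0x03020100 on a
     * little-endian host, 0x00010203 on a big-endian one. */
    static const union {
        unsigned char bytes[4];
        uint32_t value;
    } probe = {{0, 1, 2, 3}};

    int main(void) {
        if (probe.value == 0x03020100ul)
            puts("little-endian");
        else if (probe.value == 0x00010203ul)
            puts("big-endian");
        else
            puts("mixed-endian");
        return 0;
    }
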
#include #endif - -double nrn_mallinfo( void ) -{ - double mbs = -1.0; // -ve mem usage if mallinfo is not supported - +double nrn_mallinfo(void) { + double mbs = -1.0; // -ve mem usage if mallinfo is not supported // On BG-Q, Use kernel/memory.h to get heap statistics #ifdef HAVE_MEMORY_H - uint64_t heap = 0; - Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP, &heap); - mbs = heap / (1024.0 * 1024.0); -// if malloc.h available, collect information from mallinfo + uint64_t heap = 0; + Kernel_GetMemorySize(KERNEL_MEMSIZE_HEAP, &heap); + mbs = heap / (1024.0 * 1024.0); +// if malloc.h available, collect information from mallinfo #elif defined HAVE_MALLOC_H - struct mallinfo m; - - m = mallinfo(); - mbs = ( m.hblkhd + m.uordblks ) / (1024.0 * 1024.0); + struct mallinfo m; + + m = mallinfo(); + mbs = (m.hblkhd + m.uordblks) / (1024.0 * 1024.0); #endif - - return mbs; + + return mbs; } +void report_mem_usage(const char* message, bool all_ranks) { + double cur_mem, mem_max, mem_min, mem_avg; // min, max, avg memory -void report_mem_usage( const char *message, bool all_ranks ) -{ - double cur_mem, mem_max, mem_min, mem_avg; //min, max, avg memory - - // current memory usage on this rank - cur_mem = nrn_mallinfo(); - - /* @todo: avoid three all reduce class */ - mem_avg = nrnmpi_dbl_allreduce( cur_mem, 1 ) / nrnmpi_numprocs; - mem_max = nrnmpi_dbl_allreduce( cur_mem, 2 ); - mem_min = nrnmpi_dbl_allreduce( cur_mem, 3 ); - - // all ranks prints information if all_ranks is true - if ( all_ranks ) { - printf( " Memory (MBs) (Rank : %2d) : %30s : Cur %.4lf, Max %.4lf, Min %.4lf, Avg %.4lf \n", \ - nrnmpi_myid, message, cur_mem, mem_max, mem_min, mem_avg ); - } - else if ( nrnmpi_myid == 0 ) { - - printf( " Memory (MBs) : %25s : Max %.4lf, Min %.4lf, Avg %.4lf \n", \ - message, mem_max, mem_min, mem_avg ); - } -} + // current memory usage on this rank + cur_mem = nrn_mallinfo(); +/* @todo: avoid three all reduce class */ +#if NRNMPI + mem_avg = nrnmpi_dbl_allreduce(cur_mem, 1) / nrnmpi_numprocs; + mem_max = nrnmpi_dbl_allreduce(cur_mem, 2); + mem_min = nrnmpi_dbl_allreduce(cur_mem, 3); +#else + mem_avg = mem_max = mem_min = cur_mem; +#endif + // all ranks prints information if all_ranks is true + if (all_ranks) { + printf(" Memory (MBs) (Rank : %2d) : %30s : Cur %.4lf, Max %.4lf, Min %.4lf, Avg %.4lf \n", + nrnmpi_myid, message, cur_mem, mem_max, mem_min, mem_avg); + } else if (nrnmpi_myid == 0) { + printf(" Memory (MBs) : %25s : Max %.4lf, Min %.4lf, Avg %.4lf \n", message, mem_max, + mem_min, mem_avg); + } + fflush(stdout); +} diff --git a/coreneuron/utils/memory_utils.h b/coreneuron/utils/memory_utils.h index 701e89cc9..aa5637d86 100644 --- a/coreneuron/utils/memory_utils.h +++ b/coreneuron/utils/memory_utils.h @@ -45,12 +45,12 @@ THE POSSIBILITY OF SUCH DAMAGE. 
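
report_mem_usage() above turns the per-rank heap figure from nrn_mallinfo() into cluster-wide statistics with three collective reductions (a sum for the average, then max and min), now guarded by NRNMPI so the serial build still works. A raw-MPI sketch of the same pattern is shown below; the function name and the sample value are illustrative, and CoreNeuron itself goes through the nrnmpi_dbl_allreduce wrapper instead:

    #include <mpi.h>
    #include <stdio.h>

    /* Reduce one double per rank to max/min/average; the three collectives
     * mirror the three nrnmpi_dbl_allreduce calls (the "@todo: avoid three
     * all reduce" note hints that they could be fused into a single one). */
    static void report_mem_sketch(double cur_mem) {
        int rank, size;
        double sum, mx, mn;

        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);

        MPI_Allreduce(&cur_mem, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(&cur_mem, &mx, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
        MPI_Allreduce(&cur_mem, &mn, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

        if (rank == 0)
            printf(" Memory (MBs) : Max %.4f, Min %.4f, Avg %.4f\n",
                   mx, mn, sum / size);
    }

    int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
        report_mem_sketch(123.4);        /* pretend per-rank usage in MB */
        MPI_Finalize();
        return 0;
    }
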
* @param all_ranks indicate whether to print info from all ranks * @return Void */ -void report_mem_usage( const char *message, bool all_ranks = false ); +void report_mem_usage(const char* message, bool all_ranks = false); /** @brief Returns current memory usage in KBs * @param Void * @return memory usage in KBs */ -double nrn_mallinfo( void ); +double nrn_mallinfo(void); #endif /* ifndef NRN_MEMORY_UTILS */ diff --git a/coreneuron/utils/randoms/Random123/aes.h b/coreneuron/utils/randoms/Random123/aes.h index 96e3c9cdd..334106683 100644 --- a/coreneuron/utils/randoms/Random123/aes.h +++ b/coreneuron/utils/randoms/Random123/aes.h @@ -49,21 +49,20 @@ typedef struct r123array4x32 aesni4x32_ukey_t; enum r123_enum_aesni1xm128i { aesni1xm128i_rounds = 10 }; /** \cond HIDDEN_FROM_DOXYGEN */ -R123_STATIC_INLINE __m128i AES_128_ASSIST (__m128i temp1, __m128i temp2) { - __m128i temp3; - temp2 = _mm_shuffle_epi32 (temp2 ,0xff); - temp3 = _mm_slli_si128 (temp1, 0x4); - temp1 = _mm_xor_si128 (temp1, temp3); - temp3 = _mm_slli_si128 (temp3, 0x4); - temp1 = _mm_xor_si128 (temp1, temp3); - temp3 = _mm_slli_si128 (temp3, 0x4); - temp1 = _mm_xor_si128 (temp1, temp3); - temp1 = _mm_xor_si128 (temp1, temp2); - return temp1; +R123_STATIC_INLINE __m128i AES_128_ASSIST(__m128i temp1, __m128i temp2) { + __m128i temp3; + temp2 = _mm_shuffle_epi32(temp2, 0xff); + temp3 = _mm_slli_si128(temp1, 0x4); + temp1 = _mm_xor_si128(temp1, temp3); + temp3 = _mm_slli_si128(temp3, 0x4); + temp1 = _mm_xor_si128(temp1, temp3); + temp3 = _mm_slli_si128(temp3, 0x4); + temp1 = _mm_xor_si128(temp1, temp3); + temp1 = _mm_xor_si128(temp1, temp2); + return temp1; } -R123_STATIC_INLINE void aesni1xm128iexpand(aesni1xm128i_ukey_t uk, __m128i ret[11]) -{ +R123_STATIC_INLINE void aesni1xm128iexpand(aesni1xm128i_ukey_t uk, __m128i ret[11]) { __m128i rkey = uk.v[0].m; __m128i tmp2; @@ -109,29 +108,29 @@ R123_STATIC_INLINE void aesni1xm128iexpand(aesni1xm128i_ukey_t uk, __m128i ret[1 ret[10] = rkey; } /** \endcond */ - + #ifdef __cplusplus /** @ingroup AESNI */ -struct aesni1xm128i_key_t{ - __m128i k[11]; - aesni1xm128i_key_t(){ +struct aesni1xm128i_key_t { + __m128i k[11]; + aesni1xm128i_key_t() { aesni1xm128i_ukey_t uk; uk.v[0].m = _mm_setzero_si128(); aesni1xm128iexpand(uk, k); } - aesni1xm128i_key_t(const aesni1xm128i_ukey_t& uk){ + aesni1xm128i_key_t(const aesni1xm128i_ukey_t& uk) { aesni1xm128iexpand(uk, k); } - aesni1xm128i_key_t(const aesni4x32_ukey_t& uk){ + aesni1xm128i_key_t(const aesni4x32_ukey_t& uk) { aesni1xm128i_ukey_t uk128; uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]); aesni1xm128iexpand(uk128, k); } - aesni1xm128i_key_t& operator=(const aesni1xm128i_ukey_t& uk){ + aesni1xm128i_key_t& operator=(const aesni1xm128i_ukey_t& uk) { aesni1xm128iexpand(uk, k); return *this; } - aesni1xm128i_key_t& operator=(const aesni4x32_ukey_t& uk){ + aesni1xm128i_key_t& operator=(const aesni4x32_ukey_t& uk) { aesni1xm128i_ukey_t uk128; uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]); aesni1xm128iexpand(uk128, k); @@ -139,12 +138,10 @@ struct aesni1xm128i_key_t{ } }; #else -typedef struct { - __m128i k[11]; -}aesni1xm128i_key_t; +typedef struct { __m128i k[11]; } aesni1xm128i_key_t; /** @ingroup AESNI */ -R123_STATIC_INLINE aesni1xm128i_key_t aesni1xm128ikeyinit(aesni1xm128i_ukey_t uk){ +R123_STATIC_INLINE aesni1xm128i_key_t aesni1xm128ikeyinit(aesni1xm128i_ukey_t uk) { aesni1xm128i_key_t ret; aesni1xm128iexpand(uk, ret.k); return ret; @@ -165,19 +162,20 @@ R123_STATIC_INLINE aesni1xm128i_ctr_t 
aesni1xm128i(aesni1xm128i_ctr_t in, aesni1 x = _mm_aesenc_si128(x, k.k[9]); x = _mm_aesenclast_si128(x, k.k[10]); { - aesni1xm128i_ctr_t ret; - ret.v[0].m = x; - return ret; + aesni1xm128i_ctr_t ret; + ret.v[0].m = x; + return ret; } } /** @ingroup AESNI */ -R123_STATIC_INLINE aesni1xm128i_ctr_t aesni1xm128i_R(unsigned R, aesni1xm128i_ctr_t in, aesni1xm128i_key_t k){ - R123_ASSERT(R==10); +R123_STATIC_INLINE aesni1xm128i_ctr_t aesni1xm128i_R(unsigned R, + aesni1xm128i_ctr_t in, + aesni1xm128i_key_t k) { + R123_ASSERT(R == 10); return aesni1xm128i(in, k); } - /** @ingroup AESNI */ typedef struct r123array4x32 aesni4x32_ctr_t; /** @ingroup AESNI */ @@ -185,7 +183,7 @@ typedef aesni1xm128i_key_t aesni4x32_key_t; /** @ingroup AESNI */ enum r123_enum_aesni4x32 { aesni4x32_rounds = 10 }; /** @ingroup AESNI */ -R123_STATIC_INLINE aesni4x32_key_t aesni4x32keyinit(aesni4x32_ukey_t uk){ +R123_STATIC_INLINE aesni4x32_key_t aesni4x32keyinit(aesni4x32_ukey_t uk) { aesni1xm128i_ukey_t uk128; aesni4x32_key_t ret; uk128.v[0].m = _mm_set_epi32(uk.v[3], uk.v[2], uk.v[1], uk.v[0]); @@ -194,8 +192,11 @@ R123_STATIC_INLINE aesni4x32_key_t aesni4x32keyinit(aesni4x32_ukey_t uk){ } /** @ingroup AESNI */ -/** The aesni4x32_R function provides a C API to the @ref AESNI "AESNI" CBRNG, allowing the number of rounds to be specified explicitly **/ -R123_STATIC_INLINE aesni4x32_ctr_t aesni4x32_R(unsigned int Nrounds, aesni4x32_ctr_t c, aesni4x32_key_t k){ +/** The aesni4x32_R function provides a C API to the @ref AESNI "AESNI" CBRNG, allowing the number + * of rounds to be specified explicitly **/ +R123_STATIC_INLINE aesni4x32_ctr_t aesni4x32_R(unsigned int Nrounds, + aesni4x32_ctr_t c, + aesni4x32_key_t k) { aesni1xm128i_ctr_t c128; c128.v[0].m = _mm_set_epi32(c.v[3], c.v[2], c.v[1], c.v[0]); c128 = aesni1xm128i_R(Nrounds, c128, k); @@ -205,80 +206,89 @@ R123_STATIC_INLINE aesni4x32_ctr_t aesni4x32_R(unsigned int Nrounds, aesni4x32_c #define aesni4x32_rounds aesni1xm128i_rounds -/** The aesni4x32 macro provides a C API to the @ref AESNI "AESNI" CBRNG, uses the default number of rounds i.e. \c aesni4x32_rounds **/ +/** The aesni4x32 macro provides a C API to the @ref AESNI "AESNI" CBRNG, uses the default number of + * rounds i.e. \c aesni4x32_rounds **/ /** @ingroup AESNI */ -#define aesni4x32(c,k) aesni4x32_R(aesni4x32_rounds, c, k) +#define aesni4x32(c, k) aesni4x32_R(aesni4x32_rounds, c, k) #ifdef __cplusplus -namespace r123{ -/** -@defgroup AESNI ARS and AESNI Classes and Typedefs - -The ARS4x32, ARS1xm128i, AESNI4x32 and AESNI1xm128i classes export the member functions, typedefs and -operator overloads required by a @ref CBRNG "CBRNG" class. - -ARS1xm128i and AESNI1xm128i are based on the AES block cipher and rely on the AES-NI hardware instructions -available on some some new (2011) CPUs. - -The ARS1xm128i CBRNG and the use of AES for random number generation are described in -Parallel Random Numbers: As Easy as 1, 2, 3 . -Although it uses some cryptographic primitives, ARS1xm128i uses a cryptographically weak key schedule and is \b not suitable for cryptographic use. - -@class AESNI1xm128i -@ingroup AESNI -AESNI exports the member functions, typedefs and operator overloads required by a @ref CBRNG class. - -AESNI1xm128i uses the crypotgraphic AES round function, including the cryptographic key schedule. - -In contrast to the other CBRNGs in the Random123 library, the AESNI1xm128i_R::key_type is opaque -and is \b not identical to the AESNI1xm128i_R::ukey_type. 
Creating a key_type, using either the constructor -or assignment operator, is significantly more time-consuming than running the bijection (hundreds -of clock cycles vs. tens of clock cycles). - -AESNI1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which -should occur only when the compiler is configured to generate AES-NI instructions (or -when defaults are overridden by compile-time, compiler-command-line options). - -As of September 2011, the authors know of no statistical flaws with AESNI1xm128i. It -would be an event of major cryptographic note if any such flaws were ever found. -*/ -struct AESNI1xm128i{ - typedef aesni1xm128i_ctr_t ctr_type; - typedef aesni1xm128i_ukey_t ukey_type; - typedef aesni1xm128i_key_t key_type; - static const unsigned int rounds=10; - ctr_type operator()(ctr_type ctr, key_type key) const{ - return aesni1xm128i(ctr, key); - } -}; - -/* @class AESNI4x32 */ -struct AESNI4x32{ - typedef aesni4x32_ctr_t ctr_type; - typedef aesni4x32_ukey_t ukey_type; - typedef aesni4x32_key_t key_type; - static const unsigned int rounds=10; - ctr_type operator()(ctr_type ctr, key_type key) const{ - return aesni4x32(ctr, key); - } -}; - -/** @ingroup AESNI - @class AESNI1xm128i_R - -AESNI1xm128i_R is provided for completeness, but is only instantiable with ROUNDS=10, in -which case it is identical to AESNI1xm128i */ -template -struct AESNI1xm128i_R : public AESNI1xm128i{ - R123_STATIC_ASSERT(ROUNDS==10, "AESNI1xm128i_R is only valid with R=10"); -}; - -/** @class AESNI4x32_R **/ -template -struct AESNI4x32_R : public AESNI4x32{ - R123_STATIC_ASSERT(ROUNDS==10, "AESNI4x32_R is only valid with R=10"); -}; -} // namespace r123 +namespace r123 { + /** + @defgroup AESNI ARS and AESNI Classes and Typedefs + + The ARS4x32, ARS1xm128i, AESNI4x32 and AESNI1xm128i classes export the member functions, + typedefs and + operator overloads required by a @ref CBRNG "CBRNG" class. + + ARS1xm128i and AESNI1xm128i are based on the AES block cipher and rely on the AES-NI hardware + instructions + available on some some new (2011) CPUs. + + The ARS1xm128i CBRNG and the use of AES for random number generation are described in + Parallel Random Numbers: As Easy as 1, + 2, 3 . + Although it uses some cryptographic primitives, ARS1xm128i uses a cryptographically weak key + schedule and is \b not suitable for cryptographic use. + + @class AESNI1xm128i + @ingroup AESNI + AESNI exports the member functions, typedefs and operator overloads required by a @ref CBRNG + class. + + AESNI1xm128i uses the crypotgraphic AES round function, including the cryptographic key + schedule. + + In contrast to the other CBRNGs in the Random123 library, the AESNI1xm128i_R::key_type is opaque + and is \b not identical to the AESNI1xm128i_R::ukey_type. Creating a key_type, using either the + constructor + or assignment operator, is significantly more time-consuming than running the bijection + (hundreds + of clock cycles vs. tens of clock cycles). + + AESNI1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which + should occur only when the compiler is configured to generate AES-NI instructions (or + when defaults are overridden by compile-time, compiler-command-line options). + + As of September 2011, the authors know of no statistical flaws with AESNI1xm128i. It + would be an event of major cryptographic note if any such flaws were ever found. 
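The paragraph above explains that constructing a key_type runs the AES key schedule (hundreds of cycles), while evaluating the bijection costs only tens of cycles, so the key should be built once and reused across many counter values. A minimal sketch of that usage pattern, assuming R123_USE_AES_NI is true and the Random123 headers are on the include path (paths and names below are illustrative, not from the patch):

    #include "Random123/aes.h"

    #if R123_USE_AES_NI
    void aesni_stream_sketch() {
        r123::AESNI4x32 gen;                                     // stateless functor
        r123::AESNI4x32::ukey_type uk = {{12345u, 0u, 0u, 0u}};  // user key: four 32-bit words
        r123::AESNI4x32::key_type key = uk;                      // expensive: AES key schedule, run once
        r123::AESNI4x32::ctr_type ctr = {{0u, 0u, 0u, 0u}};
        for (int i = 0; i < 1000; ++i) {
            r123::AESNI4x32::ctr_type r = gen(ctr, key);         // cheap: one AES block per call
            (void)r;                                             // r carries 4 x 32 bits of output
            ctr.incr();                                          // distinct counter -> distinct output
        }
    }
    #endif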
+ */ + struct AESNI1xm128i { + typedef aesni1xm128i_ctr_t ctr_type; + typedef aesni1xm128i_ukey_t ukey_type; + typedef aesni1xm128i_key_t key_type; + static const unsigned int rounds = 10; + ctr_type operator()(ctr_type ctr, key_type key) const { + return aesni1xm128i(ctr, key); + } + }; + + /* @class AESNI4x32 */ + struct AESNI4x32 { + typedef aesni4x32_ctr_t ctr_type; + typedef aesni4x32_ukey_t ukey_type; + typedef aesni4x32_key_t key_type; + static const unsigned int rounds = 10; + ctr_type operator()(ctr_type ctr, key_type key) const { + return aesni4x32(ctr, key); + } + }; + + /** @ingroup AESNI + @class AESNI1xm128i_R + + AESNI1xm128i_R is provided for completeness, but is only instantiable with ROUNDS=10, in + which case it is identical to AESNI1xm128i */ + template + struct AESNI1xm128i_R : public AESNI1xm128i { + R123_STATIC_ASSERT(ROUNDS == 10, "AESNI1xm128i_R is only valid with R=10"); + }; + + /** @class AESNI4x32_R **/ + template + struct AESNI4x32_R : public AESNI4x32 { + R123_STATIC_ASSERT(ROUNDS == 10, "AESNI4x32_R is only valid with R=10"); + }; +} // namespace r123 #endif /* __cplusplus */ #endif /* R123_USE_AES_NI */ @@ -288,56 +298,55 @@ struct AESNI4x32_R : public AESNI4x32{ typedef struct r123array16x8 aesopenssl16x8_ctr_t; typedef struct r123array16x8 aesopenssl16x8_ukey_t; #ifdef __cplusplus -struct aesopenssl16x8_key_t{ +struct aesopenssl16x8_key_t { AES_KEY k; - aesopenssl16x8_key_t(){ - aesopenssl16x8_ukey_t ukey={{}}; - AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + aesopenssl16x8_key_t() { + aesopenssl16x8_ukey_t ukey = {{}}; + AES_set_encrypt_key((const unsigned char*)&ukey.v[0], 128, &k); } - aesopenssl16x8_key_t(const aesopenssl16x8_ukey_t& ukey){ - AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + aesopenssl16x8_key_t(const aesopenssl16x8_ukey_t& ukey) { + AES_set_encrypt_key((const unsigned char*)&ukey.v[0], 128, &k); } - aesopenssl16x8_key_t& operator=(const aesopenssl16x8_ukey_t& ukey){ - AES_set_encrypt_key((const unsigned char *)&ukey.v[0], 128, &k); + aesopenssl16x8_key_t& operator=(const aesopenssl16x8_ukey_t& ukey) { + AES_set_encrypt_key((const unsigned char*)&ukey.v[0], 128, &k); return *this; } }; #else -typedef struct aesopenssl16x8_key_t{ - AES_KEY k; -}aesopenssl16x8_key_t; -R123_STATIC_INLINE struct aesopenssl16x8_key_t aesopenssl16x8keyinit(aesopenssl16x8_ukey_t uk){ +typedef struct aesopenssl16x8_key_t { AES_KEY k; } aesopenssl16x8_key_t; +R123_STATIC_INLINE struct aesopenssl16x8_key_t aesopenssl16x8keyinit(aesopenssl16x8_ukey_t uk) { aesopenssl16x8_key_t ret; - AES_set_encrypt_key((const unsigned char *)&uk.v[0], 128, &ret.k); + AES_set_encrypt_key((const unsigned char*)&uk.v[0], 128, &ret.k); return ret; } #endif -R123_STATIC_INLINE R123_FORCE_INLINE(aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key)); +R123_STATIC_INLINE R123_FORCE_INLINE( + aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key)); R123_STATIC_INLINE -aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key){ +aesopenssl16x8_ctr_t aesopenssl16x8_R(aesopenssl16x8_ctr_t ctr, aesopenssl16x8_key_t key) { aesopenssl16x8_ctr_t ret; - AES_encrypt((const unsigned char*)&ctr.v[0], (unsigned char *)&ret.v[0], &key.k); + AES_encrypt((const unsigned char*)&ctr.v[0], (unsigned char*)&ret.v[0], &key.k); return ret; } #define aesopenssl16x8_rounds aesni4x32_rounds -#define aesopenssl16x8(c,k) aesopenssl16x8_R(aesopenssl16x8_rounds) +#define 
aesopenssl16x8(c, k) aesopenssl16x8_R(aesopenssl16x8_rounds) #ifdef __cplusplus -namespace r123{ -struct AESOpenSSL16x8{ - typedef aesopenssl16x8_ctr_t ctr_type; - typedef aesopenssl16x8_key_t key_type; - typedef aesopenssl16x8_ukey_t ukey_type; - static const unsigned int rounds=10; - ctr_type operator()(const ctr_type& in, const key_type& k){ - ctr_type out; - AES_encrypt((const unsigned char *)&in[0], (unsigned char *)&out[0], &k.k); - return out; - } -}; -} // namespace r123 +namespace r123 { + struct AESOpenSSL16x8 { + typedef aesopenssl16x8_ctr_t ctr_type; + typedef aesopenssl16x8_key_t key_type; + typedef aesopenssl16x8_ukey_t ukey_type; + static const unsigned int rounds = 10; + ctr_type operator()(const ctr_type& in, const key_type& k) { + ctr_type out; + AES_encrypt((const unsigned char*)&in[0], (unsigned char*)&out[0], &k.k); + return out; + } + }; +} // namespace r123 #endif /* __cplusplus */ #endif /* R123_USE_AES_OPENSSL */ diff --git a/coreneuron/utils/randoms/Random123/array.h b/coreneuron/utils/randoms/Random123/array.h index ab85392d8..e4106bc5e 100644 --- a/coreneuron/utils/randoms/Random123/array.h +++ b/coreneuron/utils/randoms/Random123/array.h @@ -46,7 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -/** @defgroup arrayNxW The r123arrayNxW classes +/** @defgroup arrayNxW The r123arrayNxW classes Each of the r123arrayNxW is a fixed size array of N W-bit unsigned integers. It is functionally equivalent to the C++0x std::array, @@ -56,7 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. it also has a member function, incr(), which increments the zero-th element and carrys overflows into higher indexed elements. Thus, by using incr(), sequences of up to 2^(N*W) distinct values - can be produced. + can be produced. If SSE is supported by the compiler, then the class r123array1xm128i is also defined, in which the data member is an @@ -66,141 +66,207 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ template -inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t *p32){ - value_type v=0; - for(size_t i=0; i<(3+sizeof(value_type))/4; ++i) - v |= ((value_type)(*p32++)) << (32*i); +inline R123_CUDA_DEVICE value_type assemble_from_u32(uint32_t* p32) { + value_type v = 0; + for (size_t i = 0; i < (3 + sizeof(value_type)) / 4; ++i) + v |= ((value_type)(*p32++)) << (32 * i); return v; } // Work-alike methods and typedefs modeled on std::array: -#define CXXMETHODS(_N, W, T) \ - typedef T value_type; \ - typedef T* iterator; \ - typedef const T* const_iterator; \ - typedef value_type& reference; \ - typedef const value_type& const_reference; \ - typedef size_t size_type; \ - typedef ptrdiff_t difference_type; \ - typedef T* pointer; \ - typedef const T* const_pointer; \ - typedef std::reverse_iterator reverse_iterator; \ - typedef std::reverse_iterator const_reverse_iterator; \ - /* Boost.array has static_size. 
C++11 specializes tuple_size */ \ - enum {static_size = _N}; \ - R123_CUDA_DEVICE reference operator[](size_type i){return v[i];} \ - R123_CUDA_DEVICE const_reference operator[](size_type i) const {return v[i];} \ - R123_CUDA_DEVICE reference at(size_type i){ if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ - R123_CUDA_DEVICE const_reference at(size_type i) const { if(i >= _N) R123_THROW(std::out_of_range("array index out of range")); return (*this)[i]; } \ - R123_CUDA_DEVICE size_type size() const { return _N; } \ - R123_CUDA_DEVICE size_type max_size() const { return _N; } \ - R123_CUDA_DEVICE bool empty() const { return _N==0; }; \ - R123_CUDA_DEVICE iterator begin() { return &v[0]; } \ - R123_CUDA_DEVICE iterator end() { return &v[_N]; } \ - R123_CUDA_DEVICE const_iterator begin() const { return &v[0]; } \ - R123_CUDA_DEVICE const_iterator end() const { return &v[_N]; } \ - R123_CUDA_DEVICE const_iterator cbegin() const { return &v[0]; } \ - R123_CUDA_DEVICE const_iterator cend() const { return &v[_N]; } \ - R123_CUDA_DEVICE reverse_iterator rbegin(){ return reverse_iterator(end()); } \ - R123_CUDA_DEVICE const_reverse_iterator rbegin() const{ return const_reverse_iterator(end()); } \ - R123_CUDA_DEVICE reverse_iterator rend(){ return reverse_iterator(begin()); } \ - R123_CUDA_DEVICE const_reverse_iterator rend() const{ return const_reverse_iterator(begin()); } \ - R123_CUDA_DEVICE const_reverse_iterator crbegin() const{ return const_reverse_iterator(cend()); } \ - R123_CUDA_DEVICE const_reverse_iterator crend() const{ return const_reverse_iterator(cbegin()); } \ - R123_CUDA_DEVICE pointer data(){ return &v[0]; } \ - R123_CUDA_DEVICE const_pointer data() const{ return &v[0]; } \ - R123_CUDA_DEVICE reference front(){ return v[0]; } \ - R123_CUDA_DEVICE const_reference front() const{ return v[0]; } \ - R123_CUDA_DEVICE reference back(){ return v[_N-1]; } \ - R123_CUDA_DEVICE const_reference back() const{ return v[_N-1]; } \ - R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const{ \ - /* CUDA3 does not have std::equal */ \ - for (size_t i = 0; i < _N; ++i) \ - if (v[i] != rhs.v[i]) return false; \ - return true; \ - } \ - R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const{ return !(*this == rhs); } \ - /* CUDA3 does not have std::fill_n */ \ - R123_CUDA_DEVICE void fill(const value_type& val){ for (size_t i = 0; i < _N; ++i) v[i] = val; } \ - R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs){ \ - /* CUDA3 does not have std::swap_ranges */ \ - for (size_t i = 0; i < _N; ++i) { \ - T tmp = v[i]; \ - v[i] = rhs.v[i]; \ - rhs.v[i] = tmp; \ - } \ - } \ - R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n=1){ \ - /* This test is tricky because we're trying to avoid spurious \ - complaints about illegal shifts, yet still be compile-time \ - evaulated. 
*/ \ - if(sizeof(T)>((sizeof(T)3?3:0] is to silence \ - a spurious error from icpc \ - */ \ - ++v[_N>1?1:0]; \ - if(_N==2 || R123_BUILTIN_EXPECT(!!v[_N>1?1:0], 1)) return *this; \ - ++v[_N>2?2:0]; \ - if(_N==3 || R123_BUILTIN_EXPECT(!!v[_N>2?2:0], 1)) return *this; \ - ++v[_N>3?3:0]; \ - for(size_t i=4; i<_N; ++i){ \ - if( R123_BUILTIN_EXPECT(!!v[i-1], 1) ) return *this; \ - ++v[i]; \ - } \ - return *this; \ - } \ - /* seed(SeedSeq) would be a constructor if having a constructor */ \ - /* didn't cause headaches with defaults */ \ - template \ - R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq &ss){ \ - r123array##_N##x##W ret; \ - const size_t Ngen = _N*((3+sizeof(value_type))/4); \ - uint32_t u32[Ngen]; \ - uint32_t *p32 = &u32[0]; \ - ss.generate(&u32[0], &u32[Ngen]); \ - for(size_t i=0; i<_N; ++i){ \ - ret.v[i] = assemble_from_u32(p32); \ - p32 += (3+sizeof(value_type))/4; \ - } \ - return ret; \ - } \ -protected: \ - R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n){ \ - /* n may be greater than the maximum value of a single value_type */ \ - value_type vtn; \ - vtn = n; \ - v[0] += n; \ - const unsigned rshift = 8* ((sizeof(n)>sizeof(value_type))? sizeof(value_type) : 0); \ - for(size_t i=1; i<_N; ++i){ \ - if(rshift){ \ - n >>= rshift; \ - }else{ \ - n=0; \ - } \ - if( v[i-1] < vtn ) \ - ++n; \ - if( n==0 ) break; \ - vtn = n; \ - v[i] += n; \ - } \ - return *this; \ - } \ - - +#define CXXMETHODS(_N, W, T) \ + typedef T value_type; \ + typedef T* iterator; \ + typedef const T* const_iterator; \ + typedef value_type& reference; \ + typedef const value_type& const_reference; \ + typedef size_t size_type; \ + typedef ptrdiff_t difference_type; \ + typedef T* pointer; \ + typedef const T* const_pointer; \ + typedef std::reverse_iterator reverse_iterator; \ + typedef std::reverse_iterator const_reverse_iterator; \ + /* Boost.array has static_size. 
C++11 specializes tuple_size */ \ + enum { static_size = _N }; \ + R123_CUDA_DEVICE reference operator[](size_type i) { \ + return v[i]; \ + } \ + R123_CUDA_DEVICE const_reference operator[](size_type i) const { \ + return v[i]; \ + } \ + R123_CUDA_DEVICE reference at(size_type i) { \ + if (i >= _N) \ + R123_THROW(std::out_of_range("array index out of range")); \ + return (*this)[i]; \ + } \ + R123_CUDA_DEVICE const_reference at(size_type i) const { \ + if (i >= _N) \ + R123_THROW(std::out_of_range("array index out of range")); \ + return (*this)[i]; \ + } \ + R123_CUDA_DEVICE size_type size() const { \ + return _N; \ + } \ + R123_CUDA_DEVICE size_type max_size() const { \ + return _N; \ + } \ + R123_CUDA_DEVICE bool empty() const { \ + return _N == 0; \ + }; \ + R123_CUDA_DEVICE iterator begin() { \ + return &v[0]; \ + } \ + R123_CUDA_DEVICE iterator end() { \ + return &v[_N]; \ + } \ + R123_CUDA_DEVICE const_iterator begin() const { \ + return &v[0]; \ + } \ + R123_CUDA_DEVICE const_iterator end() const { \ + return &v[_N]; \ + } \ + R123_CUDA_DEVICE const_iterator cbegin() const { \ + return &v[0]; \ + } \ + R123_CUDA_DEVICE const_iterator cend() const { \ + return &v[_N]; \ + } \ + R123_CUDA_DEVICE reverse_iterator rbegin() { \ + return reverse_iterator(end()); \ + } \ + R123_CUDA_DEVICE const_reverse_iterator rbegin() const { \ + return const_reverse_iterator(end()); \ + } \ + R123_CUDA_DEVICE reverse_iterator rend() { \ + return reverse_iterator(begin()); \ + } \ + R123_CUDA_DEVICE const_reverse_iterator rend() const { \ + return const_reverse_iterator(begin()); \ + } \ + R123_CUDA_DEVICE const_reverse_iterator crbegin() const { \ + return const_reverse_iterator(cend()); \ + } \ + R123_CUDA_DEVICE const_reverse_iterator crend() const { \ + return const_reverse_iterator(cbegin()); \ + } \ + R123_CUDA_DEVICE pointer data() { \ + return &v[0]; \ + } \ + R123_CUDA_DEVICE const_pointer data() const { \ + return &v[0]; \ + } \ + R123_CUDA_DEVICE reference front() { \ + return v[0]; \ + } \ + R123_CUDA_DEVICE const_reference front() const { \ + return v[0]; \ + } \ + R123_CUDA_DEVICE reference back() { \ + return v[_N - 1]; \ + } \ + R123_CUDA_DEVICE const_reference back() const { \ + return v[_N - 1]; \ + } \ + R123_CUDA_DEVICE bool operator==(const r123array##_N##x##W& rhs) const { \ + /* CUDA3 does not have std::equal */ \ + for (size_t i = 0; i < _N; ++i) \ + if (v[i] != rhs.v[i]) \ + return false; \ + return true; \ + } \ + R123_CUDA_DEVICE bool operator!=(const r123array##_N##x##W& rhs) const { \ + return !(*this == rhs); \ + } \ + /* CUDA3 does not have std::fill_n */ \ + R123_CUDA_DEVICE void fill(const value_type& val) { \ + for (size_t i = 0; i < _N; ++i) \ + v[i] = val; \ + } \ + R123_CUDA_DEVICE void swap(r123array##_N##x##W& rhs) { \ + /* CUDA3 does not have std::swap_ranges */ \ + for (size_t i = 0; i < _N; ++i) { \ + T tmp = v[i]; \ + v[i] = rhs.v[i]; \ + rhs.v[i] = tmp; \ + } \ + } \ + R123_CUDA_DEVICE r123array##_N##x##W& incr(R123_ULONG_LONG n = 1) { \ + /* This test is tricky because we're trying to avoid spurious \ + complaints about illegal shifts, yet still be compile-time \ + evaulated. */ \ + if (sizeof(T) < sizeof(n) && n >> ((sizeof(T) < sizeof(n)) ? 8 * sizeof(T) : 0)) \ + return incr_carefully(n); \ + if (n == 1) { \ + ++v[0]; \ + if (_N == 1 || R123_BUILTIN_EXPECT(!!v[0], 1)) \ + return *this; \ + } else { \ + v[0] += n; \ + if (_N == 1 || R123_BUILTIN_EXPECT(n <= v[0], 1)) \ + return *this; \ + } \ + /* We expect that the N==?? 
tests will be \ + constant-folded/optimized away by the compiler, so only the \ + overflow tests (!!v[i]) remain to be done at runtime. For \ + small values of N, it would be better to do this as an \ + uncondtional sequence of adc. An experiment/optimization \ + for another day... \ + N.B. The weird subscripting: v[_N>3?3:0] is to silence \ + a spurious error from icpc \ + */ \ + ++v[_N > 1 ? 1 : 0]; \ + if (_N == 2 || R123_BUILTIN_EXPECT(!!v[_N > 1 ? 1 : 0], 1)) \ + return *this; \ + ++v[_N > 2 ? 2 : 0]; \ + if (_N == 3 || R123_BUILTIN_EXPECT(!!v[_N > 2 ? 2 : 0], 1)) \ + return *this; \ + ++v[_N > 3 ? 3 : 0]; \ + for (size_t i = 4; i < _N; ++i) { \ + if (R123_BUILTIN_EXPECT(!!v[i - 1], 1)) \ + return *this; \ + ++v[i]; \ + } \ + return *this; \ + } \ + /* seed(SeedSeq) would be a constructor if having a constructor */ \ + /* didn't cause headaches with defaults */ \ + template \ + R123_CUDA_DEVICE static r123array##_N##x##W seed(SeedSeq& ss) { \ + r123array##_N##x##W ret; \ + const size_t Ngen = _N * ((3 + sizeof(value_type)) / 4); \ + uint32_t u32[Ngen]; \ + uint32_t* p32 = &u32[0]; \ + ss.generate(&u32[0], &u32[Ngen]); \ + for (size_t i = 0; i < _N; ++i) { \ + ret.v[i] = assemble_from_u32(p32); \ + p32 += (3 + sizeof(value_type)) / 4; \ + } \ + return ret; \ + } \ + \ + protected: \ + R123_CUDA_DEVICE r123array##_N##x##W& incr_carefully(R123_ULONG_LONG n) { \ + /* n may be greater than the maximum value of a single value_type */ \ + value_type vtn; \ + vtn = n; \ + v[0] += n; \ + const unsigned rshift = 8 * ((sizeof(n) > sizeof(value_type)) ? sizeof(value_type) : 0); \ + for (size_t i = 1; i < _N; ++i) { \ + if (rshift) { \ + n >>= rshift; \ + } else { \ + n = 0; \ + } \ + if (v[i - 1] < vtn) \ + ++n; \ + if (n == 0) \ + break; \ + vtn = n; \ + v[i] += n; \ + } \ + return *this; \ + } + // There are several tricky considerations for the insertion and extraction // operators: // - we would like to be able to print r123array16x8 as a sequence of 16 integers, @@ -210,69 +276,73 @@ protected: \ // lots of ambiguity problems with automatic promotions. 
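A small illustration of the carry propagation implemented by incr()/incr_carefully() above, using the r123array2x32 type instantiated later in this header (a sketch, assuming the header is reachable as Random123/array.h):

    #include "Random123/array.h"
    #include <cassert>

    void incr_carry_sketch() {
        r123array2x32 c = {{0xFFFFFFFFu, 0u}};
        c.incr();                              // v[0] wraps to 0, carry lands in v[1]
        assert(c.v[0] == 0u && c.v[1] == 1u);
        c.incr(5);                             // bulk increments follow the same carry logic
        assert(c.v[0] == 5u && c.v[1] == 1u);
    }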
// Solution: r123arrayinsertable and r123arrayextractable -template -struct r123arrayinsertable{ +template +struct r123arrayinsertable { const T& v; - r123arrayinsertable(const T& t_) : v(t_) {} - friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + r123arrayinsertable(const T& t_) : v(t_) { + } + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t) { return os << t.v; } }; -template<> -struct r123arrayinsertable{ +template <> +struct r123arrayinsertable { const uint8_t& v; - r123arrayinsertable(const uint8_t& t_) : v(t_) {} - friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t){ + r123arrayinsertable(const uint8_t& t_) : v(t_) { + } + friend std::ostream& operator<<(std::ostream& os, const r123arrayinsertable& t) { return os << (int)t.v; } }; -template -struct r123arrayextractable{ +template +struct r123arrayextractable { T& v; - r123arrayextractable(T& t_) : v(t_) {} - friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + r123arrayextractable(T& t_) : v(t_) { + } + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t) { return is >> t.v; } }; -template<> -struct r123arrayextractable{ +template <> +struct r123arrayextractable { uint8_t& v; - r123arrayextractable(uint8_t& t_) : v(t_) {} - friend std::istream& operator>>(std::istream& is, r123arrayextractable& t){ + r123arrayextractable(uint8_t& t_) : v(t_) { + } + friend std::istream& operator>>(std::istream& is, r123arrayextractable& t) { int i; - is >> i; + is >> i; t.v = i; return is; } }; -#define CXXOVERLOADS(_N, W, T) \ - \ -inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a){ \ - os << r123arrayinsertable(a.v[0]); \ - for(size_t i=1; i<_N; ++i) \ - os << " " << r123arrayinsertable(a.v[i]); \ - return os; \ -} \ - \ -inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a){ \ - for(size_t i=0; i<_N; ++i){ \ - r123arrayextractable x(a.v[i]); \ - is >> x; \ - } \ - return is; \ -} \ - \ -namespace r123{ \ - typedef r123array##_N##x##W Array##_N##x##W; \ -} - +#define CXXOVERLOADS(_N, W, T) \ + \ + inline std::ostream& operator<<(std::ostream& os, const r123array##_N##x##W& a) { \ + os << r123arrayinsertable(a.v[0]); \ + for (size_t i = 1; i < _N; ++i) \ + os << " " << r123arrayinsertable(a.v[i]); \ + return os; \ + } \ + \ + inline std::istream& operator>>(std::istream& is, r123array##_N##x##W& a) { \ + for (size_t i = 0; i < _N; ++i) { \ + r123arrayextractable x(a.v[i]); \ + is >> x; \ + } \ + return is; \ + } \ + \ + namespace r123 { \ + typedef r123array##_N##x##W Array##_N##x##W; \ + } + #endif /* __cplusplus */ -/* _r123array_tpl expands to a declaration of struct r123arrayNxW. +/* _r123array_tpl expands to a declaration of struct r123arrayNxW. In C, it's nothing more than a struct containing an array of N objects of type T. @@ -285,42 +355,41 @@ namespace r123{ \ a typedef equivalent to r123arrayNxW. 
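As the comment above says, in a C translation unit _r123array_tpl produces nothing more than a struct holding the array; the r123array4x32 instantiation further down amounts to roughly this (a sketch of the expansion, not text from the patch):

    /* C view of _r123array_tpl(4, 32, uint32_t): */
    struct r123array4x32 {
        uint32_t v[4];
    };
    /* In C++, the CXXMETHODS block additionally pastes in the std::array-like
       members (operator[], begin()/end(), incr(), seed(), ...). */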
*/ -#define _r123array_tpl(_N, W, T) \ - /** @ingroup arrayNxW */ \ - /** @see arrayNxW */ \ -struct r123array##_N##x##W{ \ - T v[_N]; \ - CXXMETHODS(_N, W, T) \ -}; \ - \ -CXXOVERLOADS(_N, W, T) +#define _r123array_tpl(_N, W, T) \ + /** @ingroup arrayNxW */ \ + /** @see arrayNxW */ \ + struct r123array##_N##x##W { \ + T v[_N]; \ + CXXMETHODS(_N, W, T) \ + }; \ + \ + CXXOVERLOADS(_N, W, T) /** @endcond */ -_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */ -_r123array_tpl(2, 32, uint32_t) /* r123array2x32 */ -_r123array_tpl(4, 32, uint32_t) /* r123array4x32 */ -_r123array_tpl(8, 32, uint32_t) /* r123array8x32 */ +_r123array_tpl(1, 32, uint32_t) /* r123array1x32 */ + _r123array_tpl(2, 32, uint32_t) /* r123array2x32 */ + _r123array_tpl(4, 32, uint32_t) /* r123array4x32 */ + _r123array_tpl(8, 32, uint32_t) /* r123array8x32 */ -_r123array_tpl(1, 64, uint64_t) /* r123array1x64 */ -_r123array_tpl(2, 64, uint64_t) /* r123array2x64 */ -_r123array_tpl(4, 64, uint64_t) /* r123array4x64 */ + _r123array_tpl(1, 64, uint64_t) /* r123array1x64 */ + _r123array_tpl(2, 64, uint64_t) /* r123array2x64 */ + _r123array_tpl(4, 64, uint64_t) /* r123array4x64 */ -_r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */ + _r123array_tpl(16, 8, uint8_t) /* r123array16x8 for ARSsw, AESsw */ #if R123_USE_SSE -_r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */ + _r123array_tpl(1, m128i, r123m128i) /* r123array1x128i for ARSni, AESni */ #endif /* In C++, it's natural to use sizeof(a::value_type), but in C it's pretty convoluted to figure out the width of the value_type of an r123arrayNxW: */ -#define R123_W(a) (8*sizeof(((a *)0)->v[0])) +#define R123_W(a) (8 * sizeof(((a*)0)->v[0])) /** @namespace r123 - Most of the Random123 C++ API is contained in the r123 namespace. + Most of the Random123 C++ API is contained in the r123 namespace. */ #endif - diff --git a/coreneuron/utils/randoms/Random123/ars.h b/coreneuron/utils/randoms/Random123/ars.h index a027b6fe0..765076fad 100644 --- a/coreneuron/utils/randoms/Random123/ars.h +++ b/coreneuron/utils/randoms/Random123/ars.h @@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif /** @ingroup AESNI */ -enum r123_enum_ars1xm128i {ars1xm128i_rounds = ARS1xm128i_DEFAULT_ROUNDS}; +enum r123_enum_ars1xm128i { ars1xm128i_rounds = ARS1xm128i_DEFAULT_ROUNDS }; /* ARS1xm128i with Weyl keys. Fast, and Crush-resistant, but NOT CRYPTO. */ /** @ingroup AESNI */ @@ -52,10 +52,14 @@ typedef struct r123array1xm128i ars1xm128i_key_t; /** @ingroup AESNI */ typedef struct r123array1xm128i ars1xm128i_ukey_t; /** @ingroup AESNI */ -R123_STATIC_INLINE ars1xm128i_key_t ars1xm128ikeyinit(ars1xm128i_ukey_t uk) { return uk; } +R123_STATIC_INLINE ars1xm128i_key_t ars1xm128ikeyinit(ars1xm128i_ukey_t uk) { + return uk; +} /** @ingroup AESNI */ -R123_STATIC_INLINE ars1xm128i_ctr_t ars1xm128i_R(unsigned int Nrounds, ars1xm128i_ctr_t in, ars1xm128i_key_t k){ - __m128i kweyl = _mm_set_epi64x(R123_64BIT(0xBB67AE8584CAA73B), /* sqrt(3) - 1.0 */ +R123_STATIC_INLINE ars1xm128i_ctr_t ars1xm128i_R(unsigned int Nrounds, + ars1xm128i_ctr_t in, + ars1xm128i_key_t k) { + __m128i kweyl = _mm_set_epi64x(R123_64BIT(0xBB67AE8584CAA73B), /* sqrt(3) - 1.0 */ R123_64BIT(0x9E3779B97F4A7C15)); /* golden ratio */ /* N.B. 
the aesenc instructions do the xor *after* // so if we want to follow the AES pattern, we @@ -63,40 +67,40 @@ R123_STATIC_INLINE ars1xm128i_ctr_t ars1xm128i_R(unsigned int Nrounds, ars1xm128 __m128i kk = k.v[0].m; __m128i v = _mm_xor_si128(in.v[0].m, kk); ars1xm128i_ctr_t ret; - R123_ASSERT(Nrounds<=10); - if( Nrounds>1 ){ + R123_ASSERT(Nrounds <= 10); + if (Nrounds > 1) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>2 ){ + if (Nrounds > 2) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>3 ){ + if (Nrounds > 3) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>4 ){ + if (Nrounds > 4) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>5 ){ + if (Nrounds > 5) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>6 ){ + if (Nrounds > 6) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>7 ){ + if (Nrounds > 7) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>8 ){ + if (Nrounds > 8) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } - if( Nrounds>9 ){ + if (Nrounds > 9) { kk = _mm_add_epi64(kk, kweyl); v = _mm_aesenc_si128(v, kk); } @@ -108,8 +112,9 @@ R123_STATIC_INLINE ars1xm128i_ctr_t ars1xm128i_R(unsigned int Nrounds, ars1xm128 /** @def ars1xm128i @ingroup AESNI -The ars1mx128i macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default number of rounds i.e. \c ars1xm128i_rounds **/ -#define ars1xm128i(c,k) ars1xm128i_R(ars1xm128i_rounds, c, k) +The ars1mx128i macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default +number of rounds i.e. \c ars1xm128i_rounds **/ +#define ars1xm128i(c, k) ars1xm128i_R(ars1xm128i_rounds, c, k) /** @ingroup AESNI */ typedef struct r123array4x32 ars4x32_ctr_t; @@ -118,11 +123,13 @@ typedef struct r123array4x32 ars4x32_key_t; /** @ingroup AESNI */ typedef struct r123array4x32 ars4x32_ukey_t; /** @ingroup AESNI */ -enum r123_enum_ars4x32 {ars4x32_rounds = ARS1xm128i_DEFAULT_ROUNDS}; +enum r123_enum_ars4x32 { ars4x32_rounds = ARS1xm128i_DEFAULT_ROUNDS }; /** @ingroup AESNI */ -R123_STATIC_INLINE ars4x32_key_t ars4x32keyinit(ars4x32_ukey_t uk) { return uk; } +R123_STATIC_INLINE ars4x32_key_t ars4x32keyinit(ars4x32_ukey_t uk) { + return uk; +} /** @ingroup AESNI */ -R123_STATIC_INLINE ars4x32_ctr_t ars4x32_R(unsigned int Nrounds, ars4x32_ctr_t c, ars4x32_key_t k){ +R123_STATIC_INLINE ars4x32_ctr_t ars4x32_R(unsigned int Nrounds, ars4x32_ctr_t c, ars4x32_key_t k) { ars1xm128i_ctr_t c128; ars1xm128i_key_t k128; c128.v[0].m = _mm_set_epi32(c.v[3], c.v[2], c.v[1], c.v[0]); @@ -134,68 +141,70 @@ R123_STATIC_INLINE ars4x32_ctr_t ars4x32_R(unsigned int Nrounds, ars4x32_ctr_t c /** @def ars4x32 @ingroup AESNI -The ars4x32 macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default number of rounds i.e. \c ars4x32_rounds **/ -#define ars4x32(c,k) ars4x32_R(ars4x32_rounds, c, k) +The ars4x32 macro provides a C API interface to the @ref AESNI "ARS" CBRNG with the default number +of rounds i.e. \c ars4x32_rounds **/ +#define ars4x32(c, k) ars4x32_R(ars4x32_rounds, c, k) #ifdef __cplusplus -namespace r123{ -/** -@ingroup AESNI - -ARS1xm128i_R exports the member functions, typedefs and operator overloads required by a @ref CBRNG class. - -ARS1xm128i uses the crypotgraphic AES round function, but a @b non-cryptographc key schedule -to save time and space. 
- -ARS1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which -should occur only when the compiler is configured to generate AES-NI instructions (or -when defaults are overridden by compile-time, compiler-command-line options). - -The template argument, ROUNDS, is the number of times the ARS round -functions will be applied. - -As of September 2011, the authors know of no statistical flaws with -ROUNDS=5 or more. - -@class ARS1xm128i_R - -*/ -template -struct ARS1xm128i_R{ - typedef ars1xm128i_ctr_t ctr_type; - typedef ars1xm128i_key_t key_type; - typedef ars1xm128i_key_t ukey_type; - static const unsigned int rounds=ROUNDS; - R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ - return ars1xm128i_R(ROUNDS, ctr, key); - } -}; - -/** @class ARS4x32_R +namespace r123 { + /** @ingroup AESNI -*/ -template -struct ARS4x32_R{ - typedef ars4x32_ctr_t ctr_type; - typedef ars4x32_key_t key_type; - typedef ars4x32_key_t ukey_type; - static const unsigned int rounds=ROUNDS; - R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ - return ars4x32_R(ROUNDS, ctr, key); - } -}; -/** -@ingroup AESNI + ARS1xm128i_R exports the member functions, typedefs and operator overloads required by a @ref + CBRNG class. + + ARS1xm128i uses the crypotgraphic AES round function, but a @b non-cryptographc key schedule + to save time and space. + + ARS1xm128i is only available when the feature-test macro R123_USE_AES_NI is true, which + should occur only when the compiler is configured to generate AES-NI instructions (or + when defaults are overridden by compile-time, compiler-command-line options). + + The template argument, ROUNDS, is the number of times the ARS round + functions will be applied. + + As of September 2011, the authors know of no statistical flaws with + ROUNDS=5 or more. + + @class ARS1xm128i_R + + */ + template + struct ARS1xm128i_R { + typedef ars1xm128i_ctr_t ctr_type; + typedef ars1xm128i_key_t key_type; + typedef ars1xm128i_key_t ukey_type; + static const unsigned int rounds = ROUNDS; + R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const) { + return ars1xm128i_R(ROUNDS, ctr, key); + } + }; + + /** @class ARS4x32_R + @ingroup AESNI + */ + + template + struct ARS4x32_R { + typedef ars4x32_ctr_t ctr_type; + typedef ars4x32_key_t key_type; + typedef ars4x32_key_t ukey_type; + static const unsigned int rounds = ROUNDS; + R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const) { + return ars4x32_R(ROUNDS, ctr, key); + } + }; + /** + @ingroup AESNI -@class ARS1xm128i_R - ARS1xm128i is equivalent to ARS1xm128i_R<7>. With 7 rounds, - the ARS1xm128i CBRNG has a considerable safety margin over the minimum number - of rounds with no known statistical flaws, but still has excellent - performance. */ -typedef ARS1xm128i_R ARS1xm128i; -typedef ARS4x32_R ARS4x32; -} // namespace r123 + @class ARS1xm128i_R + ARS1xm128i is equivalent to ARS1xm128i_R<7>. With 7 rounds, + the ARS1xm128i CBRNG has a considerable safety margin over the minimum number + of rounds with no known statistical flaws, but still has excellent + performance. 
*/ + typedef ARS1xm128i_R ARS1xm128i; + typedef ARS4x32_R ARS4x32; +} // namespace r123 #endif /* __cplusplus */ diff --git a/coreneuron/utils/randoms/Random123/features/clangfeatures.h b/coreneuron/utils/randoms/Random123/features/clangfeatures.h index 908aee8b0..1b1f7bbdd 100644 --- a/coreneuron/utils/randoms/Random123/features/clangfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/clangfeatures.h @@ -33,7 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define __clangfeatures_dot_hpp #ifndef R123_USE_X86INTRIN_H -#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__))) +#define R123_USE_X86INTRIN_H ((defined(__x86_64__) || defined(__i386__))) #endif #ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS @@ -56,7 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // #define R123_USE_CXX11_RANDOM __has_include() // dumps core. #ifndef R123_USE_CXX11_RANDOM -#if __cplusplus>=201103L && __has_include() +#if __cplusplus >= 201103L && __has_include() #define R123_USE_CXX11_RANDOM 1 #else #define R123_USE_CXX11_RANDOM 0 @@ -64,7 +64,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_USE_CXX11_TYPE_TRAITS -#if __cplusplus>=201103L && __has_include() +#if __cplusplus >= 201103L && __has_include() #define R123_USE_CXX11_TYPE_TRAITS 1 #else #define R123_USE_CXX11_TYPE_TRAITS 0 diff --git a/coreneuron/utils/randoms/Random123/features/compilerfeatures.h b/coreneuron/utils/randoms/Random123/features/compilerfeatures.h index a92cd5ce2..a9f922816 100644 --- a/coreneuron/utils/randoms/Random123/features/compilerfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/compilerfeatures.h @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. The Random123 library is portable across C, C++, CUDA, OpenCL environments, and multiple operating systems (Linux, Windows 7, Mac OS X, FreeBSD, Solaris). This level of portability requires the abstraction of some features -and idioms that are either not standardized (e.g., asm statments), or for which +and idioms that are either not standardized (e.g., asm statments), or for which different vendors have their own standards (e.g., SSE intrinsics) or for which vendors simply refuse to conform to well-established standards (e.g., ). @@ -55,12 +55,13 @@ Most of the symbols are boolean valued. In general, they will Library users can override any value by defining the pp-symbol with a compiler option, e.g., - cc -DR123_USE_MULHILO64_C99 + cc -DR123_USE_MULHILO64_C99 will use a strictly c99 version of the full-width 64x64->128-bit multiplication function, even if it would be disabled by default. -All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the prefix R123_USE_ +All boolean-valued pre-processor symbols in Random123/features/compilerfeatures.h start with the +prefix R123_USE_ @verbatim AES_NI AES_OPENSSL @@ -83,8 +84,8 @@ All boolean-valued pre-processor symbols in Random123/features/compilerfeatures. CXX11_UNRESTRICTED_UNIONS CXX11_EXPLICIT_CONVERSIONS CXX11_LONG_LONG - CXX11 - + CXX11 + X86INTRIN_H IA32INTRIN_H XMMINTRIN_H @@ -101,7 +102,7 @@ All boolean-valued pre-processor symbols in Random123/features/compilerfeatures. MULHILO64_C99 U01_DOUBLE - + @endverbatim Most have obvious meanings. Some non-obvious ones: @@ -140,11 +141,11 @@ There are also non-boolean valued symbols:

  • R123_STATIC_INLINE - According to both C99 and GNU99, the 'static inline' declaration allows - the compiler to not emit code if the function is not used. + the compiler to not emit code if the function is not used. Note that the semantics of 'inline', 'static' and 'extern' in gcc have changed over time and are subject to modification by command line options, e.g., -std=gnu89, -fgnu-inline. - Nevertheless, it appears that the meaning of 'static inline' + Nevertheless, it appears that the meaning of 'static inline' has not changed over time and (with a little luck) the use of 'static inline' here will be portable between versions of gcc and to other C99 compilers. @@ -156,7 +157,7 @@ There are also non-boolean valued symbols: embellishments to strongly encourage that the declared function be inlined. If there is no such compiler-specific magic, it should expand to decl, unadorned. - +
  • R123_CUDA_DEVICE - which expands to __device__ (or something else with sufficiently similar semantics) when CUDA is in use, and expands to nothing in other cases. @@ -187,12 +188,13 @@ There are also non-boolean valued symbols: \cond HIDDEN_FROM_DOXYGEN */ -/* +/* N.B. When something is added to the list of features, it should be added to each of the *features.h files, AND to examples/ut_features.cpp. */ -/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. */ +/* N.B. most other compilers (icc, nvcc, open64, llvm) will also define __GNUC__, so order matters. + */ #if defined(__OPENCL_VERSION__) && __OPENCL_VERSION__ > 0 #include "openclfeatures.h" #elif defined(__CUDACC__) @@ -201,20 +203,22 @@ added to each of the *features.h files, AND to examples/ut_features.cpp. #include "iccfeatures.h" #elif defined(__xlC__) #include "xlcfeatures.h" +#elif defined(__PGI) +#include "pgccfeatures.h" #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) #include "sunprofeatures.h" #elif defined(__OPEN64__) #include "open64features.h" #elif defined(__clang__) #include "clangfeatures.h" +#elif defined(_CRAYC) +#include "crayfeatures.h" +#elif defined(__FCC_VERSION) || defined(__FUJITSU) +#include "fujitsufeatures.h" #elif defined(__GNUC__) #include "gccfeatures.h" -#elif defined(__PGI) -#include "pgccfeatures.h" #elif defined(_MSC_FULL_VER) #include "msvcfeatures.h" -#elif defined(_CRAYC) -#include "crayfeatures.h" #else #error "Can't identify compiler. You'll need to add a new xxfeatures.hpp" { /* maybe an unbalanced brace will terminate the compilation */ @@ -268,8 +272,8 @@ added to each of the *features.h files, AND to examples/ut_features.cpp. #if R123_USE_CXX11_STATIC_ASSERT #define R123_STATIC_ASSERT(expr, msg) static_assert(expr, msg) #else - /* if msg always_looked_like_this, we could paste it into the name. Worth it? */ -#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr))*2-1] +/* if msg always_looked_like_this, we could paste it into the name. Worth it? */ +#define R123_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr)) * 2 - 1] #endif #endif @@ -282,7 +286,10 @@ added to each of the *features.h files, AND to examples/ut_features.cpp. #endif #ifndef R123_USE_PHILOX_64BIT -#define R123_USE_PHILOX_64BIT (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || R123_USE_MULHILO64_MULHI_INTRIN) +#define R123_USE_PHILOX_64BIT \ + (R123_USE_MULHILO64_ASM || R123_USE_MULHILO64_MSVC_INTRIN || R123_USE_MULHILO64_CUDA_INTRIN || \ + R123_USE_GNU_UINT128 || R123_USE_MULHILO64_C99 || R123_USE_MULHILO64_OPENCL_INTRIN || \ + R123_USE_MULHILO64_MULHI_INTRIN) #endif #ifndef R123_ULONG_LONG @@ -304,7 +311,7 @@ added to each of the *features.h files, AND to examples/ut_features.cpp. #endif #ifndef R123_THROW -#define R123_THROW(x) throw (x) +#define R123_THROW(x) throw(x) #endif /* diff --git a/coreneuron/utils/randoms/Random123/features/crayfeatures.h b/coreneuron/utils/randoms/Random123/features/crayfeatures.h index fe5a477ad..d6ea4c96b 100644 --- a/coreneuron/utils/randoms/Random123/features/crayfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/crayfeatures.h @@ -38,7 +38,7 @@ with cray compiler. 
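The pre-C++11 fallback for R123_STATIC_ASSERT shown above turns a false condition into a negative array size, which any C or C++ compiler rejects at compile time. A small illustration using a hypothetical macro of the same shape (not code from the patch):

    /* expr true  -> typedef char static_assertion[1];   accepted
       expr false -> typedef char static_assertion[-1];  compile-time error */
    #define DEMO_STATIC_ASSERT(expr, msg) typedef char static_assertion[(!!(expr)) * 2 - 1]

    DEMO_STATIC_ASSERT(sizeof(char) == 1, "chars are one byte");      /* compiles */
    /* DEMO_STATIC_ASSERT(sizeof(char) == 2, "...") would not compile: array size -1 */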
#endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) expr +#define R123_BUILTIN_EXPECT(expr, likely) expr #endif #ifndef R123_USE_WMMINTRIN_H diff --git a/coreneuron/utils/randoms/Random123/features/fujitsufeatures.h b/coreneuron/utils/randoms/Random123/features/fujitsufeatures.h new file mode 100644 index 000000000..b36aab86b --- /dev/null +++ b/coreneuron/utils/randoms/Random123/features/fujitsufeatures.h @@ -0,0 +1,96 @@ +/* +Copyright (c) 2014 EPFL-BBP, All rights reserved. + +THIS SOFTWARE IS PROVIDED BY THE BLUE BRAIN PROJECT "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE BLUE BRAIN PROJECT +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Note: Minimum/Initial version derived from openclfeatures.h to work +with cray compiler. +*/ + +#ifndef __crayfeatures_dot_hpp +#define __crayfeatures_dot_hpp + +#ifndef R123_STATIC_INLINE +#define R123_STATIC_INLINE static __inline +#endif + +#ifndef R123_FORCE_INLINE +#define R123_FORCE_INLINE(decl) decl +#endif + +#ifndef R123_CUDA_DEVICE +#define R123_CUDA_DEVICE +#endif + +#ifndef R123_ASSERT +#include +#define R123_ASSERT(x) assert(x) +#endif + +#ifndef R123_BUILTIN_EXPECT +#define R123_BUILTIN_EXPECT(expr, likely) expr +#endif + +#ifndef R123_USE_WMMINTRIN_H +#define R123_USE_WMMINTRIN_H 0 +#endif + +#ifndef R123_USE_INTRIN_H +#define R123_USE_INTRIN_H 0 +#endif + +#ifndef R123_USE_MULHILO32_ASM +#define R123_USE_MULHILO32_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_ASM +#define R123_USE_MULHILO64_ASM 0 +#endif + +#ifndef R123_USE_MULHILO64_MSVC_INTRIN +#define R123_USE_MULHILO64_MSVC_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_CUDA_INTRIN +#define R123_USE_MULHILO64_CUDA_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#endif + +#ifndef R123_USE_MULHILO64_MULHI_INTRIN +#define R123_USE_MULHILO64_MULHI_INTRIN (defined(__powerpc64__)) +#endif + +#ifndef R123_MULHILO64_MULHI_INTRIN +#define R123_MULHILO64_MULHI_INTRIN __mulhdu +#endif + +#ifndef R123_USE_MULHILO32_MULHI_INTRIN +#define R123_USE_MULHILO32_MULHI_INTRIN 0 +#endif + +#ifndef R123_MULHILO32_MULHI_INTRIN +#define R123_MULHILO32_MULHI_INTRIN __mulhwu +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif +#include +#ifndef UINT64_C +#error UINT64_C not defined. You must define __STDC_CONSTANT_MACROS before you #include +#endif + +#endif diff --git a/coreneuron/utils/randoms/Random123/features/gccfeatures.h b/coreneuron/utils/randoms/Random123/features/gccfeatures.h index d6bb06088..dc551342f 100644 --- a/coreneuron/utils/randoms/Random123/features/gccfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/gccfeatures.h @@ -32,16 +32,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
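fujitsufeatures.h above routes the 64-bit MULHILO building block to the __mulhdu high-word multiply on powerpc64. Whichever R123_USE_MULHILO64_* path a platform selects, the operation being abstracted is the full 64x64 -> 128-bit product split into low and high halves; a sketch of the GNU __uint128_t variant (an assumption about the toolchain, not code from this patch):

    #include <stdint.h>

    /* Return the low 64 bits of a*b and store the high 64 bits in *hip. */
    static inline uint64_t mulhilo64_sketch(uint64_t a, uint64_t b, uint64_t* hip) {
        __uint128_t product = (__uint128_t)a * (__uint128_t)b;
        *hip = (uint64_t)(product >> 64);
        return (uint64_t)product;
    }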
#ifndef __gccfeatures_dot_hpp #define __gccfeatures_dot_hpp -#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#define R123_GNUC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) -# error "This code has only been tested on x86 and powerpc platforms." +#error "This code has only been tested on x86 and powerpc platforms." #include { /* maybe an unbalanced brace will terminate the compilation */ - /* Feel free to try the Random123 library on other architectures by changing - the conditions that reach this error, but you should consider it a - porting exercise and expect to encounter bugs and deficiencies. - Please let the authors know of any successes (or failures). */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ #endif #ifdef __powerpc__ @@ -70,7 +70,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#define R123_BUILTIN_EXPECT(expr, likely) __builtin_expect(expr, likely) #endif /* According to the C++0x standard, we should be able to test the numeric @@ -80,7 +80,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. earlier versions, the only way to detect whether --std=c++0x was requested on the command line is to look at the __GCC_EXPERIMENTAL_CXX0X__ pp-symbol. */ -#define GNU_CXX11 (__cplusplus>=201103L || (R123_GNUC_VERSION<40700 && defined(__GCC_EXPERIMENTAL_CXX0X__) )) +#define GNU_CXX11 \ + (__cplusplus >= 201103L || (R123_GNUC_VERSION < 40700 && defined(__GCC_EXPERIMENTAL_CXX0X__))) #ifndef R123_USE_CXX11_UNRESTRICTED_UNIONS #define R123_USE_CXX11_UNRESTRICTED_UNIONS ((R123_GNUC_VERSION >= 40600) && GNU_CXX11) @@ -99,11 +100,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_USE_CXX11_RANDOM -#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION>=40500) && GNU_CXX11) +#define R123_USE_CXX11_RANDOM ((R123_GNUC_VERSION >= 40500) && GNU_CXX11) #endif #ifndef R123_USE_CXX11_TYPE_TRAITS -#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION>=40400) && GNU_CXX11) +#define R123_USE_CXX11_TYPE_TRAITS ((R123_GNUC_VERSION >= 40400) && GNU_CXX11) #endif #ifndef R123_USE_AES_NI @@ -158,7 +159,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_USE_ASM_GNU -#define R123_USE_ASM_GNU (defined(__x86_64__)||defined(__i386__)) +#define R123_USE_ASM_GNU (defined(__x86_64__) || defined(__i386__)) #endif #ifndef R123_USE_CPUID_MSVC @@ -166,7 +167,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_USE_X86INTRIN_H -#define R123_USE_X86INTRIN_H ((defined(__x86_64__)||defined(__i386__)) && R123_GNUC_VERSION >= 40402) +#define R123_USE_X86INTRIN_H \ + ((defined(__x86_64__) || defined(__i386__)) && R123_GNUC_VERSION >= 40402) #endif #ifndef R123_USE_IA32INTRIN_H @@ -178,7 +180,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_USE_EMMINTRIN_H -/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have +/* gcc -m64 on Solaris 10 defines __SSE2__ but doesn't have emmintrin.h in the include search path. 
This is so broken that I refuse to try to work around it. If this affects you, figure out where your emmintrin.h lives and diff --git a/coreneuron/utils/randoms/Random123/features/iccfeatures.h b/coreneuron/utils/randoms/Random123/features/iccfeatures.h index b64e5c229..d18356532 100644 --- a/coreneuron/utils/randoms/Random123/features/iccfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/iccfeatures.h @@ -33,11 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define __icpcfeatures_dot_hpp // icc relies on gcc libraries and other toolchain components. -#define R123_GNUC_VERSION (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#define R123_GNUC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) #if !defined(__x86_64__) && !defined(__i386__) -# error "This code has only been tested on x86 platforms." -{ // maybe an unbalanced brace will terminate the compilation +#error "This code has only been tested on x86 platforms." +{ // maybe an unbalanced brace will terminate the compilation // You are invited to try Easy123 on other architectures, by changing // the conditions that reach this error, but you should consider it a // porting exercise and expect to encounter bugs and deficiencies. @@ -62,7 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#define R123_BUILTIN_EXPECT(expr, likely) __builtin_expect(expr, likely) #endif // The basic idiom is: @@ -119,7 +119,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // AES-NI Whitepaper by Gueron says that icc has supported AES-NI from // 11.1 onwards. // -#define R123_USE_AES_NI ((__ICC>=1101) && defined(__AES__)) +#define R123_USE_AES_NI ((__ICC >= 1101) && defined(__AES__)) #endif #ifndef R123_USE_AES_OPENSSL diff --git a/coreneuron/utils/randoms/Random123/features/msvcfeatures.h b/coreneuron/utils/randoms/Random123/features/msvcfeatures.h index 9eb952091..2e719021d 100644 --- a/coreneuron/utils/randoms/Random123/features/msvcfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/msvcfeatures.h @@ -37,8 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //#endif #if !defined(_M_IX86) && !defined(_M_X64) -# error "This code has only been tested on x86 platforms." -{ // maybe an unbalanced brace will terminate the compilation +#error "This code has only been tested on x86 platforms." +{ // maybe an unbalanced brace will terminate the compilation // You are invited to try Random123 on other architectures, by changing // the conditions that reach this error, but you should consider it a // porting exercise and expect to encounter bugs and deficiencies. @@ -63,7 +63,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) expr +#define R123_BUILTIN_EXPECT(expr, likely) expr #endif // The basic idiom is: @@ -192,8 +192,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #error UINT64_C not defined. 
You must define __STDC_CONSTANT_MACROS before you #include #endif -#pragma warning(disable:4244) -#pragma warning(disable:4996) +#pragma warning(disable : 4244) +#pragma warning(disable : 4996) // If you add something, it must go in all the other XXfeatures.hpp // and in ../ut_features.cpp diff --git a/coreneuron/utils/randoms/Random123/features/nvccfeatures.h b/coreneuron/utils/randoms/Random123/features/nvccfeatures.h index 711babf88..8fc0bea97 100644 --- a/coreneuron/utils/randoms/Random123/features/nvccfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/nvccfeatures.h @@ -37,7 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if CUDART_VERSION < 4010 -#error "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h" +#error \ + "CUDA versions earlier than 4.1 produce incorrect results for some templated functions in namespaces. Random123 isunsupported. See comments in nvccfeatures.h" // This test was added in Random123-1.08 (August, 2013) because we // discovered that Ftype(maxTvalue()) with Ftype=double and // T=uint64_t in examples/uniform.hpp produces -1 for CUDA4.0 and @@ -63,11 +64,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_ASSERT -#define R123_ASSERT(x) if((x)) ; else asm("trap;") +#define R123_ASSERT(x) \ + if ((x)) \ + ; \ + else \ + asm("trap;") #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) expr +#define R123_BUILTIN_EXPECT(expr, likely) expr #endif #ifndef R123_USE_AES_NI @@ -98,7 +103,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef R123_THROW // No exceptions in CUDA, at least upto 4.0 -#define R123_THROW(x) R123_ASSERT(0) +#define R123_THROW(x) R123_ASSERT(0) #endif #if defined(__GNUC__) diff --git a/coreneuron/utils/randoms/Random123/features/openclfeatures.h b/coreneuron/utils/randoms/Random123/features/openclfeatures.h index af03d3092..6751ed6d9 100644 --- a/coreneuron/utils/randoms/Random123/features/openclfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/openclfeatures.h @@ -49,7 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) expr +#define R123_BUILTIN_EXPECT(expr, likely) expr #endif #ifndef R123_USE_GNU_UINT128 @@ -82,7 +82,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // But these typedefs mean we cannot include stdint.h with // these headers? Do we need R123_64T, R123_32T, R123_8T? typedef ulong uint64_t; -typedef uint uint32_t; +typedef uint uint32_t; typedef uchar uint8_t; #define UINT64_C(x) ((ulong)(x##UL)) diff --git a/coreneuron/utils/randoms/Random123/features/pgccfeatures.h b/coreneuron/utils/randoms/Random123/features/pgccfeatures.h index 18ace1353..33d5fa0c5 100644 --- a/coreneuron/utils/randoms/Random123/features/pgccfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/pgccfeatures.h @@ -46,13 +46,13 @@ so as not to confuse it with the version available from LANL. #define __pgccfeatures_dot_hpp #if !defined(__x86_64__) && !defined(__i386__) -# error "This code has only been tested on x86 platforms." +#error "This code has only been tested on x86 platforms." 
#include { /* maybe an unbalanced brace will terminate the compilation */ - /* Feel free to try the Random123 library on other architectures by changing - the conditions that reach this error, but you should consider it a - porting exercise and expect to encounter bugs and deficiencies. - Please let the authors know of any successes (or failures). */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ #endif #ifndef R123_STATIC_INLINE @@ -74,7 +74,7 @@ so as not to confuse it with the version available from LANL. #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) (expr) +#define R123_BUILTIN_EXPECT(expr, likely) (expr) #endif /* PGI through 13.2 doesn't appear to support AES-NI. */ diff --git a/coreneuron/utils/randoms/Random123/features/sse.h b/coreneuron/utils/randoms/Random123/features/sse.h index 88efd65f1..5794c87fe 100644 --- a/coreneuron/utils/randoms/Random123/features/sse.h +++ b/coreneuron/utils/randoms/Random123/features/sse.h @@ -64,21 +64,20 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if R123_USE_ASM_GNU /* bit25 of CX tells us whether AES is enabled. */ -R123_STATIC_INLINE int haveAESNI(){ +R123_STATIC_INLINE int haveAESNI() { unsigned int eax, ebx, ecx, edx; - __asm__ __volatile__ ("cpuid": "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : - "a" (1)); - return (ecx>>25) & 1; + __asm__ __volatile__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) : "a"(1)); + return (ecx >> 25) & 1; } #elif R123_USE_CPUID_MSVC -R123_STATIC_INLINE int haveAESNI(){ +R123_STATIC_INLINE int haveAESNI() { int CPUInfo[4]; __cpuid(CPUInfo, 1); - return (CPUInfo[2]>>25)&1; + return (CPUInfo[2] >> 25) & 1; } #else /* R123_USE_CPUID_??? */ #warning "No R123_USE_CPUID_XXX method chosen. haveAESNI will always return false" -R123_STATIC_INLINE int haveAESNI(){ +R123_STATIC_INLINE int haveAESNI() { return 0; } #endif /* R123_USE_ASM_GNU || R123_USE_CPUID_MSVC */ @@ -90,13 +89,13 @@ R123_STATIC_INLINE int haveAESNI(){ // R123_USE_feature tests for each of these in each of the // compilerfeatures.h files we just keep the complexity localized // to here... -#if (defined(__ICC) && __ICC<1210) || (defined(_MSC_VER) && !defined(_WIN64)) -/* Is there an intrinsic to assemble an __m128i from two 64-bit words? +#if (defined(__ICC) && __ICC < 1210) || (defined(_MSC_VER) && !defined(_WIN64)) +/* Is there an intrinsic to assemble an __m128i from two 64-bit words? If not, use the 4x32-bit intrisic instead. N.B. It looks like Intel added _mm_set_epi64x to icc version 12.1 in Jan 2012. */ -R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){ - union{ +R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0) { + union { uint64_t u64; uint32_t u32[2]; } u1, u0; @@ -116,36 +115,36 @@ R123_STATIC_INLINE __m128i _mm_set_epi64x(uint64_t v1, uint64_t v0){ without the 'x'. 
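A sketch (not from the patch) of the same CPUID-based AES-NI probe that haveAESNI() above performs, written with GCC/Clang's <cpuid.h> helper instead of inline assembly. Bit 25 of ECX from CPUID leaf 1 reports AES-NI support; like the surrounding headers, this assumes an x86 target.

#include <stdio.h>
#include <cpuid.h>

static int have_aesni_demo(void) {
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;           /* CPUID leaf 1 unavailable */
    return (ecx >> 25) & 1; /* ECX bit 25 == AES-NI */
}

int main(void) {
    printf("AES-NI available: %d\n", have_aesni_demo());
    return 0;
}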
*/ #if !defined(__x86_64__) || defined(_MSC_VER) || defined(__OPEN64__) -R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ - union{ +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si) { + union { uint64_t u64[2]; __m128i m; - }u; + } u; _mm_store_si128(&u.m, si); return u.u64[0]; } #elif defined(__llvm__) || defined(__ICC) -R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si) { return (uint64_t)_mm_cvtsi128_si64(si); } #else /* GNUC, others */ /* FWIW, gcc's emmintrin.h has had the 'x' spelling since at least gcc-3.4.4. The no-'x' spelling showed up around 4.2. */ -R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si){ +R123_STATIC_INLINE uint64_t _mm_extract_lo64(__m128i si) { return (uint64_t)_mm_cvtsi128_si64x(si); } #endif #if defined(__GNUC__) && __GNUC__ < 4 /* the cast builtins showed up in gcc4. */ -R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si){ +R123_STATIC_INLINE __m128 _mm_castsi128_ps(__m128i si) { return (__m128)si; } #endif #ifdef __cplusplus -struct r123m128i{ +struct r123m128i { __m128i m; #if R123_USE_CXX11_UNRESTRICTED_UNIONS // C++98 forbids a union member from having *any* constructors. @@ -154,110 +153,139 @@ struct r123m128i{ // we can provide a r123m128i constructor with an __m128i argument, and still // have the default (and hence trivial) default constructor. r123m128i() = default; - r123m128i(__m128i _m): m(_m){} + r123m128i(__m128i _m) : m(_m) { + } #endif - r123m128i& operator=(const __m128i& rhs){ m=rhs; return *this;} - r123m128i& operator=(R123_ULONG_LONG n){ m = _mm_set_epi64x(0, n); return *this;} + r123m128i& operator=(const __m128i& rhs) { + m = rhs; + return *this; + } + r123m128i& operator=(R123_ULONG_LONG n) { + m = _mm_set_epi64x(0, n); + return *this; + } #if R123_USE_CXX11_EXPLICIT_CONVERSIONS // With C++0x we can attach explicit to the bool conversion operator // to disambiguate undesired promotions. For g++, this works // only in 4.5 and above. - explicit operator bool() const {return _bool();} + explicit operator bool() const { + return _bool(); + } #else // Pre-C++0x, we have to do something else. Google for the "safe bool" // idiom for other ideas... - operator const void*() const{return _bool()?this:0;} + operator const void*() const { + return _bool() ? 
this : 0; + } #endif - operator __m128i() const {return m;} + operator __m128i() const { + return m; + } -private: + private: #if R123_USE_SSE4_1 - bool _bool() const{ return !_mm_testz_si128(m,m); } + bool _bool() const { + return !_mm_testz_si128(m, m); + } #else - bool _bool() const{ return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); } + bool _bool() const { + return 0xf != _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(m, _mm_setzero_si128()))); + } #endif }; -R123_STATIC_INLINE r123m128i& operator++(r123m128i& v){ +R123_STATIC_INLINE r123m128i& operator++(r123m128i& v) { __m128i& c = v.m; __m128i zeroone = _mm_set_epi64x(R123_64BIT(0), R123_64BIT(1)); c = _mm_add_epi64(c, zeroone); - //return c; +// return c; #if R123_USE_SSE4_1 __m128i zerofff = _mm_set_epi64x(0, ~(R123_64BIT(0))); - if( R123_BUILTIN_EXPECT(_mm_testz_si128(c,zerofff), 0) ){ + if (R123_BUILTIN_EXPECT(_mm_testz_si128(c, zerofff), 0)) { __m128i onezero = _mm_set_epi64x(R123_64BIT(1), R123_64BIT(0)); c = _mm_add_epi64(c, onezero); } #else - unsigned mask = _mm_movemask_ps( _mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128()))); + unsigned mask = _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(c, _mm_setzero_si128()))); // The low two bits of mask are 11 iff the low 64 bits of // c are zero. - if( R123_BUILTIN_EXPECT((mask&0x3) == 0x3, 0) ){ - __m128i onezero = _mm_set_epi64x(1,0); + if (R123_BUILTIN_EXPECT((mask & 0x3) == 0x3, 0)) { + __m128i onezero = _mm_set_epi64x(1, 0); c = _mm_add_epi64(c, onezero); } #endif return v; } -R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n){ +R123_STATIC_INLINE r123m128i& operator+=(r123m128i& lhs, R123_ULONG_LONG n) { __m128i c = lhs.m; __m128i incr128 = _mm_set_epi64x(0, n); c = _mm_add_epi64(c, incr128); - // return c; // NO CARRY! + // return c; // NO CARRY! int64_t lo64 = _mm_extract_lo64(c); - if((uint64_t)lo64 < n) - c = _mm_add_epi64(c, _mm_set_epi64x(1,0)); + if ((uint64_t)lo64 < n) + c = _mm_add_epi64(c, _mm_set_epi64x(1, 0)); lhs.m = c; - return lhs; + return lhs; } // We need this one because it's present, but never used in r123array1xm128i::incr -R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i &){ - throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator<=(R123_ULONG_LONG, const r123m128i&) { + throw std::runtime_error("operator<=(unsigned long long, r123m128i) is unimplemented."); +} -// The comparisons aren't implemented, but if we leave them out, and +// The comparisons aren't implemented, but if we leave them out, and // somebody writes, e.g., M1 < M2, the compiler will do an implicit // conversion through void*. Sigh... 
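A sketch (not from the patch) of the carry rule that operator++ and operator+= above implement for the 128-bit SSE counter, written with two plain uint64_t words: if the low word wraps around (equivalently, ends up smaller than the addend), the high word is bumped.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t lo, hi; } demo_ctr128;

static demo_ctr128 demo_ctr128_add(demo_ctr128 c, uint64_t n) {
    c.lo += n;
    if (c.lo < n)   /* unsigned wraparound => carry into the high word */
        c.hi += 1;
    return c;
}

int main(void) {
    demo_ctr128 c = { UINT64_MAX, 0 };
    c = demo_ctr128_add(c, 1); /* low word wraps to 0, high word becomes 1 */
    printf("%llu %llu\n", (unsigned long long)c.lo, (unsigned long long)c.hi);
    return 0;
}

The SSE version does the same comparison on the low 64 bits extracted with _mm_extract_lo64 and adds _mm_set_epi64x(1, 0) when a carry is detected.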
-R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&){ - throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented.");} -R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&){ - throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented.");} -R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&){ - throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented.");} -R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&){ - throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented.");} +R123_STATIC_INLINE bool operator<(const r123m128i&, const r123m128i&) { + throw std::runtime_error("operator<(r123m128i, r123m128i) is unimplemented."); +} +R123_STATIC_INLINE bool operator<=(const r123m128i&, const r123m128i&) { + throw std::runtime_error("operator<=(r123m128i, r123m128i) is unimplemented."); +} +R123_STATIC_INLINE bool operator>(const r123m128i&, const r123m128i&) { + throw std::runtime_error("operator>(r123m128i, r123m128i) is unimplemented."); +} +R123_STATIC_INLINE bool operator>=(const r123m128i&, const r123m128i&) { + throw std::runtime_error("operator>=(r123m128i, r123m128i) is unimplemented."); +} -R123_STATIC_INLINE bool operator==(const r123m128i &lhs, const r123m128i &rhs){ - return 0xf==_mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); } -R123_STATIC_INLINE bool operator!=(const r123m128i &lhs, const r123m128i &rhs){ - return !(lhs==rhs);} -R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i &rhs){ - r123m128i LHS; LHS.m=_mm_set_epi64x(0, lhs); return LHS == rhs; } -R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i &rhs){ - return !(lhs==rhs);} -R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m){ - union{ +R123_STATIC_INLINE bool operator==(const r123m128i& lhs, const r123m128i& rhs) { + return 0xf == _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpeq_epi32(lhs, rhs))); +} +R123_STATIC_INLINE bool operator!=(const r123m128i& lhs, const r123m128i& rhs) { + return !(lhs == rhs); +} +R123_STATIC_INLINE bool operator==(R123_ULONG_LONG lhs, const r123m128i& rhs) { + r123m128i LHS; + LHS.m = _mm_set_epi64x(0, lhs); + return LHS == rhs; +} +R123_STATIC_INLINE bool operator!=(R123_ULONG_LONG lhs, const r123m128i& rhs) { + return !(lhs == rhs); +} +R123_STATIC_INLINE std::ostream& operator<<(std::ostream& os, const r123m128i& m) { + union { uint64_t u64[2]; __m128i m; - }u; + } u; _mm_storeu_si128(&u.m, m.m); return os << u.u64[0] << " " << u.u64[1]; } -R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m){ +R123_STATIC_INLINE std::istream& operator>>(std::istream& is, r123m128i& m) { uint64_t u64[2]; is >> u64[0] >> u64[1]; m.m = _mm_set_epi64x(u64[1], u64[0]); return is; } -template inline T assemble_from_u32(uint32_t *p32); // forward declaration +template +inline T assemble_from_u32(uint32_t* p32); // forward declaration template <> -inline r123m128i assemble_from_u32(uint32_t *p32){ +inline r123m128i assemble_from_u32(uint32_t* p32) { r123m128i ret; ret.m = _mm_set_epi32(p32[3], p32[2], p32[1], p32[0]); return ret; @@ -265,14 +293,12 @@ inline r123m128i assemble_from_u32(uint32_t *p32){ #else -typedef struct { - __m128i m; -} r123m128i; +typedef struct { __m128i m; } r123m128i; #endif /* __cplusplus */ -#else /* !R123_USE_SSE */ -R123_STATIC_INLINE int haveAESNI(){ +#else /* !R123_USE_SSE */ +R123_STATIC_INLINE int haveAESNI() { return 0; } #endif /* 
R123_USE_SSE */ diff --git a/coreneuron/utils/randoms/Random123/features/sunprofeatures.h b/coreneuron/utils/randoms/Random123/features/sunprofeatures.h index c9cdc00f5..da824119e 100644 --- a/coreneuron/utils/randoms/Random123/features/sunprofeatures.h +++ b/coreneuron/utils/randoms/Random123/features/sunprofeatures.h @@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) expr +#define R123_BUILTIN_EXPECT(expr, likely) expr #endif // The basic idiom is: diff --git a/coreneuron/utils/randoms/Random123/features/xlcfeatures.h b/coreneuron/utils/randoms/Random123/features/xlcfeatures.h index a5c8412a4..f9ee3591d 100644 --- a/coreneuron/utils/randoms/Random123/features/xlcfeatures.h +++ b/coreneuron/utils/randoms/Random123/features/xlcfeatures.h @@ -46,13 +46,13 @@ so as not to confuse it with the version available from LANL. #define __xlcfeatures_dot_hpp #if !defined(__x86_64__) && !defined(__i386__) && !defined(__powerpc__) -# error "This code has only been tested on x86 and PowerPC platforms." +#error "This code has only been tested on x86 and PowerPC platforms." #include { /* maybe an unbalanced brace will terminate the compilation */ - /* Feel free to try the Random123 library on other architectures by changing - the conditions that reach this error, but you should consider it a - porting exercise and expect to encounter bugs and deficiencies. - Please let the authors know of any successes (or failures). */ + /* Feel free to try the Random123 library on other architectures by changing + the conditions that reach this error, but you should consider it a + porting exercise and expect to encounter bugs and deficiencies. + Please let the authors know of any successes (or failures). */ #endif #ifdef __cplusplus @@ -81,7 +81,7 @@ so as not to confuse it with the version available from LANL. #endif #ifndef R123_BUILTIN_EXPECT -#define R123_BUILTIN_EXPECT(expr,likely) __builtin_expect(expr,likely) +#define R123_BUILTIN_EXPECT(expr, likely) __builtin_expect(expr, likely) #endif #ifndef R123_USE_AES_NI diff --git a/coreneuron/utils/randoms/Random123/philox.h b/coreneuron/utils/randoms/Random123/philox.h index 9c87384c1..1390a16cd 100644 --- a/coreneuron/utils/randoms/Random123/philox.h +++ b/coreneuron/utils/randoms/Random123/philox.h @@ -37,7 +37,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "features/compilerfeatures.h" #include "array.h" - /* // Macros _Foo_tpl are code generation 'templates' They define // inline functions with names obtained by mangling Foo and the @@ -58,49 +57,41 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // obviously not guaranteed that all compilers will be that smart, so // other implementations might be preferable, e.g., using an intrinsic // or an asm block. On the other hand, for 32-bit multiplies, -// this *is* perfectly standard C99 - any C99 compiler should +// this *is* perfectly standard C99 - any C99 compiler should // understand it and produce correct code. For 64-bit multiplies, // it's only usable if the compiler recognizes that it can do // arithmetic on a 128-bit type. That happens to be true for gcc on // x86-64, and powerpc64 but not much else. 
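A sketch (not from the patch) of the "double-word" 64-bit mulhilo that the comment above describes, relying on GCC/Clang's __uint128_t on x86-64 or powerpc64. On those targets this typically compiles down to a single widening multiply; the operands below are arbitrary.

#include <stdint.h>
#include <stdio.h>

static uint64_t demo_mulhilo64(uint64_t a, uint64_t b, uint64_t *hip) {
    __uint128_t product = (__uint128_t)a * (__uint128_t)b;
    *hip = (uint64_t)(product >> 64);
    return (uint64_t)product;
}

int main(void) {
    uint64_t hi;
    uint64_t lo = demo_mulhilo64(0x123456789ABCDEF0ull, 0x0FEDCBA987654321ull, &hi);
    printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}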
*/ -#define _mulhilo_dword_tpl(W, Word, Dword) \ -R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ - Dword product = ((Dword)a)*((Dword)b); \ - *hip = product>>W; \ - return (Word)product; \ -} +#define _mulhilo_dword_tpl(W, Word, Dword) \ + R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { \ + Dword product = ((Dword)a) * ((Dword)b); \ + *hip = product >> W; \ + return (Word)product; \ + } /* // A template for mulhilo using gnu-style asm syntax. -// INSN can be "mulw", "mull" or "mulq". +// INSN can be "mulw", "mull" or "mulq". // FIXME - porting to other architectures, we'll need still-more conditional // branching here. Note that intrinsics are usually preferable. */ #ifdef __powerpc__ -#define _mulhilo_asm_tpl(W, Word, INSN) \ -R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ - Word dx = 0; \ - __asm__("\n\t" \ - INSN " %0,%1,%2\n\t" \ - : "=r"(dx) \ - : "r"(b), "r"(ax) \ - ); \ - *hip = dx; \ - return ax*b; \ -} +#define _mulhilo_asm_tpl(W, Word, INSN) \ + R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word* hip) { \ + Word dx = 0; \ + __asm__("\n\t" INSN " %0,%1,%2\n\t" : "=r"(dx) : "r"(b), "r"(ax)); \ + *hip = dx; \ + return ax * b; \ + } #else -#define _mulhilo_asm_tpl(W, Word, INSN) \ -R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ - Word dx; \ - __asm__("\n\t" \ - INSN " %2\n\t" \ - : "=a"(ax), "=d"(dx) \ - : "r"(b), "0"(ax) \ - ); \ - *hip = dx; \ - return ax; \ -} +#define _mulhilo_asm_tpl(W, Word, INSN) \ + R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word* hip) { \ + Word dx; \ + __asm__("\n\t" INSN " %2\n\t" : "=a"(ax), "=d"(dx) : "r"(b), "0"(ax)); \ + *hip = dx; \ + return ax; \ + } #endif /* __powerpc__ */ /* @@ -108,18 +99,18 @@ R123_STATIC_INLINE Word mulhilo##W(Word ax, Word b, Word *hip){ \ // For example,_umul128 is an msvc intrinsic, c.f. // http://msdn.microsoft.com/en-us/library/3dayytw9.aspx */ -#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \ -R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ - return INTRIN(a, b, hip); \ -} +#define _mulhilo_msvc_intrin_tpl(W, Word, INTRIN) \ + R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { \ + return INTRIN(a, b, hip); \ + } /* N.B. This really should be called _mulhilo_mulhi_intrin. It just happens that CUDA was the first time we used the idiom. */ -#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \ -R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ \ - *hip = INTRIN(a, b); \ - return a*b; \ -} +#define _mulhilo_cuda_intrin_tpl(W, Word, INTRIN) \ + R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { \ + *hip = INTRIN(a, b); \ + return a * b; \ + } /* // A template for mulhilo using only word-size operations and @@ -134,42 +125,42 @@ R123_CUDA_DEVICE R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip){ // features/XXfeatures.h headers. It can, of course, be // set with a compile-time -D option. 
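A sketch (not from the patch) of the token-pasting "code generation template" idiom the comments above describe: one macro stamps out an inline mulhilo function per word width, with ## building the mangled name, exactly as the double-word variant does for 32-bit words using a uint64_t product. The DEMO_ macro and values are illustrative.

#include <stdint.h>
#include <stdio.h>

#define DEMO_MULHILO_DWORD(W, Word, Dword)                          \
    static Word demo_mulhilo##W(Word a, Word b, Word *hip) {        \
        Dword product = (Dword)a * (Dword)b;                        \
        *hip = (Word)(product >> W);                                \
        return (Word)product;                                       \
    }

DEMO_MULHILO_DWORD(8, uint8_t, uint16_t)   /* generates demo_mulhilo8  */
DEMO_MULHILO_DWORD(32, uint32_t, uint64_t) /* generates demo_mulhilo32 */

int main(void) {
    uint8_t h8;
    uint8_t l8 = demo_mulhilo8(200u, 100u, &h8);      /* 20000 = 0x4E20 */
    uint32_t h32;
    uint32_t l32 = demo_mulhilo32(0xDEADBEEFu, 0xCAFEBABEu, &h32);
    printf("8-bit:  hi=%02x lo=%02x\n", h8, l8);
    printf("32-bit: hi=%08x lo=%08x\n", h32, l32);
    return 0;
}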
*/ -#define _mulhilo_c99_tpl(W, Word) \ -R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ - const unsigned WHALF = W/2; \ - const Word LOMASK = ((((Word)1)<>WHALF; \ - Word alo = a& LOMASK; \ - Word bhi = b>>WHALF; \ - Word blo = b& LOMASK; \ - \ - Word ahbl = ahi*blo; \ - Word albh = alo*bhi; \ - \ - Word ahbl_albh = ((ahbl&LOMASK) + (albh&LOMASK)); \ - Word hi = ahi*bhi + (ahbl>>WHALF) + (albh>>WHALF); \ - hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \ - /* carry from the sum with alo*blo */ \ - hi += ((lo >> WHALF) < (ahbl_albh&LOMASK)); \ - *hip = hi; \ - return lo; \ -} +#define _mulhilo_c99_tpl(W, Word) \ + R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { \ + const unsigned WHALF = W / 2; \ + const Word LOMASK = ((((Word)1) << WHALF) - 1); \ + Word lo = a * b; /* full low multiply */ \ + Word ahi = a >> WHALF; \ + Word alo = a & LOMASK; \ + Word bhi = b >> WHALF; \ + Word blo = b & LOMASK; \ + \ + Word ahbl = ahi * blo; \ + Word albh = alo * bhi; \ + \ + Word ahbl_albh = ((ahbl & LOMASK) + (albh & LOMASK)); \ + Word hi = ahi * bhi + (ahbl >> WHALF) + (albh >> WHALF); \ + hi += ahbl_albh >> WHALF; /* carry from the sum of lo(ahbl) + lo(albh) ) */ \ + /* carry from the sum with alo*blo */ \ + hi += ((lo >> WHALF) < (ahbl_albh & LOMASK)); \ + *hip = hi; \ + return lo; \ + } /* // A template for mulhilo on a platform that can't do it // We could put a C version here, but is it better to run *VERY* // slowly or to just stop and force the user to find another CBRNG? */ -#define _mulhilo_fail_tpl(W, Word) \ -R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word *hip){ \ - R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \ -} +#define _mulhilo_fail_tpl(W, Word) \ + R123_STATIC_INLINE Word mulhilo##W(Word a, Word b, Word* hip) { \ + R123_STATIC_ASSERT(0, "mulhilo" #W " is not implemented on this machine\n"); \ + } /* // N.B. There's an MSVC intrinsic called _emul, // which *might* compile into better code than -// _mulhilo_dword_tpl +// _mulhilo_dword_tpl */ #if R123_USE_MULHILO32_ASM #ifdef __powerpc__ @@ -184,12 +175,12 @@ _mulhilo_dword_tpl(32, uint32_t, uint64_t) #if R123_USE_PHILOX_64BIT #if R123_USE_MULHILO64_ASM #ifdef __powerpc64__ -_mulhilo_asm_tpl(64, uint64_t, "mulhdu") + _mulhilo_asm_tpl(64, uint64_t, "mulhdu") #else -_mulhilo_asm_tpl(64, uint64_t, "mulq") + _mulhilo_asm_tpl(64, uint64_t, "mulq") #endif /* __powerpc64__ */ #elif R123_USE_MULHILO64_MSVC_INTRIN -_mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128) + _mulhilo_msvc_intrin_tpl(64, uint64_t, _umul128) #elif R123_USE_MULHILO64_CUDA_INTRIN _mulhilo_cuda_intrin_tpl(64, uint64_t, __umul64hi) #elif R123_USE_MULHILO64_OPENCL_INTRIN @@ -208,7 +199,7 @@ _mulhilo_fail_tpl(64, uint64_t) /* // The multipliers and Weyl constants are "hard coded". // To change them, you can #define them with different -// values before #include-ing this file. +// values before #include-ing this file. // This isn't terribly elegant, but it works for C as // well as C++. 
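A sketch (not from the patch) of the half-word decomposition used by _mulhilo_c99_tpl above, specialized to W=64: the 64x64->128 product is assembled from four 32x32->64 partial products, with the two carries that the macro accounts for. Input values are arbitrary.

#include <stdint.h>
#include <stdio.h>

static uint64_t demo_mulhilo64_c99(uint64_t a, uint64_t b, uint64_t *hip) {
    const unsigned WHALF = 32;
    const uint64_t LOMASK = (((uint64_t)1 << WHALF) - 1);
    uint64_t lo = a * b; /* full low multiply */
    uint64_t ahi = a >> WHALF, alo = a & LOMASK;
    uint64_t bhi = b >> WHALF, blo = b & LOMASK;
    uint64_t ahbl = ahi * blo, albh = alo * bhi;
    uint64_t ahbl_albh = (ahbl & LOMASK) + (albh & LOMASK);
    uint64_t hi = ahi * bhi + (ahbl >> WHALF) + (albh >> WHALF);
    hi += ahbl_albh >> WHALF;                     /* carry from lo(ahbl)+lo(albh) */
    hi += ((lo >> WHALF) < (ahbl_albh & LOMASK)); /* carry from adding alo*blo    */
    *hip = hi;
    return lo;
}

int main(void) {
    uint64_t hi;
    uint64_t lo = demo_mulhilo64_c99(0x9E3779B97F4A7C15ull, 0xBB67AE8584CAA73Bull, &hi);
    printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi, (unsigned long long)lo);
    return 0;
}

This path needs only word-size arithmetic, which is why the comment describes it as the fallback when no 128-bit type or intrinsic is available.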
A nice C++-only solution would be to // use template parameters in the style of @@ -237,10 +228,10 @@ _mulhilo_fail_tpl(64, uint64_t) #endif #ifndef PHILOX_W64_0 -#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */ +#define PHILOX_W64_0 R123_64BIT(0x9E3779B97F4A7C15) /* golden ratio */ #endif #ifndef PHILOX_W64_1 -#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */ +#define PHILOX_W64_1 R123_64BIT(0xBB67AE8584CAA73B) /* sqrt(3)-1 */ #endif #ifndef PHILOX_W32_0 @@ -268,91 +259,150 @@ _mulhilo_fail_tpl(64, uint64_t) /* The ignored fourth argument allows us to instantiate the same macro regardless of N. */ -#define _philox2xWround_tpl(W, T) \ -R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key)); \ -R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, struct r123array1x##W key){ \ - T hi; \ - T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \ - struct r123array2x##W out = {{hi^key.v[0]^ctr.v[1], lo}}; \ - return out; \ -} -#define _philox2xWbumpkey_tpl(W) \ -R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( struct r123array1x##W key) { \ - key.v[0] += PHILOX_W##W##_0; \ - return key; \ -} - -#define _philox4xWround_tpl(W, T) \ -R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key)); \ -R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, struct r123array2x##W key){ \ - T hi0; \ - T hi1; \ - T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \ - T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \ - struct r123array4x##W out = {{hi1^ctr.v[1]^key.v[0], lo1, \ - hi0^ctr.v[3]^key.v[1], lo0}}; \ - return out; \ -} - -#define _philox4xWbumpkey_tpl(W) \ -R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( struct r123array2x##W key) { \ - key.v[0] += PHILOX_W##W##_0; \ - key.v[1] += PHILOX_W##W##_1; \ - return key; \ -} - -#define _philoxNxW_tpl(N, Nhalf, W, T) \ -/** @ingroup PhiloxNxW */ \ -enum r123_enum_philox##N##x##W { philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS }; \ -typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \ -typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \ -typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \ -R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit(philox##N##x##W##_ukey_t uk) { return uk; } \ -R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key)); \ -R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \ - R123_ASSERT(R<=16); \ - if(R>0){ ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>1){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>2){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>3){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>4){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>5){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>6){ key = 
_philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>7){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>8){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>9){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>10){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>11){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>12){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>13){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>14){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - if(R>15){ key = _philox##N##x##W##bumpkey(key); ctr = _philox##N##x##W##round(ctr, key); } \ - return ctr; \ -} - -_philox2xWbumpkey_tpl(32) -_philox4xWbumpkey_tpl(32) -_philox2xWround_tpl(32, uint32_t) /* philo2x32round */ -_philox4xWround_tpl(32, uint32_t) /* philo4x32round */ -/** \endcond */ -_philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */ -_philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */ +#define _philox2xWround_tpl(W, T) \ + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE( \ + struct r123array2x##W _philox2x##W##round(struct r123array2x##W ctr, \ + struct r123array1x##W key)); \ + R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox2x##W##round( \ + struct r123array2x##W ctr, struct r123array1x##W key) { \ + T hi; \ + T lo = mulhilo##W(PHILOX_M2x##W##_0, ctr.v[0], &hi); \ + struct r123array2x##W out = {{hi ^ key.v[0] ^ ctr.v[1], lo}}; \ + return out; \ + } +#define _philox2xWbumpkey_tpl(W) \ + R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array1x##W _philox2x##W##bumpkey( \ + struct r123array1x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + return key; \ + } + +#define _philox4xWround_tpl(W, T) \ + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE( \ + struct r123array4x##W _philox4x##W##round(struct r123array4x##W ctr, \ + struct r123array2x##W key)); \ + R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array4x##W _philox4x##W##round( \ + struct r123array4x##W ctr, struct r123array2x##W key) { \ + T hi0; \ + T hi1; \ + T lo0 = mulhilo##W(PHILOX_M4x##W##_0, ctr.v[0], &hi0); \ + T lo1 = mulhilo##W(PHILOX_M4x##W##_1, ctr.v[2], &hi1); \ + struct r123array4x##W out = { \ + {hi1 ^ ctr.v[1] ^ key.v[0], lo1, hi0 ^ ctr.v[3] ^ key.v[1], lo0}}; \ + return out; \ + } + +#define _philox4xWbumpkey_tpl(W) \ + R123_CUDA_DEVICE R123_STATIC_INLINE struct r123array2x##W _philox4x##W##bumpkey( \ + struct r123array2x##W key) { \ + key.v[0] += PHILOX_W##W##_0; \ + key.v[1] += PHILOX_W##W##_1; \ + return key; \ + } + +#define _philoxNxW_tpl(N, Nhalf, W, T) \ + /** @ingroup PhiloxNxW */ \ + enum r123_enum_philox##N##x##W{philox##N##x##W##_rounds = PHILOX##N##x##W##_DEFAULT_ROUNDS}; \ + typedef struct r123array##N##x##W philox##N##x##W##_ctr_t; \ + typedef struct r123array##Nhalf##x##W philox##N##x##W##_key_t; \ + typedef struct r123array##Nhalf##x##W philox##N##x##W##_ukey_t; \ + R123_CUDA_DEVICE R123_STATIC_INLINE philox##N##x##W##_key_t philox##N##x##W##keyinit( \ + philox##N##x##W##_ukey_t uk) { \ + return uk; \ + } \ + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE( \ + philox##N##x##W##_ctr_t philox##N##x##W##_R(unsigned int R, philox##N##x##W##_ctr_t ctr, \ + philox##N##x##W##_key_t key)); \ + R123_CUDA_DEVICE 
R123_STATIC_INLINE philox##N##x##W##_ctr_t philox##N##x##W##_R( \ + unsigned int R, philox##N##x##W##_ctr_t ctr, philox##N##x##W##_key_t key) { \ + R123_ASSERT(R <= 16); \ + if (R > 0) { \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 1) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 2) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 3) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 4) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 5) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 6) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 7) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 8) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 9) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 10) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 11) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 12) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 13) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 14) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + if (R > 15) { \ + key = _philox##N##x##W##bumpkey(key); \ + ctr = _philox##N##x##W##round(ctr, key); \ + } \ + return ctr; \ + } + + _philox2xWbumpkey_tpl(32) _philox4xWbumpkey_tpl(32) + _philox2xWround_tpl(32, uint32_t) /* philo2x32round */ + _philox4xWround_tpl(32, uint32_t) /* philo4x32round */ + /** \endcond */ + _philoxNxW_tpl(2, 1, 32, uint32_t) /* philox2x32bijection */ + _philoxNxW_tpl(4, 2, 32, uint32_t) /* philox4x32bijection */ #if R123_USE_PHILOX_64BIT -/** \cond HIDDEN_FROM_DOXYGEN */ -_philox2xWbumpkey_tpl(64) -_philox4xWbumpkey_tpl(64) -_philox2xWround_tpl(64, uint64_t) /* philo2x64round */ -_philox4xWround_tpl(64, uint64_t) /* philo4x64round */ -/** \endcond */ -_philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */ -_philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */ -#endif /* R123_USE_PHILOX_64BIT */ - -#define philox2x32(c,k) philox2x32_R(philox2x32_rounds, c, k) -#define philox4x32(c,k) philox4x32_R(philox4x32_rounds, c, k) + /** \cond HIDDEN_FROM_DOXYGEN */ + _philox2xWbumpkey_tpl(64) _philox4xWbumpkey_tpl(64) + _philox2xWround_tpl(64, uint64_t) /* philo2x64round */ + _philox4xWround_tpl(64, uint64_t) /* philo4x64round */ + /** \endcond */ + _philoxNxW_tpl(2, 1, 64, uint64_t) /* philox2x64bijection */ + _philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */ +#endif /* R123_USE_PHILOX_64BIT */ + +#define philox2x32(c, k) philox2x32_R(philox2x32_rounds, c, k) +#define philox4x32(c, k) philox4x32_R(philox4x32_rounds, c, k) #if R123_USE_PHILOX_64BIT -#define philox2x64(c,k) philox2x64_R(philox2x64_rounds, c, k) -#define philox4x64(c,k) philox4x64_R(philox4x64_rounds, c, k) +#define philox2x64(c, k) philox2x64_R(philox2x64_rounds, c, k) +#define philox4x64(c, k) philox4x64_R(philox4x64_rounds, c, k) 
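A sketch (not from the patch) of the round structure that the unrolled _philoxNxW_tpl above performs, written for the 2x32 case as a plain loop: one multiply-high/low, an xor of the high half with key and counter words, and a Weyl-constant "bump" of the key between rounds. The DEMO_ multiplier and Weyl constants are placeholders; in the header they come from the PHILOX_M2x32_0 and PHILOX_W32_0 macros.

#include <stdint.h>
#include <stdio.h>

#define DEMO_M2x32_0 0xD256D193u /* illustrative */
#define DEMO_W32_0   0x9E3779B9u /* illustrative */

typedef struct { uint32_t v[2]; } demo_ctr2x32;
typedef struct { uint32_t v[1]; } demo_key1x32;

static demo_ctr2x32 demo_round(demo_ctr2x32 ctr, demo_key1x32 key) {
    uint64_t product = (uint64_t)DEMO_M2x32_0 * ctr.v[0];
    uint32_t hi = (uint32_t)(product >> 32), lo = (uint32_t)product;
    demo_ctr2x32 out = {{ hi ^ key.v[0] ^ ctr.v[1], lo }};
    return out;
}

static demo_key1x32 demo_bumpkey(demo_key1x32 key) {
    key.v[0] += DEMO_W32_0; /* Weyl increment of the key each round */
    return key;
}

/* The if (R > n) chain in the macro is equivalent to this loop. */
static demo_ctr2x32 demo_philox2x32_R(unsigned int R, demo_ctr2x32 ctr, demo_key1x32 key) {
    unsigned int r;
    for (r = 0; r < R; r++) {
        if (r > 0)
            key = demo_bumpkey(key);
        ctr = demo_round(ctr, key);
    }
    return ctr;
}

int main(void) {
    demo_ctr2x32 c = {{0u, 1u}};
    demo_key1x32 k = {{42u}};
    c = demo_philox2x32_R(10, c, k);
    printf("%08x %08x\n", c.v[0], c.v[1]);
    return 0;
}

The unrolled form in the header exists so that a compile-time ROUNDS argument lets the compiler drop the untaken branches entirely.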
#endif /* R123_USE_PHILOX_64BIT */ #ifdef __cplusplus @@ -360,49 +410,53 @@ _philoxNxW_tpl(4, 2, 64, uint64_t) /* philox4x64bijection */ /** \cond HIDDEN_FROM_DOXYGEN */ -#define _PhiloxNxW_base_tpl(CType, KType, N, W) \ -namespace r123{ \ -template \ -struct Philox##N##x##W##_R{ \ - typedef CType ctr_type; \ - typedef KType key_type; \ - typedef KType ukey_type; \ - static const unsigned int rounds=ROUNDS; \ - inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key) const){ \ - R123_STATIC_ASSERT(ROUNDS<=16, "philox is only unrolled up to 16 rounds\n"); \ - return philox##N##x##W##_R(ROUNDS, ctr, key); \ - } \ -}; \ -typedef Philox##N##x##W##_R Philox##N##x##W; \ - } // namespace r123 -/** \endcond */ - -_PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R -_PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R +#define _PhiloxNxW_base_tpl(CType, KType, N, W) \ + namespace r123 { \ + template \ + struct Philox##N##x##W##_R { \ + typedef CType ctr_type; \ + typedef KType key_type; \ + typedef KType ukey_type; \ + static const unsigned int rounds = ROUNDS; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, \ + key_type key) const) { \ + R123_STATIC_ASSERT(ROUNDS <= 16, "philox is only unrolled up to 16 rounds\n"); \ + return philox##N##x##W##_R(ROUNDS, ctr, key); \ + } \ + }; \ + typedef Philox##N##x##W##_R Philox##N##x##W; \ + } // namespace r123 + /** \endcond */ + + _PhiloxNxW_base_tpl(r123array2x32, r123array1x32, 2, 32) // Philox2x32_R + _PhiloxNxW_base_tpl(r123array4x32, r123array2x32, 4, 32) // Philox4x32_R #if R123_USE_PHILOX_64BIT -_PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R -_PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R + _PhiloxNxW_base_tpl(r123array2x64, r123array1x64, 2, 64) // Philox2x64_R + _PhiloxNxW_base_tpl(r123array4x64, r123array2x64, 4, 64) // Philox4x64_R #endif /* The _tpl macros don't quite work to do string-pasting inside comments. so we just write out the boilerplate documentation four times... */ -/** +/** @defgroup PhiloxNxW Philox Classes and Typedefs The PhiloxNxW classes export the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. -As described in -Parallel Random Numbers: As Easy as 1, 2, 3 . -The Philox family of counter-based RNGs use integer multiplication, xor and permutation of W-bit words +As described in +Parallel Random Numbers: As Easy as 1, 2, +3 . +The Philox family of counter-based RNGs use integer multiplication, xor and permutation of W-bit +words to scramble its N-word input key. Philox is a mnemonic for Product HI LO Xor). -@class r123::Philox2x32_R +@class r123::Philox2x32_R @ingroup PhiloxNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Philox round function will be applied. @@ -415,14 +469,15 @@ ROUNDS=6 or more for Philox2x32. Philox2x32 is equivalent to Philox2x32_R<10>. With 10 rounds, Philox2x32 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. -@class r123::Philox2x64_R +@class r123::Philox2x64_R @ingroup PhiloxNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. 
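A usage sketch (not part of this patch) of the C front end generated above, showing the ROUNDS parameter that the documentation discusses. The include path is an assumption; in this tree the header lives under coreneuron/utils/randoms/Random123/. The typedefs and entry points are the ones defined by _philoxNxW_tpl and the philox4x32 macro in the diff.

#include <stdio.h>
#include "Random123/philox.h" /* assumed include path */

int main(void) {
    philox4x32_key_t key = {{12345u, 0u}};
    philox4x32_ctr_t ctr = {{0u, 0u, 0u, 0u}};
    ctr.v[0] = 7u;                                   /* counter-based: index in, randoms out */
    philox4x32_ctr_t a = philox4x32(ctr, key);       /* default number of rounds */
    philox4x32_ctr_t b = philox4x32_R(10, ctr, key); /* explicit ROUNDS */
    /* Philox4x32 is equivalent to Philox4x32_R<10>, so a and b match here. */
    printf("%08x %08x %08x %08x\n", a.v[0], a.v[1], a.v[2], a.v[3]);
    printf("%08x %08x %08x %08x\n", b.v[0], b.v[1], b.v[2], b.v[3]);
    return 0;
}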
+exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Philox round function will be applied. @@ -435,14 +490,15 @@ ROUNDS=6 or more for Philox2x64. Philox2x64 is equivalent to Philox2x64_R<10>. With 10 rounds, Philox2x64 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. -@class r123::Philox4x32_R +@class r123::Philox4x32_R @ingroup PhiloxNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Philox round function will be applied. @@ -458,14 +514,15 @@ ROUNDS=8 or more for Philox4x32. Philox4x32 is equivalent to Philox4x32_R<10>. With 10 rounds, Philox4x32 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. -@class r123::Philox4x64_R +@class r123::Philox4x64_R @ingroup PhiloxNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Philox round function will be applied. @@ -478,7 +535,7 @@ ROUNDS=7 or more for Philox4x64. Philox4x64 is equivalent to Philox4x64_R<10>. With 10 rounds, Philox4x64 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. */ #endif /* __cplusplus */ diff --git a/coreneuron/utils/randoms/Random123/threefry.h b/coreneuron/utils/randoms/Random123/threefry.h index da2de979c..c9ffe5629 100644 --- a/coreneuron/utils/randoms/Random123/threefry.h +++ b/coreneuron/utils/randoms/Random123/threefry.h @@ -66,14 +66,22 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. enum r123_enum_threefry64x4 { /* These are the R_256 constants from the Threefish reference sources with names changed to R_64x4... */ - R_64x4_0_0=14, R_64x4_0_1=16, - R_64x4_1_0=52, R_64x4_1_1=57, - R_64x4_2_0=23, R_64x4_2_1=40, - R_64x4_3_0= 5, R_64x4_3_1=37, - R_64x4_4_0=25, R_64x4_4_1=33, - R_64x4_5_0=46, R_64x4_5_1=12, - R_64x4_6_0=58, R_64x4_6_1=22, - R_64x4_7_0=32, R_64x4_7_1=32 + R_64x4_0_0 = 14, + R_64x4_0_1 = 16, + R_64x4_1_0 = 52, + R_64x4_1_1 = 57, + R_64x4_2_0 = 23, + R_64x4_2_1 = 40, + R_64x4_3_0 = 5, + R_64x4_3_1 = 37, + R_64x4_4_0 = 25, + R_64x4_4_1 = 33, + R_64x4_5_0 = 46, + R_64x4_5_1 = 12, + R_64x4_6_0 = 58, + R_64x4_6_1 = 22, + R_64x4_7_0 = 32, + R_64x4_7_1 = 32 }; enum r123_enum_threefry64x2 { @@ -81,16 +89,16 @@ enum r123_enum_threefry64x2 { // Output from skein_rot_search: (srs64_B64-X1000) // Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57 // Start: Tue Mar 1 10:07:48 2011 - // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format + // rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. 
blkSize= 128].format */ - R_64x2_0_0=16, - R_64x2_1_0=42, - R_64x2_2_0=12, - R_64x2_3_0=31, - R_64x2_4_0=16, - R_64x2_5_0=32, - R_64x2_6_0=24, - R_64x2_7_0=21 + R_64x2_0_0 = 16, + R_64x2_1_0 = 42, + R_64x2_2_0 = 12, + R_64x2_3_0 = 31, + R_64x2_4_0 = 16, + R_64x2_5_0 = 32, + R_64x2_6_0 = 24, + R_64x2_7_0 = 21 /* 4 rounds: minHW = 4 [ 4 4 4 4 ] // 5 rounds: minHW = 8 [ 8 8 8 8 ] // 6 rounds: minHW = 16 [ 16 16 16 16 ] @@ -107,14 +115,22 @@ enum r123_enum_threefry32x4 { // Start: Mon Aug 24 22:41:36 2009 // ... // rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format */ - R_32x4_0_0=10, R_32x4_0_1=26, - R_32x4_1_0=11, R_32x4_1_1=21, - R_32x4_2_0=13, R_32x4_2_1=27, - R_32x4_3_0=23, R_32x4_3_1= 5, - R_32x4_4_0= 6, R_32x4_4_1=20, - R_32x4_5_0=17, R_32x4_5_1=11, - R_32x4_6_0=25, R_32x4_6_1=10, - R_32x4_7_0=18, R_32x4_7_1=20 + R_32x4_0_0 = 10, + R_32x4_0_1 = 26, + R_32x4_1_0 = 11, + R_32x4_1_1 = 21, + R_32x4_2_0 = 13, + R_32x4_2_1 = 27, + R_32x4_3_0 = 23, + R_32x4_3_1 = 5, + R_32x4_4_0 = 6, + R_32x4_4_1 = 20, + R_32x4_5_0 = 17, + R_32x4_5_1 = 11, + R_32x4_6_0 = 25, + R_32x4_6_1 = 10, + R_32x4_7_0 = 18, + R_32x4_7_1 = 20 /* 4 rounds: minHW = 3 [ 3 3 3 3 ] // 5 rounds: minHW = 7 [ 7 7 7 7 ] @@ -132,14 +148,14 @@ enum r123_enum_threefry32x2 { // Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28 // Start: Tue Jul 12 11:11:33 2011 // rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format */ - R_32x2_0_0=13, - R_32x2_1_0=15, - R_32x2_2_0=26, - R_32x2_3_0= 6, - R_32x2_4_0=17, - R_32x2_5_0=29, - R_32x2_6_0=16, - R_32x2_7_0=24 + R_32x2_0_0 = 13, + R_32x2_1_0 = 15, + R_32x2_2_0 = 26, + R_32x2_3_0 = 6, + R_32x2_4_0 = 17, + R_32x2_5_0 = 29, + R_32x2_6_0 = 16, + R_32x2_7_0 = 24 /* 4 rounds: minHW = 4 [ 4 4 4 4 ] // 5 rounds: minHW = 6 [ 6 8 6 8 ] @@ -149,27 +165,22 @@ enum r123_enum_threefry32x2 { // 9 rounds: minHW = 32 [ 32 32 32 32 ] //10 rounds: minHW = 32 [ 32 32 32 32 ] //11 rounds: minHW = 32 [ 32 32 32 32 ] */ - }; - -enum r123_enum_threefry_wcnt { - WCNT2=2, - WCNT4=4 }; + +enum r123_enum_threefry_wcnt { WCNT2 = 2, WCNT4 = 4 }; R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint64_t RotL_64(uint64_t x, unsigned int N)); -R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) -{ - return (x << (N & 63)) | (x >> ((64-N) & 63)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint64_t RotL_64(uint64_t x, unsigned int N) { + return (x << (N & 63)) | (x >> ((64 - N) & 63)); } - + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(uint32_t RotL_32(uint32_t x, unsigned int N)); -R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) -{ - return (x << (N & 31)) | (x >> ((32-N) & 31)); +R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) { + return (x << (N & 31)) | (x >> ((32 - N) & 31)); } -#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) -#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) -#define SKEIN_KS_PARITY32 0x1BD11BDA +#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32)) +#define SKEIN_KS_PARITY64 SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22) +#define SKEIN_KS_PARITY32 0x1BD11BDA #ifndef THREEFRY2x32_DEFAULT_ROUNDS #define THREEFRY2x32_DEFAULT_ROUNDS 20 @@ -187,600 +198,1088 @@ R123_CUDA_DEVICE R123_STATIC_INLINE uint32_t RotL_32(uint32_t x, unsigned int N) #define THREEFRY4x64_DEFAULT_ROUNDS 20 #endif -#define _threefry2x_tpl(W) \ -typedef struct r123array2x##W threefry2x##W##_ctr_t; \ -typedef 
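A sketch (not from the patch) of the left-rotation helper that the Threefry rounds rely on, plus one add/rotate/xor step using the R_32x2_0_0 = 13 rotation constant from the enum above. The test words are arbitrary.

#include <stdint.h>
#include <stdio.h>

static uint32_t demo_rotl32(uint32_t x, unsigned int n) {
    return (x << (n & 31)) | (x >> ((32 - n) & 31));
}

int main(void) {
    uint32_t x0 = 0x243F6A88u, x1 = 0x85A308D3u; /* arbitrary state words */
    /* One Threefry-style mix step: add, rotate by the round constant, xor. */
    x0 += x1;
    x1 = demo_rotl32(x1, 13); /* R_32x2_0_0 */
    x1 ^= x0;
    printf("%08x %08x\n", x0, x1);
    return 0;
}

Masking the shift counts with 31 (or 63 in RotL_64) keeps the rotation well defined even when the constant is 0 or a multiple of the word width.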
struct r123array2x##W threefry2x##W##_key_t; \ -typedef struct r123array2x##W threefry2x##W##_ukey_t; \ -R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit(threefry2x##W##_ukey_t uk) { return uk; } \ -R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ -R123_CUDA_DEVICE R123_STATIC_INLINE \ -threefry2x##W##_ctr_t threefry2x##W##_R(unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ - threefry2x##W##_ctr_t X; \ - uint##W##_t ks[2+1]; \ - int i; /* avoid size_t to avoid need for stddef.h */ \ - R123_ASSERT(Nrounds<=32); \ - ks[2] = SKEIN_KS_PARITY##W; \ - for (i=0;i < 2; i++) \ - { \ - ks[i] = k.v[i]; \ - X.v[i] = in.v[i]; \ - ks[2] ^= k.v[i]; \ - } \ - \ - /* Insert initial key before round 0 */ \ - X.v[0] += ks[0]; X.v[1] += ks[1]; \ - \ - if(Nrounds>0){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>1){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>2){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>3){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>3){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[1]; X.v[1] += ks[2]; \ - X.v[1] += 1; /* X.v[2-1] += r */ \ - } \ - if(Nrounds>4){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>5){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>6){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>7){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>7){ \ - /* InjectKey(r=2) */ \ - X.v[0] += ks[2]; X.v[1] += ks[0]; \ - X.v[1] += 2; \ - } \ - if(Nrounds>8){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>9){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>10){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>11){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>11){ \ - /* InjectKey(r=3) */ \ - X.v[0] += ks[0]; X.v[1] += ks[1]; \ - X.v[1] += 3; \ - } \ - if(Nrounds>12){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>13){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>14){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>15){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>15){ \ - /* InjectKey(r=4) */ \ - X.v[0] += ks[1]; X.v[1] += ks[2]; \ - X.v[1] += 4; \ - } \ - if(Nrounds>16){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>17){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>18){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>19){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>19){ \ - /* InjectKey(r=5) */ \ - X.v[0] += ks[2]; X.v[1] += ks[0]; \ - X.v[1] += 5; \ - } \ - if(Nrounds>20){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ - 
if(Nrounds>21){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>22){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>23){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>23){ \ - /* InjectKey(r=6) */ \ - X.v[0] += ks[0]; X.v[1] += ks[1]; \ - X.v[1] += 6; \ - } \ - if(Nrounds>24){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_0_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>25){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_1_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>26){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_2_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>27){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_3_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>27){ \ - /* InjectKey(r=7) */ \ - X.v[0] += ks[1]; X.v[1] += ks[2]; \ - X.v[1] += 7; \ - } \ - if(Nrounds>28){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_4_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>29){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_5_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>30){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_6_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>31){ X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x2_7_0); X.v[1] ^= X.v[0]; } \ - if(Nrounds>31){ \ - /* InjectKey(r=8) */ \ - X.v[0] += ks[2]; X.v[1] += ks[0]; \ - X.v[1] += 8; \ - } \ - return X; \ -} \ - /** @ingroup ThreefryNxW */ \ -enum r123_enum_threefry2x##W { threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS }; \ -R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ -R123_CUDA_DEVICE R123_STATIC_INLINE \ -threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k){ \ - return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \ -} +#define _threefry2x_tpl(W) \ + typedef struct r123array2x##W threefry2x##W##_ctr_t; \ + typedef struct r123array2x##W threefry2x##W##_key_t; \ + typedef struct r123array2x##W threefry2x##W##_ukey_t; \ + R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_key_t threefry2x##W##keyinit( \ + threefry2x##W##_ukey_t uk) { \ + return uk; \ + } \ + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry2x##W##_ctr_t threefry2x##W##_R( \ + unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ + R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_ctr_t threefry2x##W##_R( \ + unsigned int Nrounds, threefry2x##W##_ctr_t in, threefry2x##W##_key_t k) { \ + threefry2x##W##_ctr_t X; \ + uint##W##_t ks[2 + 1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds <= 32); \ + ks[2] = SKEIN_KS_PARITY##W; \ + for (i = 0; i < 2; i++) { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[2] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; \ + X.v[1] += ks[1]; \ + \ + if (Nrounds > 0) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_0_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 1) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_1_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 2) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_2_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 3) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_3_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 3) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; \ + X.v[1] += ks[2]; \ + X.v[1] += 1; /* X.v[2-1] += r */ \ + } \ + if 
(Nrounds > 4) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_4_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 5) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_5_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 6) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_6_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 7) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_7_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 7) { \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; \ + X.v[1] += ks[0]; \ + X.v[1] += 2; \ + } \ + if (Nrounds > 8) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_0_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 9) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_1_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 10) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_2_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 11) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_3_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 11) { \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[0]; \ + X.v[1] += ks[1]; \ + X.v[1] += 3; \ + } \ + if (Nrounds > 12) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_4_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 13) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_5_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 14) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_6_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 15) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_7_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 15) { \ + /* InjectKey(r=4) */ \ + X.v[0] += ks[1]; \ + X.v[1] += ks[2]; \ + X.v[1] += 4; \ + } \ + if (Nrounds > 16) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_0_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 17) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_1_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 18) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_2_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 19) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_3_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 19) { \ + /* InjectKey(r=5) */ \ + X.v[0] += ks[2]; \ + X.v[1] += ks[0]; \ + X.v[1] += 5; \ + } \ + if (Nrounds > 20) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_4_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 21) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_5_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 22) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_6_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 23) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_7_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 23) { \ + /* InjectKey(r=6) */ \ + X.v[0] += ks[0]; \ + X.v[1] += ks[1]; \ + X.v[1] += 6; \ + } \ + if (Nrounds > 24) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_0_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 25) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_1_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 26) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_2_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 27) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_3_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 27) { \ + /* InjectKey(r=7) */ \ + X.v[0] += ks[1]; \ + X.v[1] += ks[2]; \ + X.v[1] += 7; \ + } \ + 
if (Nrounds > 28) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_4_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 29) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_5_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 30) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_6_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 31) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x2_7_0); \ + X.v[1] ^= X.v[0]; \ + } \ + if (Nrounds > 31) { \ + /* InjectKey(r=8) */ \ + X.v[0] += ks[2]; \ + X.v[1] += ks[0]; \ + X.v[1] += 8; \ + } \ + return X; \ + } \ + /** @ingroup ThreefryNxW */ \ + enum r123_enum_threefry2x##W{threefry2x##W##_rounds = THREEFRY2x##W##_DEFAULT_ROUNDS}; \ + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE( \ + threefry2x##W##_ctr_t threefry2x##W(threefry2x##W##_ctr_t in, threefry2x##W##_key_t k)); \ + R123_CUDA_DEVICE R123_STATIC_INLINE threefry2x##W##_ctr_t threefry2x##W( \ + threefry2x##W##_ctr_t in, threefry2x##W##_key_t k) { \ + return threefry2x##W##_R(threefry2x##W##_rounds, in, k); \ + } - -#define _threefry4x_tpl(W) \ -typedef struct r123array4x##W threefry4x##W##_ctr_t; \ -typedef struct r123array4x##W threefry4x##W##_key_t; \ -typedef struct r123array4x##W threefry4x##W##_ukey_t; \ -R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit(threefry4x##W##_ukey_t uk) { return uk; } \ -R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ -R123_CUDA_DEVICE R123_STATIC_INLINE \ -threefry4x##W##_ctr_t threefry4x##W##_R(unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ - threefry4x##W##_ctr_t X; \ - uint##W##_t ks[4+1]; \ - int i; /* avoid size_t to avoid need for stddef.h */ \ - R123_ASSERT(Nrounds<=72); \ - ks[4] = SKEIN_KS_PARITY##W; \ - for (i=0;i < 4; i++) \ - { \ - ks[i] = k.v[i]; \ - X.v[i] = in.v[i]; \ - ks[4] ^= k.v[i]; \ - } \ - \ - /* Insert initial key before round 0 */ \ - X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ - \ - if(Nrounds>0){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>1){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>2){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>3){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>3){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ - X.v[4-1] += 1; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>4){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>5){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>6){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; 
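A sketch (not from the patch) of the extended-key setup that _threefry2x_tpl performs before round 0: the last key word starts from the Skein parity constant and is XORed with every key word, and the initial key is then added into the state. The 0x1BD11BDA value mirrors SKEIN_KS_PARITY32 from the diff; the key and counter words are arbitrary.

#include <stdint.h>
#include <stdio.h>

#define DEMO_SKEIN_KS_PARITY32 0x1BD11BDAu

int main(void) {
    uint32_t k[2]  = { 0xDEADBEEFu, 0x12345678u }; /* arbitrary key words   */
    uint32_t in[2] = { 0u, 1u };                   /* counter words         */
    uint32_t ks[3], X[2];
    int i;
    ks[2] = DEMO_SKEIN_KS_PARITY32;
    for (i = 0; i < 2; i++) {
        ks[i] = k[i];
        X[i] = in[i];
        ks[2] ^= k[i]; /* parity word accumulates all key words */
    }
    X[0] += ks[0]; /* key injection before round 0 */
    X[1] += ks[1];
    printf("ks[2]=%08x X=%08x %08x\n", ks[2], X[0], X[1]);
    return 0;
}

The later InjectKey(r=n) blocks in the macro rotate through ks[0..2] (or ks[0..4] in the 4x template) and add the round index, which is the rest of the Threefish-style key schedule.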
X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>7){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>7){ \ - /* InjectKey(r=2) */ \ - X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ - X.v[4-1] += 2; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>8){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>9){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>10){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>11){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>11){ \ - /* InjectKey(r=3) */ \ - X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ - X.v[4-1] += 3; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>12){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>13){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>14){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>15){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>15){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ - X.v[4-1] += 4; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>16){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>17){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>18){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>19){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>19){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ - X.v[4-1] += 5; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>20){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>21){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; 
\ - } \ - if(Nrounds>22){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>23){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>23){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ - X.v[4-1] += 6; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>24){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>25){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>26){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>27){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>27){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ - X.v[4-1] += 7; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>28){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>29){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>30){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>31){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>31){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ - X.v[4-1] += 8; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>32){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>33){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>34){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>35){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>35){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ - X.v[4-1] += 9; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>36){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>37){ \ - X.v[0] += X.v[3]; X.v[3] 
= RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>38){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>39){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>39){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ - X.v[4-1] += 10; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>40){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>41){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>42){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>43){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>43){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ - X.v[4-1] += 11; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>44){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>45){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>46){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>47){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>47){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ - X.v[4-1] += 12; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>48){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>49){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>50){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>51){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>51){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ - X.v[4-1] += 13; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>52){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= 
X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>53){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>54){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>55){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>55){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[4]; X.v[1] += ks[0]; X.v[2] += ks[1]; X.v[3] += ks[2]; \ - X.v[4-1] += 14; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>56){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>57){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>58){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>59){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>59){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[0]; X.v[1] += ks[1]; X.v[2] += ks[2]; X.v[3] += ks[3]; \ - X.v[4-1] += 15; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>60){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>61){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>62){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>63){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>63){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[1]; X.v[1] += ks[2]; X.v[2] += ks[3]; X.v[3] += ks[4]; \ - X.v[4-1] += 16; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>64){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_0_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_0_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>65){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_1_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_1_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>66){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_2_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_2_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>67){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_3_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_3_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>67){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[2]; X.v[1] += ks[3]; X.v[2] += ks[4]; X.v[3] += ks[0]; \ - 
X.v[4-1] += 17; /* X.v[WCNT4-1] += r */ \ - } \ - \ - if(Nrounds>68){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_4_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_4_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>69){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_5_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_5_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>70){ \ - X.v[0] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_6_0); X.v[1] ^= X.v[0]; \ - X.v[2] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_6_1); X.v[3] ^= X.v[2]; \ - } \ - if(Nrounds>71){ \ - X.v[0] += X.v[3]; X.v[3] = RotL_##W(X.v[3],R_##W##x4_7_0); X.v[3] ^= X.v[0]; \ - X.v[2] += X.v[1]; X.v[1] = RotL_##W(X.v[1],R_##W##x4_7_1); X.v[1] ^= X.v[2]; \ - } \ - if(Nrounds>71){ \ - /* InjectKey(r=1) */ \ - X.v[0] += ks[3]; X.v[1] += ks[4]; X.v[2] += ks[0]; X.v[3] += ks[1]; \ - X.v[4-1] += 18; /* X.v[WCNT4-1] += r */ \ - } \ - \ - return X; \ -} \ - /** @ingroup ThreefryNxW */ \ -enum r123_enum_threefry4x##W { threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS }; \ -R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ -R123_CUDA_DEVICE R123_STATIC_INLINE \ -threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t in, threefry4x##W##_key_t k){ \ - return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ -} +#define _threefry4x_tpl(W) \ + typedef struct r123array4x##W threefry4x##W##_ctr_t; \ + typedef struct r123array4x##W threefry4x##W##_key_t; \ + typedef struct r123array4x##W threefry4x##W##_ukey_t; \ + R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_key_t threefry4x##W##keyinit( \ + threefry4x##W##_ukey_t uk) { \ + return uk; \ + } \ + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE(threefry4x##W##_ctr_t threefry4x##W##_R( \ + unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k)); \ + R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_ctr_t threefry4x##W##_R( \ + unsigned int Nrounds, threefry4x##W##_ctr_t in, threefry4x##W##_key_t k) { \ + threefry4x##W##_ctr_t X; \ + uint##W##_t ks[4 + 1]; \ + int i; /* avoid size_t to avoid need for stddef.h */ \ + R123_ASSERT(Nrounds <= 72); \ + ks[4] = SKEIN_KS_PARITY##W; \ + for (i = 0; i < 4; i++) { \ + ks[i] = k.v[i]; \ + X.v[i] = in.v[i]; \ + ks[4] ^= k.v[i]; \ + } \ + \ + /* Insert initial key before round 0 */ \ + X.v[0] += ks[0]; \ + X.v[1] += ks[1]; \ + X.v[2] += ks[2]; \ + X.v[3] += ks[3]; \ + \ + if (Nrounds > 0) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 1) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 2) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 3) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 3) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; \ + X.v[1] += ks[2]; \ + X.v[2] += ks[3]; \ + X.v[3] += ks[4]; \ + X.v[4 - 1] += 1; /* X.v[WCNT4-1] 
+= r */ \ + } \ + \ + if (Nrounds > 4) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 5) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 6) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 7) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 7) { \ + /* InjectKey(r=2) */ \ + X.v[0] += ks[2]; \ + X.v[1] += ks[3]; \ + X.v[2] += ks[4]; \ + X.v[3] += ks[0]; \ + X.v[4 - 1] += 2; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 8) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 9) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 10) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 11) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 11) { \ + /* InjectKey(r=3) */ \ + X.v[0] += ks[3]; \ + X.v[1] += ks[4]; \ + X.v[2] += ks[0]; \ + X.v[3] += ks[1]; \ + X.v[4 - 1] += 3; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 12) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 13) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 14) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 15) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 15) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; \ + X.v[1] += ks[0]; \ + X.v[2] += ks[1]; \ + X.v[3] += ks[2]; \ + X.v[4 - 1] += 4; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 16) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 17) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 18) { \ + X.v[0] += 
X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 19) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 19) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; \ + X.v[1] += ks[1]; \ + X.v[2] += ks[2]; \ + X.v[3] += ks[3]; \ + X.v[4 - 1] += 5; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 20) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 21) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 22) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 23) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 23) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; \ + X.v[1] += ks[2]; \ + X.v[2] += ks[3]; \ + X.v[3] += ks[4]; \ + X.v[4 - 1] += 6; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 24) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 25) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 26) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 27) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 27) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; \ + X.v[1] += ks[3]; \ + X.v[2] += ks[4]; \ + X.v[3] += ks[0]; \ + X.v[4 - 1] += 7; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 28) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 29) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 30) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 31) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 31) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; \ + X.v[1] += 
ks[4]; \ + X.v[2] += ks[0]; \ + X.v[3] += ks[1]; \ + X.v[4 - 1] += 8; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 32) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 33) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 34) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 35) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 35) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; \ + X.v[1] += ks[0]; \ + X.v[2] += ks[1]; \ + X.v[3] += ks[2]; \ + X.v[4 - 1] += 9; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 36) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 37) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 38) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 39) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 39) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; \ + X.v[1] += ks[1]; \ + X.v[2] += ks[2]; \ + X.v[3] += ks[3]; \ + X.v[4 - 1] += 10; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 40) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 41) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 42) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 43) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 43) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; \ + X.v[1] += ks[2]; \ + X.v[2] += ks[3]; \ + X.v[3] += ks[4]; \ + X.v[4 - 1] += 11; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 44) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 45) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = 
RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 46) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 47) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 47) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; \ + X.v[1] += ks[3]; \ + X.v[2] += ks[4]; \ + X.v[3] += ks[0]; \ + X.v[4 - 1] += 12; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 48) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 49) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 50) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 51) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 51) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; \ + X.v[1] += ks[4]; \ + X.v[2] += ks[0]; \ + X.v[3] += ks[1]; \ + X.v[4 - 1] += 13; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 52) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 53) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 54) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 55) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 55) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[4]; \ + X.v[1] += ks[0]; \ + X.v[2] += ks[1]; \ + X.v[3] += ks[2]; \ + X.v[4 - 1] += 14; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 56) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 57) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 58) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 59) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= 
X.v[2]; \ + } \ + if (Nrounds > 59) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[0]; \ + X.v[1] += ks[1]; \ + X.v[2] += ks[2]; \ + X.v[3] += ks[3]; \ + X.v[4 - 1] += 15; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 60) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 61) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 62) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 63) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 63) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[1]; \ + X.v[1] += ks[2]; \ + X.v[2] += ks[3]; \ + X.v[3] += ks[4]; \ + X.v[4 - 1] += 16; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 64) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_0_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_0_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 65) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_1_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_1_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 66) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_2_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_2_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 67) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_3_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_3_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 67) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[2]; \ + X.v[1] += ks[3]; \ + X.v[2] += ks[4]; \ + X.v[3] += ks[0]; \ + X.v[4 - 1] += 17; /* X.v[WCNT4-1] += r */ \ + } \ + \ + if (Nrounds > 68) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_4_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_4_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 69) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_5_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_5_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 70) { \ + X.v[0] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_6_0); \ + X.v[1] ^= X.v[0]; \ + X.v[2] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_6_1); \ + X.v[3] ^= X.v[2]; \ + } \ + if (Nrounds > 71) { \ + X.v[0] += X.v[3]; \ + X.v[3] = RotL_##W(X.v[3], R_##W##x4_7_0); \ + X.v[3] ^= X.v[0]; \ + X.v[2] += X.v[1]; \ + X.v[1] = RotL_##W(X.v[1], R_##W##x4_7_1); \ + X.v[1] ^= X.v[2]; \ + } \ + if (Nrounds > 71) { \ + /* InjectKey(r=1) */ \ + X.v[0] += ks[3]; \ + X.v[1] += ks[4]; \ + X.v[2] += ks[0]; \ + X.v[3] += ks[1]; \ + X.v[4 - 1] += 18; /* X.v[WCNT4-1] += r */ \ + } \ + \ + return X; \ + } \ + /** @ingroup ThreefryNxW */ \ + enum r123_enum_threefry4x##W{threefry4x##W##_rounds = THREEFRY4x##W##_DEFAULT_ROUNDS}; \ + R123_CUDA_DEVICE R123_STATIC_INLINE R123_FORCE_INLINE( \ + threefry4x##W##_ctr_t threefry4x##W(threefry4x##W##_ctr_t 
in, threefry4x##W##_key_t k)); \ + R123_CUDA_DEVICE R123_STATIC_INLINE threefry4x##W##_ctr_t threefry4x##W( \ + threefry4x##W##_ctr_t in, threefry4x##W##_key_t k) { \ + return threefry4x##W##_R(threefry4x##W##_rounds, in, k); \ + } /** \endcond */ -_threefry2x_tpl(64) -_threefry2x_tpl(32) -_threefry4x_tpl(64) -_threefry4x_tpl(32) +_threefry2x_tpl(64) _threefry2x_tpl(32) _threefry4x_tpl(64) _threefry4x_tpl(32) /* gcc4.5 and 4.6 seem to optimize a macro-ized threefryNxW better than a static inline function. Why? */ -#define threefry2x32(c,k) threefry2x32_R(threefry2x32_rounds, c, k) -#define threefry4x32(c,k) threefry4x32_R(threefry4x32_rounds, c, k) -#define threefry2x64(c,k) threefry2x64_R(threefry2x64_rounds, c, k) -#define threefry4x64(c,k) threefry4x64_R(threefry4x64_rounds, c, k) +#define threefry2x32(c, k) threefry2x32_R(threefry2x32_rounds, c, k) +#define threefry4x32(c, k) threefry4x32_R(threefry4x32_rounds, c, k) +#define threefry2x64(c, k) threefry2x64_R(threefry2x64_rounds, c, k) +#define threefry4x64(c, k) threefry4x64_R(threefry4x64_rounds, c, k) #ifdef __cplusplus /** \cond HIDDEN_FROM_DOXYGEN */ -#define _threefryNxWclass_tpl(NxW) \ -namespace r123{ \ -template \ - struct Threefry##NxW##_R{ \ - typedef threefry##NxW##_ctr_t ctr_type; \ - typedef threefry##NxW##_key_t key_type; \ - typedef threefry##NxW##_key_t ukey_type; \ - static const unsigned int rounds=R; \ - inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, key_type key)){ \ - R123_STATIC_ASSERT(R<=72, "threefry is only unrolled up to 72 rounds\n"); \ - return threefry##NxW##_R(R, ctr, key); \ - } \ -}; \ - typedef Threefry##NxW##_R Threefry##NxW; \ -} // namespace r123 +#define _threefryNxWclass_tpl(NxW) \ + namespace r123 { \ + template \ + struct Threefry##NxW##_R { \ + typedef threefry##NxW##_ctr_t ctr_type; \ + typedef threefry##NxW##_key_t key_type; \ + typedef threefry##NxW##_key_t ukey_type; \ + static const unsigned int rounds = R; \ + inline R123_CUDA_DEVICE R123_FORCE_INLINE(ctr_type operator()(ctr_type ctr, \ + key_type key)) { \ + R123_STATIC_ASSERT(R <= 72, "threefry is only unrolled up to 72 rounds\n"); \ + return threefry##NxW##_R(R, ctr, key); \ + } \ + }; \ + typedef Threefry##NxW##_R Threefry##NxW; \ + } // namespace r123 -/** \endcond */ + /** \endcond */ -_threefryNxWclass_tpl(2x32) -_threefryNxWclass_tpl(4x32) -_threefryNxWclass_tpl(2x64) -_threefryNxWclass_tpl(4x64) + _threefryNxWclass_tpl(2x32) _threefryNxWclass_tpl(4x32) _threefryNxWclass_tpl(2x64) + _threefryNxWclass_tpl(4x64) /* The _tpl macros don't quite work to do string-pasting inside comments. so we just write out the boilerplate documentation four times... */ -/** +/** @defgroup ThreefryNxW Threefry Classes and Typedefs The ThreefryNxW classes export the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. -As described in -Parallel Random Numbers: As Easy as 1, 2, 3 , +As described in +Parallel Random Numbers: As Easy as 1, 2, +3 , the Threefry family is closely related to the Threefish block cipher from - Skein Hash Function. + Skein Hash Function. Threefry is \b not suitable for cryptographic use. -Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its output. +Threefry uses integer addition, bitwise rotation, xor and permutation of words to randomize its +output. 
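For readers of the reformatted header above, a minimal usage sketch of the C-level Threefry API described in this documentation block: a counter and a key go in, four pseudo-random words come out. The include path assumes the repo-local copy of Random123 and the key value is an arbitrary example, not anything CoreNeuron itself uses.

/* Minimal sketch: three counter values of one stream, four 64-bit words each. */
#include <stdio.h>
#include <inttypes.h>
#include "coreneuron/utils/randoms/Random123/threefry.h"

int main(void) {
    threefry4x64_key_t key = {{12345u, 0u, 0u, 0u}}; /* stream identifier (example value) */
    threefry4x64_ctr_t ctr = {{0u, 0u, 0u, 0u}};     /* counter: advance it, never reuse it */
    uint64_t i;
    for (i = 0; i < 3; ++i) {
        ctr.v[0] = i;                                  /* i-th block of this stream */
        threefry4x64_ctr_t r = threefry4x64(ctr, key); /* default 20-round variant */
        printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
               r.v[0], r.v[1], r.v[2], r.v[3]);
    }
    return 0;
}

Because the counter is explicit, distinct counter values give statistically independent outputs with no shared state, which is what makes the generator trivially parallelizable.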
-@class r123::Threefry2x32_R +@class r123::Threefry2x32_R @ingroup ThreefryNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Threefry round function will be applied. @@ -793,12 +1292,13 @@ ROUNDS=13 or more for Threefry2x32. Threefry2x32 is equivalent to Threefry2x32_R<20>. With 20 rounds, Threefry2x32 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. -@class r123::Threefry2x64_R +@class r123::Threefry2x64_R @ingroup ThreefryNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Threefry round function will be applied. @@ -816,14 +1316,15 @@ ROUNDS=14 or more for Threefry2x64. Threefry2x64 is equivalent to Threefry2x64_R<20>. With 20 rounds, Threefry2x64 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. -@class r123::Threefry4x32_R +@class r123::Threefry4x32_R @ingroup ThreefryNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Threefry round function will be applied. @@ -836,14 +1337,15 @@ ROUNDS=12 or more for Threefry4x32. Threefry4x32 is equivalent to Threefry4x32_R<20>. With 20 rounds, Threefry4x32 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. -@class r123::Threefry4x64_R +@class r123::Threefry4x64_R @ingroup ThreefryNxW -exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" class. +exports the member functions, typedefs and operator overloads required by a @ref CBRNG "CBRNG" +class. The template argument, ROUNDS, is the number of times the Threefry round function will be applied. @@ -856,7 +1358,7 @@ ROUNDS=12 or more for Threefry4x64. Threefry4x64 is equivalent to Threefry4x64_R<20>. With 20 rounds, Threefry4x64 has a considerable safety margin over the minimum number of rounds with no known statistical flaws, but still has excellent - performance. + performance. */ #endif diff --git a/coreneuron/utils/randoms/nrnran123.c b/coreneuron/utils/randoms/nrnran123.c index 4e5f76dad..61d566c47 100644 --- a/coreneuron/utils/randoms/nrnran123.c +++ b/coreneuron/utils/randoms/nrnran123.c @@ -33,112 +33,128 @@ THE POSSIBILITY OF SUCH DAMAGE. 
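The nrnran123.c hunk below reformats the Philox-based stream wrapper and adds the three-id constructor nrnran123_newstream3. A hedged sketch of typical CPU-side use of this stream API; the id and global-index values are arbitrary examples:

#include <stdint.h>
#include <stdio.h>
#include "coreneuron/utils/randoms/nrnran123.h"

void stream_api_example(void) {
    nrnran123_mutconstruct();      /* once at launch, so nrnran123_newstream is threadsafe */
    nrnran123_set_globalindex(0);  /* e.g. run number; set before creating streams */

    nrnran123_State* s = nrnran123_newstream3(1u, 2u, 3u); /* three-id variant added in this patch */
    uint32_t u = nrnran123_ipick(s);   /* uniform integer, 0 to 2^32-1 */
    double x   = nrnran123_dblpick(s); /* uniform double in the open interval (0,1) */
    double g   = nrnran123_normal(s);  /* standard normal deviate */
    printf("%u %g %g\n", (unsigned)u, x, g);
    nrnran123_deletestream(s);
}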
#include "coreneuron/utils/randoms/Random123/philox.h" #include "coreneuron/nrniv/nrnmutdec.h" -static const double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ +static const double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ -static philox4x32_key_t k={{0}}; +static philox4x32_key_t k = {{0}}; static size_t instance_count_ = 0; -size_t nrnran123_instance_count() { return instance_count_; } +size_t nrnran123_instance_count() { + return instance_count_; +} + +/* now this is declated in nrnran123.h so that its available for prototype declaration struct nrnran123_State { - philox4x32_ctr_t c; - philox4x32_ctr_t r; - char which_; + philox4x32_ctr_t c; + philox4x32_ctr_t r; + unsigned char which_; }; -size_t nrnran123_state_size() { return sizeof(nrnran123_State); } +*/ + +size_t nrnran123_state_size() { + return sizeof(nrnran123_State); +} void nrnran123_set_globalindex(uint32_t gix) { - k.v[0] = gix; + k.v[0] = gix; } /* if one sets the global, one should reset all the stream sequences. */ uint32_t nrnran123_get_globalindex() { - return k.v[0]; + return k.v[0]; } -static MUTDEC -void nrnran123_mutconstruct() { - if (!mut_) { - MUTCONSTRUCT(1); - } +static MUTDEC void nrnran123_mutconstruct() { + if (!mut_) { + MUTCONSTRUCT(1); + } } nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { - nrnran123_State* s = (nrnran123_State*)ecalloc(sizeof(nrnran123_State), 1); - s->c.v[2] = id1; - s->c.v[3] = id2; - nrnran123_setseq(s, 0, 0); - MUTLOCK - ++instance_count_; - MUTUNLOCK - return s; + return nrnran123_newstream3(id1, id2, 0); +} + +nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { + nrnran123_State* s = (nrnran123_State*)ecalloc(sizeof(nrnran123_State), 1); + s->c.v[1] = id3; + s->c.v[2] = id1; + s->c.v[3] = id2; + nrnran123_setseq(s, 0, 0); + MUTLOCK + ++instance_count_; + MUTUNLOCK + return s; } void nrnran123_deletestream(nrnran123_State* s) { - MUTLOCK - --instance_count_; - MUTUNLOCK - free(s); + MUTLOCK + --instance_count_; + MUTUNLOCK + free(s); } -void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { - *seq = s->c.v[0]; - *which = s->which_; +void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, unsigned char* which) { + *seq = s->c.v[0]; + *which = s->which_; } -void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { - if (which > 3 || which < 0) { - s->which_ = 0; - }else{ - s->which_ = which; - } - s->c.v[0] = seq; - s->r = philox4x32(s->c, k); +void nrnran123_setseq(nrnran123_State* s, uint32_t seq, unsigned char which) { + if (which > 3) { + s->which_ = 0; + } else { + s->which_ = which; + } + s->c.v[0] = seq; + s->r = philox4x32(s->c, k); } void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { - *id1 = s->c.v[2]; - *id2 = s->c.v[3]; + *id1 = s->c.v[2]; + *id2 = s->c.v[3]; } uint32_t nrnran123_ipick(nrnran123_State* s) { - uint32_t rval; - unsigned char which = s->which_; - assert (which < 4); - rval = s->r.v[which++]; - if (which > 3) { - which = 0; - s->c.v[0]++; - s->r = philox4x32(s->c, k); - } - s->which_ = which; - return rval; + uint32_t rval; + unsigned char which = s->which_; + assert(which < 4); + rval = s->r.v[which++]; + if (which > 3) { + which = 0; + s->c.v[0]++; + s->r = philox4x32(s->c, k); + } + s->which_ = which; + return rval; } double nrnran123_dblpick(nrnran123_State* s) { - return nrnran123_uint2dbl(nrnran123_ipick(s)); + return nrnran123_uint2dbl(nrnran123_ipick(s)); } double nrnran123_negexp(nrnran123_State* s) { - /* min 2.3283064e-10 to max 
22.18071 */ - return -log(nrnran123_dblpick(s)); + /* min 2.3283064e-10 to max 22.18071 */ + return -log(nrnran123_dblpick(s)); } /* at cost of a cached value we could compute two at a time. */ double nrnran123_normal(nrnran123_State* s) { - double w, x, y; - double u1 = nrnran123_dblpick(s); - double u2 = nrnran123_dblpick(s); - u1 = 2.*u1 - 1.; - u2 = 2.*u2 - 1.; - w = (u1*u1) + (u2*u2); - y = sqrt( (-2.*log(w))/w); - x = u1*y; - return x; + double w, x, y; + double u1, u2; + do { + u1 = nrnran123_dblpick(s); + u2 = nrnran123_dblpick(s); + u1 = 2. * u1 - 1.; + u2 = 2. * u2 - 1.; + w = (u1 * u1) + (u2 * u2); + } while (w > 1); + + y = sqrt((-2. * log(w)) / w); + x = u1 * y; + return x; } double nrnran123_uint2dbl(uint32_t u) { - /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ - /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ - return ((double)u + 1.0) * SHIFT32; + /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ + /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ + return ((double)u + 1.0) * SHIFT32; } diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu new file mode 100644 index 000000000..379ab11cb --- /dev/null +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -0,0 +1,160 @@ +/* +Copyright (c) 2016, Blue Brain Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include "coreneuron/utils/randoms/nrnran123.h" + +/* global data structure per process */ +__device__ static const double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ +__device__ static philox4x32_key_t k = {{0}}; +__device__ static unsigned int instance_count_ = 0; +__device__ size_t nrnran123_instance_count() { + return instance_count_; +} + +__device__ size_t nrnran123_state_size() { + return sizeof(nrnran123_State); +} + +__device__ void nrnran123_set_globalindex(uint32_t gix) { + k.v[0] = gix; +} + +/* if one sets the global, one should reset all the stream sequences. 
*/ +__device__ uint32_t nrnran123_get_globalindex() { + return k.v[0]; +} + +__global__ void nrnran123_setup_cuda_newstream(nrnran123_State* s, + uint32_t id1, + uint32_t id2, + uint32_t id3) { + s->c.v[0] = 0; + s->c.v[1] = id3; + s->c.v[2] = id1; + s->c.v[3] = id2; + nrnran123_setseq(s, 0, 0); + atomicAdd(&instance_count_, 1); +} + +__global__ void nrnran123_cuda_deletestream(nrnran123_State* s) { + atomicSub(&instance_count_, 1); +} + +__device__ void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, unsigned char* which) { + *seq = s->c.v[0]; + *which = s->which_; +} + +__device__ void nrnran123_setseq(nrnran123_State* s, uint32_t seq, unsigned char which) { + if (which > 3) { + s->which_ = 0; + } else { + s->which_ = which; + } + s->c.v[0] = seq; + s->r = philox4x32(s->c, k); +} + +__device__ void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { + *id1 = s->c.v[2]; + *id2 = s->c.v[3]; +} + +__device__ uint32_t nrnran123_ipick(nrnran123_State* s) { + uint32_t rval; + unsigned char which = s->which_; + rval = s->r.v[which++]; + if (which > 3) { + which = 0; + s->c.v[0]++; + s->r = philox4x32(s->c, k); + } + s->which_ = which; + return rval; +} + +__device__ double nrnran123_dblpick(nrnran123_State* s) { + return nrnran123_uint2dbl(nrnran123_ipick(s)); +} + +__device__ double nrnran123_negexp(nrnran123_State* s) { + /* min 2.3283064e-10 to max 22.18071 */ + return -log(nrnran123_dblpick(s)); +} + +/* at cost of a cached value we could compute two at a time. */ +__device__ double nrnran123_normal(nrnran123_State* s) { + double w, x, y; + double u1, u2; + + do { + u1 = nrnran123_dblpick(s); + u2 = nrnran123_dblpick(s); + u1 = 2. * u1 - 1.; + u2 = 2. * u2 - 1.; + w = (u1 * u1) + (u2 * u2); + } while (w > 1); + + y = sqrt((-2. * log(w)) / w); + x = u1 * y; + return x; +} + +__device__ double nrnran123_uint2dbl(uint32_t u) { + /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ + /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ + return ((double)u + 1.0) * SHIFT32; +} + +/* nrn123 streams are created from cpu launcher routine */ +nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { + return nrnran123_newstream3(id1, id2, 0); +} + +nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { + nrnran123_State* s; + + cudaMalloc((void**)&s, sizeof(nrnran123_State)); + cudaMemset((void**)&s, 0, sizeof(nrnran123_State)); + + nrnran123_setup_cuda_newstream<<<1, 1>>>(s, id1, id2, id3); + cudaDeviceSynchronize(); + + return s; +} + +/* nrn123 streams are destroyed from cpu launcher routine */ +void nrnran123_deletestream(nrnran123_State* s) { + nrnran123_cuda_deletestream<<<1, 1>>>(s); + cudaDeviceSynchronize(); + + cudaFree(s); +} diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index af81de501..d662f18a4 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -51,45 +51,108 @@ of the full distribution available from http://www.deshawresearch.com/resources_random123.html */ +#include "Random123/philox.h" #include +#ifdef __CUDACC__ +#define DEVICE __device__ +#define GLOBAL __global__ +#else +#define DEVICE +#define GLOBAL +#endif + +#if (defined(__CUDACC__) || defined(_OPENACC)) && !defined(DISABLE_OPENACC) +#define nrnran123_newstream cu_nrnran123_newstream +#define nrnran123_newstream3 cu_nrnran123_newstream3 +#define nrnran123_deletestream cu_nrnran123_deletestream +#define nrnran123_uint2dbl cu_nrnran123_uint2dbl +#define nrnran123_negexp 
cu_nrnran123_negexp +#define nrnran123_dblpick cu_nrnran123_dblpick +#define nrnran123_ipick cu_nrnran123_ipick +#define nrnran123_getids cu_nrnran123_getids +#define nrnran123_setseq cu_nrnran123_setseq +#define nrnran123_getseq cu_nrnran123_getseq +#define nrnran123_get_globalindex cu_nrnran123_get_globalindex +#define nrnran123_set_globalindex cu_nrnran123_set_globalindex +#define nrnran123_state_size cu_nrnran123_state_size +#define nrnran123_instance_count cu_nrnran123_instance_count +#define nrnran123_normal cu_nrnran123_normal +#endif + #if defined(__cplusplus) extern "C" { #endif -/* do this on launch to make nrnran123_newstream threadsafe */ -extern void nrnran123_mutconstruct(void); +typedef struct nrnran123_State { + philox4x32_ctr_t c; + philox4x32_ctr_t r; + char which_; +} nrnran123_State; -typedef struct nrnran123_State nrnran123_State; +typedef struct nrnran123_array4x32 { uint32_t v[4]; } nrnran123_array4x32; -typedef struct nrnran123_array4x32 { - uint32_t v[4]; -} nrnran123_array4x32; +/* do this on launch to make nrnran123_newstream threadsafe */ +extern DEVICE void nrnran123_mutconstruct(void); /* global index. eg. run number */ /* all generator instances share this global index */ -extern void nrnran123_set_globalindex(uint32_t gix); -extern uint32_t nrnran123_get_globalindex(); +extern DEVICE void nrnran123_set_globalindex(uint32_t gix); +extern DEVICE uint32_t nrnran123_get_globalindex(); -extern size_t nrnran123_instance_count(void); -extern size_t nrnran123_state_size(void); +extern DEVICE size_t nrnran123_instance_count(void); +extern DEVICE size_t nrnran123_state_size(void); -/* minimal data stream */ +/* routines for creating and deleteing streams are called from cpu */ extern nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2); +extern nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3); extern void nrnran123_deletestream(nrnran123_State*); -extern void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); -extern void nrnran123_setseq(nrnran123_State*, uint32_t seq, char which); -extern void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); -extern uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */ -extern double nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/ - /* nrnran123_dblpick minimum value is 2.3283064e-10 and max value is 1-min */ -extern double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */ - /* nrnran123_negexp min value is 2.3283064e-10, max is 22.18071 */ -extern double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */ + +/* routines for creating and deleteing streams are called from cpu but initializing/deleting gpu + * context */ +extern nrnran123_State* cu_nrnran123_newstream(uint32_t id1, uint32_t id2); +extern nrnran123_State* cu_nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3); +extern void cu_nrnran123_deletestream(nrnran123_State*); + +extern GLOBAL void nrnran123_setup_deletestream(nrnran123_State* s); +extern GLOBAL void nrnran123_setup_newstream(nrnran123_State* s, uint32_t id1, uint32_t id2); +extern GLOBAL void nrnran123_setup_newstream3(nrnran123_State* s, + uint32_t id1, + uint32_t id2, + uint32_t id3); + +/* minimal data stream */ +extern DEVICE void nrnran123_getseq(nrnran123_State*, uint32_t* seq, unsigned char* which); +extern DEVICE void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); +extern DEVICE uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */ + +/* 
this could be called from openacc parallel construct */ +#if !defined(DISABLE_OPENACC) +#pragma acc routine seq +#endif +extern DEVICE double nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/ +/* nrnran123_dblpick minimum value is 2.3283064e-10 and max value is 1-min */ + +/* this could be called from openacc parallel construct (in INITIAL block) */ +#if !defined(DISABLE_OPENACC) +#pragma acc routine seq +#endif +extern DEVICE void nrnran123_setseq(nrnran123_State*, uint32_t seq, unsigned char which); + +#if !defined(DISABLE_OPENACC) +#pragma acc routine seq +#endif +extern DEVICE double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */ +/* nrnran123_negexp min value is 2.3283064e-10, max is 22.18071 */ + +/* missing declaration in coreneuron */ +extern DEVICE double nrnran123_normal(nrnran123_State*); + +extern DEVICE double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */ /* more fundamental (stateless) (though the global index is still used) */ -extern nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); -extern double nrnran123_uint2dbl(uint32_t); +extern DEVICE nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); +extern DEVICE double nrnran123_uint2dbl(uint32_t); #if defined(__cplusplus) } diff --git a/coreneuron/utils/reports/nrnreport.cpp b/coreneuron/utils/reports/nrnreport.cpp new file mode 100644 index 000000000..67c1f334c --- /dev/null +++ b/coreneuron/utils/reports/nrnreport.cpp @@ -0,0 +1,238 @@ +/* +Copyright (c) 2016, Blue Brain Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include "coreneuron/nrnoc/multicore.h" +#include "coreneuron/utils/reports/nrnreport.h" + +#ifdef ENABLE_REPORTING +#include "reportinglib/Records.h" +#endif + +int step = 0; + +/** constructor */ +ReportEvent::ReportEvent(double t) { + dt = t; + step = 0; +} + +/** on deliver, call ReportingLib and setup next event */ + +void ReportEvent::deliver(double t, NetCvode* nc, NrnThread* nt) { +// avoid pgc++-Fatal-/opt/pgi/linux86-64/16.5/bin/pggpp2ex TERMINATED by signal 11 +#ifdef ENABLE_REPORTING + + /** @todo: reportinglib is not thread safe, fix this */ + #pragma omp critical + { +// each thread needs to know its own step +#ifdef ENABLE_REPORTING + // TODO: do not rebuild this cellid vector each reporting dt! + std::vector temp; + for (int i = 0; i < nt->ncell; i++) { + PreSyn& presyn = nt->presyns[i]; + temp.push_back(presyn.gid_); + } + std::sort(temp.begin(), temp.end()); + records_nrec(step, nt->ncell, &temp[0]); +#endif + send(t + dt, nc, nt); + step++; + } +#else + (void)t; + (void)nc; + (void)nt; +#endif // ENABLE_REPORTING +} + +/** based on command line arguments, setup reportinglib interface */ +ReportGenerator::ReportGenerator(int rtype, + double start_, + double stop_, + double dt_, + double delay, + double dt_report_, + std::string path) { + start = start_; + stop = stop_; + dt = dt_; + dt_report = dt_report_; + mindelay = delay; + + switch (rtype) { + case 1: + type = SomaReport; + report_filepath = path + std::string("/soma"); + break; + + case 2: + type = CompartmentReport; + report_filepath = path + std::string("/voltage"); + break; + + default: + if (nrnmpi_myid == 0) { + std::cout << " WARNING: Invalid report type, enabling Soma reports!\n"; + } + type = SomaReport; + report_filepath = path + std::string("/soma"); + } +} + +#ifdef ENABLE_REPORTING + +void ReportGenerator::register_report() { + /* simulation dt */ + records_set_atomic_step(dt); + + for (int ith = 0; ith < nrn_nthread; ++ith) { + NrnThread& nt = nrn_threads[ith]; + NeuronGroupMappingInfo* mapinfo = (NeuronGroupMappingInfo*)nt.mapping; + + /** avoid empty NrnThread */ + if (nt.ncell) { + /** new event for every thread */ + events.push_back(new ReportEvent(dt)); + events[ith]->send(dt, net_cvode_instance, &nt); + + /** @todo: hard coded parameters for ReportingLib from Jim*/ + int sizemapping = 1; + int extramapping = 5; + int mapping[] = {0}; // first column i.e. section numbers + int extra[] = {1, 0, 0, 0, 1}; // first row, from 2nd value (skip gid) + const char* unit = "mV"; + const char* kind = "compartment"; + const char* reportname = report_filepath.c_str(); + + /** iterate over all neurons */ + for (int i = 0; i < nt.ncell; ++i) { + /** for this gid, get mapping information */ + int gid = nt.presyns[i].gid_; + NeuronMappingInfo* m = mapinfo->get_neuron_mapping(gid); + + /** for full compartment reports, set extra mapping */ + if (type == CompartmentReport) { + extra[0] = m->nsegment; + extra[1] = m->nsoma; + extra[2] = m->naxon; + extra[3] = m->ndendrite; + extra[4] = m->napical; + } + + /** add report variable : @todo api changes in reportinglib*/ + records_add_report((char*)reportname, gid, gid, gid, start, stop, dt_report, + sizemapping, (char*)kind, extramapping, (char*)unit); + + /** add extra mapping : @todo api changes in reportinglib*/ + records_extra_mapping((char*)reportname, gid, 5, extra); + + /** if there more segments that we need to register ? 
*/ + bool pending_segments = true; + + section_segment_map_type::iterator iter; + + /** iterate over all sections of a cell */ + for (iter = m->sec_seg_map.begin(); + iter != m->sec_seg_map.end() && pending_segments; iter++) { + /** set section id */ + mapping[0] = iter->first; + + /** these are all segments for a given section */ + segment_vector_type& segments = iter->second; + + /** iterate over all segments and register them */ + for (int j = 0; j < segments.size() && pending_segments; j++) { + /** segment id here is just offset into voltage array */ + int idx = segments[j]; + + /** corresponding voltage in coreneuron voltage array */ + double* v = nt._actual_v + idx; + + /** add segment for reporting */ + records_add_var_with_mapping((char*)reportname, gid, v, sizemapping, + mapping); + + /** for soma report, we have to break! only register first segment in + * section */ + if (type == SomaReport) { + /** soma must be always in 0th section */ + if (mapping[0] != 0) { + std::cout + << " WARNING: first section for soma report is non-zero ?\n"; + } + + /** done with current cell */ + pending_segments = 0; + } + } + } + } + } + } + + /** in the current implementation, we call flush during every spike exchange + * interval. Hence there should be sufficient buffer to hold all reports + * for the duration of mindelay interval. In the below call we specify the + * number of timesteps that we have to buffer. + * TODO: revisit this because spike exchnage can happen few steps before/after + * mindelay interval and hence adding two extra timesteps to buffer. + */ + int timesteps_to_buffer = mindelay / dt_report + 2; + records_set_steps_to_buffer(timesteps_to_buffer); + + /** reportinglib setup */ + records_setup_communicator(); + records_finish_and_share(); + + if (nrnmpi_myid == 0) { + if (type == SomaReport) + std::cout << " Soma report registration finished!\n"; + else + std::cout << " Full compartment report registration finished!\n"; + } +} + +/** returns mapping information for given gid */ +NeuronMappingInfo* NeuronGroupMappingInfo::get_neuron_mapping(int gid) { + for (int i = 0; i < neuronsmapinfo.size(); i++) { + if (neuronsmapinfo[i].gid == gid) + return &(neuronsmapinfo[i]); + } + + return NULL; +} + +extern "C" void nrn_flush_reports(double t) { + records_flush(t); +} + +#endif // ENABLE_REPORTING diff --git a/coreneuron/utils/reports/nrnreport.h b/coreneuron/utils/reports/nrnreport.h new file mode 100644 index 000000000..e2eb84446 --- /dev/null +++ b/coreneuron/utils/reports/nrnreport.h @@ -0,0 +1,149 @@ +/* +Copyright (c) 2016, Blue Brain Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** + * @file nrnreport.h + * @date 25th April 2015 + * + * @brief interface with reportinglib for soma reports + */ + +#ifndef _H_NRN_REPORT_ +#define _H_NRN_REPORT_ + +#include +#include +#include "coreneuron/nrniv/netcon.h" + +/** global instance */ +extern NetCvode* net_cvode_instance; + +/** To manage report events, subclass of DiscreteEvent */ +class ReportEvent : public DiscreteEvent { + private: + /** every thread or event can have different dt */ + double dt; + unsigned long step; + + public: + ReportEvent(double t); + + /** on deliver, call ReportingLib and setup next event */ + virtual void deliver(double t, NetCvode* nc, NrnThread* nt); +}; + +/** possible voltage report types */ +enum ReportType { SomaReport, CompartmentReport }; + +/** class for managing report generation with ReportingLib */ +class ReportGenerator { + private: + /** every thread should have an event */ + std::vector events; + double start; + double stop; + double dt; + double dt_report; + double mindelay; + ReportType type; + std::string report_filepath; + + public: + ReportGenerator(int type, + double start, + double stop, + double dt, + double delay, + double dt_report, + std::string path); + +#ifdef ENABLE_REPORTING + void register_report(); +#endif + + ~ReportGenerator() { + events.clear(); + } +}; + +/** type to store every section and associated segments */ +typedef std::vector segment_vector_type; +typedef std::map section_segment_map_type; + +/** Mapping information for single neuron */ +class NeuronMappingInfo { + public: + int gid; // gid of cellgroup + int nsegment; // no of segments + int nsoma; // no of somas + int naxon; // no of axons + int ndendrite; // no of dendrites + int napical; // no of apical + int ncompartment; // no of compartment + + section_segment_map_type sec_seg_map; // mapping of section to segments + + NeuronMappingInfo(int id, int seg, int soma, int axon, int dend, int apical, int compartment) + : gid(id), + nsegment(seg), + nsoma(soma), + naxon(axon), + ndendrite(dend), + napical(apical), + ncompartment(compartment) { + } + + void add_segment(int sec, int seg) { + sec_seg_map[sec].push_back(seg); + } + + /** section 0 is always soma. 
there could be multiple compartments + * in the soma and hence return the first compartment + */ + int get_soma_compartment_index() { + return (sec_seg_map.begin()->second)[0]; + } +}; + +/** Mapping information for all neurons in NrnThread */ +class NeuronGroupMappingInfo { + public: + std::vector neuronsmapinfo; // mapping info for each gid in NrnThread + + void add_neuron_mapping_info(NeuronMappingInfo& mapping) { + neuronsmapinfo.push_back(mapping); + } + + size_t count() const { + return neuronsmapinfo.size(); + } + + NeuronMappingInfo* get_neuron_mapping(int gid); +}; + +#endif //_H_NRN_REPORT_ diff --git a/coreneuron/utils/sdprintf.cpp b/coreneuron/utils/sdprintf.cpp index 26fe281af..8853bab73 100644 --- a/coreneuron/utils/sdprintf.cpp +++ b/coreneuron/utils/sdprintf.cpp @@ -34,11 +34,11 @@ THE POSSIBILITY OF SUCH DAMAGE. #include "sdprintf.h" -sd_ptr sdprintf(char *buf,size_t sz,const char *fmt,...) { +sd_ptr sdprintf(char* buf, size_t sz, const char* fmt, ...) { va_list ap; - va_start(ap,fmt); + va_start(ap, fmt); - sd_ptr s=vsdprintf(buf,sz,fmt,ap); + sd_ptr s = vsdprintf(buf, sz, fmt, ap); va_end(ap); return s; @@ -47,58 +47,57 @@ sd_ptr sdprintf(char *buf,size_t sz,const char *fmt,...) { #if !defined(va_copy) /* check for __va_copy: work around for icpc 2015 */ #if defined(__va_copy) -#define va_copy(dest,src) __va_copy(dest,src) -#else +#define va_copy(dest, src) __va_copy(dest, src) +#else /* non-portable, so specialise for those cases where * * value assignment does not work and va_copy is nonetheless * * not defined */ #warning "no va_copy() or __va_copy defined, using value assignment" -#define va_copy(dest,src) ((dest)=(src)) +#define va_copy(dest, src) ((dest) = (src)) #endif #endif -sd_ptr vsdprintf(char *buf,size_t sz,const char *fmt,va_list ap) { +sd_ptr vsdprintf(char* buf, size_t sz, const char* fmt, va_list ap) { using namespace std; sd_ptr s; va_list ap2; - va_copy(ap2,ap); + va_copy(ap2, ap); - int rv=0; - if (buf!=0 && sz>0) - rv=vsnprintf(buf,sz,fmt,ap); + int rv = 0; + if (buf != 0 && sz > 0) + rv = vsnprintf(buf, sz, fmt, ap); else { char p[1]; - sz=0; - rv=vsnprintf(p,sz,fmt,ap); + sz = 0; + rv = vsnprintf(p, sz, fmt, ap); } - if (rv<0) { - s=0; + if (rv < 0) { + s = 0; goto exit; } - if ((size_t)rv=sz) { + rv = vsnprintf(p, sz, fmt, ap2); + if (rv < 0 || (size_t)rv >= sz) { free(p); - s=0; + s = 0; goto exit; } - s=sd_ptr(p,true); + s = sd_ptr(p, true); goto exit; } diff --git a/coreneuron/utils/sdprintf.h b/coreneuron/utils/sdprintf.h index e7b7c5c6b..0039e0b1b 100644 --- a/coreneuron/utils/sdprintf.h +++ b/coreneuron/utils/sdprintf.h @@ -34,17 +34,16 @@ THE POSSIBILITY OF SUCH DAMAGE. * * Provides sprintf()-like function that uses the offered character * array if it is sufficiently large, else allocates space on the heap. - * + * * The return object is a 'smart' pointer wrapper, offering a subset of * the C++11 std::unique_ptr functionality. (We need C++03 * compatibility, and do not want to introduce a dependency on the * Boost library.) - * + * * Code assumes C99-compatible behaviour of (non-standard before C++11) * vsnprintf(). Microsoft Visual C++ does not conform, for example. */ - #ifndef SDPRINTF_H_ #define SDPRINTF_H_ @@ -60,46 +59,57 @@ THE POSSIBILITY OF SUCH DAMAGE. * buffer. 
*/ -template +template struct sd_ptr_generic { - sd_ptr_generic(): ptr(0),dflag(false) {} + sd_ptr_generic() : ptr(0), dflag(false) { + } - sd_ptr_generic(const char *p,bool dflag_=false): ptr(p),dflag(dflag_) {} + sd_ptr_generic(const char* p, bool dflag_ = false) : ptr(p), dflag(dflag_) { + } - sd_ptr_generic(const sd_ptr_generic &them): ptr(them.ptr),dflag(them.dflag) { - them.dflag=false; + sd_ptr_generic(const sd_ptr_generic& them) : ptr(them.ptr), dflag(them.dflag) { + them.dflag = false; } - sd_ptr_generic &operator=(const char *p) { + sd_ptr_generic& operator=(const char* p) { release(); - ptr=p; + ptr = p; return *this; } - sd_ptr_generic &operator=(const sd_ptr_generic &them) { - if (&them!=this) { + sd_ptr_generic& operator=(const sd_ptr_generic& them) { + if (&them != this) { release(); - ptr=them.ptr; - dflag=them.dflag; - them.dflag=false; + ptr = them.ptr; + dflag = them.dflag; + them.dflag = false; } return *this; } void release() { - if (dflag) dealloc((void *)ptr); - dflag=false; + if (dflag) + dealloc((void*)ptr); + dflag = false; } - const char *get() const { return ptr; } - operator const char *() const { return get(); } + const char* get() const { + return ptr; + } + operator const char*() const { + return get(); + } - operator bool() const { return (bool)ptr; } + operator bool() const { + return (bool)ptr; + } - ~sd_ptr_generic() { release(); } + ~sd_ptr_generic() { + release(); + } -private: - const char *ptr; + private: + const char* ptr; mutable bool dflag; }; @@ -120,10 +130,10 @@ typedef sd_ptr_generic sd_ptr; * On error, returns a null sd_ptr. */ -sd_ptr sdprintf(char *buf,size_t sz,const char *fmt,...); +sd_ptr sdprintf(char* buf, size_t sz, const char* fmt, ...); /** @brief Varargs version of sdprintf() (q.v.) - * + * * @param buf Pointer to buffer in which to write string. * @param sz Size in bytes of buffer. * @param fmt A printf format string. @@ -132,6 +142,6 @@ sd_ptr sdprintf(char *buf,size_t sz,const char *fmt,...); * @return An sd_ptr encapsulating the provided or allocated buffer. */ -sd_ptr vsdprintf(char *buf,size_t sz,const char *fmt,va_list ap); +sd_ptr vsdprintf(char* buf, size_t sz, const char* fmt, va_list ap); -#endif // ndef SDPRINTF_H_ +#endif // ndef SDPRINTF_H_ diff --git a/coreneuron/utils/swap_endian.h b/coreneuron/utils/swap_endian.h index 1a13a1b71..4c5ca9d04 100644 --- a/coreneuron/utils/swap_endian.h +++ b/coreneuron/utils/swap_endian.h @@ -44,22 +44,26 @@ THE POSSIBILITY OF SUCH DAMAGE. 
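The sdprintf()/vsdprintf() interface reformatted just above is easiest to read with a short usage sketch. The fragment below is illustrative only and not part of the patch (the caller, buffer size and format string are made up): it shows the documented behaviour that the supplied buffer is used when the formatted text fits, that a heap allocation is returned otherwise, and that the sd_ptr wrapper frees any such allocation when it goes out of scope.

    #include <cstdio>
    #include "coreneuron/utils/sdprintf.h"

    void log_spike(int gid, double t) {        /* hypothetical caller */
        char buf[64];
        /* uses buf if the result fits, otherwise allocates on the heap */
        sd_ptr s = sdprintf(buf, sizeof(buf), "gid %d spiked at t=%g ms", gid, t);
        if (s) {
            std::printf("%s\n", s.get());      /* implicit const char* conversion also works */
        }
    }   /* any heap allocation owned by s is released here */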
#include #endif +/* required for correct choice of PPC assembly */ +#include "coreneuron/utils/endianness.h" + #if !defined(SWAP_ENDIAN_MAX_UNROLL) #define SWAP_ENDIAN_MAX_UNROLL 8 #endif -#if defined(__GNUC__) && __GNUC__==4 && __GNUC_MINOR__<6 +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 6 #define SWAP_ENDIAN_BROKEN_MEMCPY #endif #ifdef SWAP_ENDIAN_BROKEN_MEMCPY -#define memcpy(d,s,n) ::endian::impl::safe_memcpy((d),(s),(n)) +#define memcpy(d, s, n) ::endian::impl::safe_memcpy((d), (s), (n)) namespace endian { namespace impl { - static inline void *safe_memcpy(void *d,void *s,size_t n) { - char *d_=(char *)d; - char *s_=(char *)s; - while (n-->0) *d_++=*s_++; + static inline void* safe_memcpy(void* d, void* s, size_t n) { + char* d_ = (char*)d; + char* s_ = (char*)s; + while (n-- > 0) + *d_++ = *s_++; return d; } } @@ -70,36 +74,43 @@ namespace endian { namespace impl { template - struct is_pointer { enum { value=false }; }; + struct is_pointer { + enum { value = false }; + }; template - struct is_pointer { enum { value=true }; }; + struct is_pointer { + enum { value = true }; + }; - struct not_implemented { static void eval(...) { abort(); } }; + struct not_implemented { + static void eval(...) { + abort(); + } + }; /** Check to ensure a class is not derived from not_implemented */ template struct is_implemented { typedef char no; typedef double yes; - static no check(not_implemented *); + static no check(not_implemented*); static yes check(...); - enum { value=sizeof(check((C *)0))==sizeof(yes) }; + enum { value = sizeof(check((C*)0)) == sizeof(yes) }; }; + template + struct swap_endian_basic : not_implemented {}; - template - struct swap_endian_basic: not_implemented {}; - - template - struct swap_endian_basic { - static void eval(unsigned char *d) { - std::reverse(d,d+K); + template + struct swap_endian_basic { + static void eval(unsigned char* d) { + std::reverse(d, d + K); } }; - template - struct swap_endian_fast: not_implemented {}; + template + struct swap_endian_fast : not_implemented {}; /** Reverse bytes within values of fixed size in buffer * @@ -113,160 +124,206 @@ namespace endian { * If Align is true, we can assume that d is aligned to a multiple * of K*Unroll bytes. */ - template + template struct swap_endian { - static void eval(unsigned char *d) { + static void eval(unsigned char* d) { #ifdef SWAP_ENDIAN_ASSERT - assert(!Aligned || (uintptr_t)d%(K*Unroll)==0); + assert(!Aligned || (uintptr_t)d % (K * Unroll) == 0); #endif - if (is_implemented >::value) - swap_endian_fast::eval(d); - else if (is_implemented >::value) - swap_endian_basic::eval(d); - else if (Unroll%2==0 || !Aligned) { - swap_endian::eval(d); - swap_endian::eval(d+K*(Unroll/2)); - if (Unroll%2) - swap_endian::eval(d+K*(Unroll-1)); - } - else { + if (is_implemented >::value) + swap_endian_fast::eval(d); + else if (is_implemented >::value) + swap_endian_basic::eval(d); + else if (Unroll % 2 == 0 || !Aligned) { + swap_endian::eval(d); + swap_endian::eval(d + K * (Unroll / 2)); + if (Unroll % 2) + swap_endian::eval(d + K * (Unroll - 1)); + } else { // Unroll is odd: can't make guarantees that we're aligned wrt // (Unroll/2). - swap_endian::eval(d); - swap_endian::eval(d+K*(Unroll/2)); - swap_endian::eval(d+K*(Unroll-1)); + swap_endian::eval(d); + swap_endian::eval(d + K * (Unroll / 2)); + swap_endian::eval(d + K * (Unroll - 1)); } } }; // This specialization is required ONLY to convince gcc 4.4.7 // that it will never divide by zero statically. 
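A note on the dispatch used by swap_endian::eval() above: a "fast" or "basic" implementation is considered available exactly when the corresponding struct has been specialized so that it no longer derives from not_implemented, which is what is_implemented detects through overload resolution on a pointer argument. The following is a minimal, self-contained sketch of that idiom under illustrative type names (they are not the names in swap_endian.h):

    #include <cstdio>

    struct not_implemented {};

    template <int K>
    struct fast_op : not_implemented {};   /* primary template: no fast path provided */

    template <>
    struct fast_op<4> {};                  /* specialization: a fast path exists for K == 4 */

    template <class C>
    struct is_implemented {
        typedef char no;
        typedef double yes;
        static no check(not_implemented*); /* chosen when C derives from not_implemented */
        static yes check(...);
        enum { value = sizeof(check((C*)0)) == sizeof(yes) };
    };

    int main() {
        /* prints "0 1": K == 2 falls back, K == 4 would take the fast path */
        std::printf("%d %d\n", (int)is_implemented<fast_op<2> >::value,
                               (int)is_implemented<fast_op<4> >::value);
        return 0;
    }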
- template - struct swap_endian { - static void eval(unsigned char *) {} + template + struct swap_endian { + static void eval(unsigned char*) { + } }; - template - struct swap_endian<1,Unroll,Aligned> { - static void eval(unsigned char *) {} + template + struct swap_endian<1, Unroll, Aligned> { + static void eval(unsigned char*) { + } }; // specialise swap_endian_basic for integer data sizes template - struct swap_endian_basic<2,1,Aligned> { - static void eval(unsigned char *d) { + struct swap_endian_basic<2, 1, Aligned> { + static void eval(unsigned char* d) { uint16_t v; - memcpy(&v,d,2); - v=(uint16_t)((v>>8u)|(v<<8u)); - memcpy(d,&v,2); + memcpy(&v, d, 2); + v = (uint16_t)((v >> 8u) | (v << 8u)); + memcpy(d, &v, 2); } }; template - struct swap_endian_basic<4,1,Aligned> { - static void eval(unsigned char *d) { + struct swap_endian_basic<4, 1, Aligned> { + static void eval(unsigned char* d) { uint32_t v; - memcpy(&v,d,4); - v=(v>>24) | ((v>>8) & 0x0000ff00ul) | ((v<<8) & 0x00ff0000ul) | (v<<24); - memcpy(d,&v,4); + memcpy(&v, d, 4); + v = (v >> 24) | ((v >> 8) & 0x0000ff00ul) | ((v << 8) & 0x00ff0000ul) | (v << 24); + memcpy(d, &v, 4); } }; template - struct swap_endian_basic<8,1,Aligned> { - static void eval(unsigned char *d) { + struct swap_endian_basic<8, 1, Aligned> { + static void eval(unsigned char* d) { uint64_t v; - memcpy(&v,d,8); - v=(v>>56) | - ((v<<40) & 0x00FF000000000000ull) | - ((v<<24) & 0x0000FF0000000000ull) | - ((v<<8) & 0x000000FF00000000ull) | - ((v>>8) & 0x00000000FF000000ull) | - ((v>>24) & 0x0000000000FF0000ull) | - ((v>>40) & 0x000000000000FF00ull) | - (v<<56); - memcpy(d,&v,8); + memcpy(&v, d, 8); + v = (v >> 56) | ((v << 40) & 0x00FF000000000000ull) | + ((v << 24) & 0x0000FF0000000000ull) | ((v << 8) & 0x000000FF00000000ull) | + ((v >> 8) & 0x00000000FF000000ull) | ((v >> 24) & 0x0000000000FF0000ull) | + ((v >> 40) & 0x000000000000FF00ull) | (v << 56); + memcpy(d, &v, 8); } }; -#if !defined(SWAP_ENDIAN_DISABLE_ASM) && defined(__PPC64__) +#if !defined(SWAP_ENDIAN_DISABLE_ASM) && defined(__PPC64__) /* generic methods very slow on bgq */ template - struct swap_endian_fast<4,1,Aligned> { - static void eval(unsigned char *d) { - struct chunk_t { unsigned char x[4]; } &u=*(chunk_t *)(void *)d; + struct swap_endian_fast<4, 1, Aligned> { + static void eval(unsigned char* d) { + struct chunk_t { + unsigned char x[4]; + }& u = *(chunk_t*)(void*)d; uint32_t v; asm("lwz %[v],%[ldata] \n\t" "stwbrx %[v],0,%[sdata] \n\t" - : [v]"+&r"(v), "+o"(u) - : [ldata]"o"(u), [sdata]"r"(d) - : ); + : [v] "=&r"(v), "+o"(u) + : [ldata] "o"(u), [sdata] "r"(d) + :); } }; template <> - struct swap_endian_fast<4,2,true> { - static void eval(unsigned char *d) { - struct chunk_t { unsigned char x[8]; } &u=*(chunk_t *)(void *)d; + struct swap_endian_fast<4, 2, true> { + static void eval(unsigned char* d) { + struct chunk_t { + unsigned char x[8]; + }& u = *(chunk_t*)(void*)d; uint64_t v; - asm("ld %[v],%[ldata] \n\t" - "stwbrx %[v],%[word],%[sdata] \n\t" - "srd %[v],%[v],%[shift] \n\t" - "stwbrx %[v],0,%[sdata] \n\t" - : [v]"+&r"(v), "+o"(u) - : [ldata]"o"(u), [sdata]"r"(d), [word]"b"(4), [shift]"b"(32) - : ); + if (endian::is_big_endian()) { + asm("ld %[v],%[ldata] \n\t" + "stwbrx %[v],%[word],%[sdata] \n\t" + "srd %[v],%[v],%[shift] \n\t" + "stwbrx %[v],0,%[sdata] \n\t" + : [v] "=&r"(v), "+o"(u) + : [ldata] "o"(u), [sdata] "r"(d), [word] "b"(4), [shift] "b"(32) + :); + } else { + asm("ld %[v],%[ldata] \n\t" + "stwbrx %[v],0,%[sdata] \n\t" + "srd %[v],%[v],%[shift] \n\t" + "stwbrx 
%[v],%[word],%[sdata] \n\t" + : [v] "=&r"(v), "+o"(u) + : [ldata] "o"(u), [sdata] "r"(d), [word] "b"(4), [shift] "b"(32) + :); + } } }; +#if !defined(__POWER8_VECTOR__) + // 2.06 ISA does not have stdbrx template <> - struct swap_endian_fast<8,1,true> { - static void eval(unsigned char *d) { - struct chunk_t { unsigned char x[8]; } &u=*(chunk_t *)(void *)d; + struct swap_endian_fast<8, 1, true> { + static void eval(unsigned char* d) { + struct chunk_t { + unsigned char x[8]; + }& u = *(chunk_t*)(void*)d; + + uint64_t v; + if (endian::is_big_endian()) { + asm("ld %[v],%[ldata] \n\t" + "stwbrx %[v],0,%[sdata] \n\t" + "srd %[v],%[v],%[shift] \n\t" + "stwbrx %[v],%[word],%[sdata] \n\t" + : [v] "=&r"(v), "+o"(u) + : [ldata] "o"(u), [sdata] "r"(d), [word] "b"(4), [shift] "b"(32) + :); + } else { + asm("ld %[v],%[ldata] \n\t" + "stwbrx %[v],%[word],%[sdata] \n\t" + "srd %[v],%[v],%[shift] \n\t" + "stwbrx %[v],0,%[sdata] \n\t" + : [v] "=&r"(v), "+o"(u) + : [ldata] "o"(u), [sdata] "r"(d), [word] "b"(4), [shift] "b"(32) + :); + } + } + }; +#else + template <> + struct swap_endian_fast<8, 1, true> { + static void eval(unsigned char* d) { + struct chunk_t { + unsigned char x[8]; + }& u = *(chunk_t*)(void*)d; uint64_t v; asm("ld %[v],%[ldata] \n\t" - "stwbrx %[v],0,%[sdata] \n\t" - "srd %[v],%[v],%[shift] \n\t" - "stwbrx %[v],%[word],%[sdata] \n\t" - : [v]"+&r"(v), "+o"(u) - : [ldata]"o"(u), [sdata]"r"(d), [word]"b"(4), [shift]"b"(32) - : ); + "stdbrx %[v],0,%[sdata] \n\t" + : [v] "=&r"(v), "+o"(u) + : [ldata] "o"(u), [sdata] "r"(d) + :); } }; + +#endif #endif #if !defined(SWAP_ENDIAN_DISABLE_ASM) && defined(__SSSE3__) template <> - struct swap_endian_fast<2,8,true> { - static void eval(unsigned char *d) { - __m128i permute=_mm_setr_epi8(1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14); - __m128i x=_mm_load_si128((__m128i *)d); - x=_mm_shuffle_epi8(x,permute); - _mm_store_si128((__m128i *)d,x); + struct swap_endian_fast<2, 8, true> { + static void eval(unsigned char* d) { + __m128i permute = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + __m128i x = _mm_load_si128((__m128i*)d); + x = _mm_shuffle_epi8(x, permute); + _mm_store_si128((__m128i*)d, x); } }; template <> - struct swap_endian_fast<4,4,true> { - static void eval(unsigned char *d) { - __m128i permute=_mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12); - __m128i x=_mm_load_si128((__m128i *)d); - x=_mm_shuffle_epi8(x,permute); - _mm_store_si128((__m128i *)d,x); + struct swap_endian_fast<4, 4, true> { + static void eval(unsigned char* d) { + __m128i permute = + _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + __m128i x = _mm_load_si128((__m128i*)d); + x = _mm_shuffle_epi8(x, permute); + _mm_store_si128((__m128i*)d, x); } }; template <> - struct swap_endian_fast<8,2,true> { - static void eval(unsigned char *d) { - __m128i permute=_mm_setr_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8); - __m128i x=_mm_load_si128((__m128i *)d); - x=_mm_shuffle_epi8(x,permute); - _mm_store_si128((__m128i *)d,x); + struct swap_endian_fast<8, 2, true> { + static void eval(unsigned char* d) { + __m128i permute = + _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + __m128i x = _mm_load_si128((__m128i*)d); + x = _mm_shuffle_epi8(x, permute); + _mm_store_si128((__m128i*)d, x); } }; #endif @@ -274,81 +331,84 @@ namespace endian { #if !defined(SWAP_ENDIAN_DISABLE_ASM) && defined(__AVX2__) // Modern implementations suffer little or no penalty from unaligned load. 
template - struct swap_endian_fast<4,8,Aligned> { - static void eval(unsigned char *d) { - __m256i permute=_mm256_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12, - 19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28); - __m256i x=_mm256_loadu_si256((__m256i *)d); - x=_mm256_shuffle_epi8(x,permute); - _mm256_storeu_si256((__m256i *)d,x); + struct swap_endian_fast<4, 8, Aligned> { + static void eval(unsigned char* d) { + __m256i permute = + _mm256_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, + 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28); + __m256i x = _mm256_loadu_si256((__m256i*)d); + x = _mm256_shuffle_epi8(x, permute); + _mm256_storeu_si256((__m256i*)d, x); } }; template - struct swap_endian_fast<8,4,Aligned> { - static void eval(unsigned char *d) { - __m256i permute=_mm256_setr_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8, - 23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24); - __m256i x=_mm256_loadu_si256((__m256i *)d); - x=_mm256_shuffle_epi8(x,permute); - _mm256_storeu_si256((__m256i *)d,x); + struct swap_endian_fast<8, 4, Aligned> { + static void eval(unsigned char* d) { + __m256i permute = + _mm256_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, + 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24); + __m256i x = _mm256_loadu_si256((__m256i*)d); + x = _mm256_shuffle_epi8(x, permute); + _mm256_storeu_si256((__m256i*)d, x); } }; #endif template - void swap_endian_unroll(V *b,V *e) { - static const size_t n_unroll=SWAP_ENDIAN_MAX_UNROLL; - if (e<=b) return; + void swap_endian_unroll(V* b, V* e) { + static const size_t n_unroll = SWAP_ENDIAN_MAX_UNROLL; + if (e <= b) + return; - size_t n=e-b; - bool aligned_vsize=((uintptr_t)b%sizeof(V)==0); + size_t n = e - b; + bool aligned_vsize = ((uintptr_t)b % sizeof(V) == 0); - if (n>=n_unroll) { + if (n >= n_unroll) { if (!aligned_vsize) { /* No guarantees on alignment for swap_endian: elements are not aligned to multiple of the element size. This can happen with doubles on some 32-bit ABIs. */ - while (n>=n_unroll) { - swap_endian::eval((unsigned char *)b); - b+=n_unroll; - n-=n_unroll; + while (n >= n_unroll) { + swap_endian::eval((unsigned char*)b); + b += n_unroll; + n -= n_unroll; } - } - else { + } else { /* process elements singly until we get to a n_unroll*sizeof(V) boundary, and then do an alignment-guaranteed swap_endian. */ - size_t off_align_count=(size_t)((uintptr_t)b%(sizeof(V)*n_unroll))/sizeof(V); - if (off_align_count>0) { - while (off_align_count++::eval((unsigned char *)b++); + size_t off_align_count = + (size_t)((uintptr_t)b % (sizeof(V) * n_unroll)) / sizeof(V); + if (off_align_count > 0) { + while (off_align_count++ < n_unroll) { + swap_endian::eval((unsigned char*)b++); --n; } } - - /* b should now be n_unroll*sizeof(V) aligned. */ + +/* b should now be n_unroll*sizeof(V) aligned. */ #ifdef SWAP_ENDIAN_ASSERT - assert((uintptr_t)b%(sizeof(V)*n_unroll)==0); + assert((uintptr_t)b % (sizeof(V) * n_unroll) == 0); #endif - while (n>=n_unroll) { - swap_endian::eval((unsigned char *)b); - b+=n_unroll; - n-=n_unroll; + while (n >= n_unroll) { + swap_endian::eval((unsigned char*)b); + b += n_unroll; + n -= n_unroll; } } } if (aligned_vsize) { - while (n-->0) swap_endian::eval((unsigned char *)b++); - } - else { - while (n-->0) swap_endian::eval((unsigned char *)b++); + while (n-- > 0) + swap_endian::eval((unsigned char*)b++); + } else { + while (n-- > 0) + swap_endian::eval((unsigned char*)b++); } } - } /** Reverse the endianness of a value in-place. 
@@ -357,20 +417,25 @@ namespace endian { * /param v value to byte-reorder */ template - T &swap_endian(T &v) { - impl::swap_endian::eval((unsigned char *)&v); + T& swap_endian(T& v) { + impl::swap_endian::eval((unsigned char*)&v); return v; } namespace impl { - template + template struct swap_endian_range_dispatch { - static void eval(I b,I e) { while (b!=e) ::endian::swap_endian(*b++); } + static void eval(I b, I e) { + while (b != e) + ::endian::swap_endian(*b++); + } }; template - struct swap_endian_range_dispatch { - static void eval(I b,I e) { swap_endian_unroll(b,e); } + struct swap_endian_range_dispatch { + static void eval(I b, I e) { + swap_endian_unroll(b, e); + } }; } @@ -383,14 +448,14 @@ namespace endian { * All values in the iterator range [b,e) are byte-reversed. */ template - void swap_endian_range(I b,I e) { - impl::swap_endian_range_dispatch::value>::eval(b,e); + void swap_endian_range(I b, I e) { + impl::swap_endian_range_dispatch::value>::eval(b, e); } -} // namespace endian +} // namespace endian #ifdef SWAP_ENDIAN_BROKEN_MEMCPY #undef memcpy #endif -#endif // ifndef swap_endian_h +#endif // ifndef swap_endian_h diff --git a/default.nix b/default.nix new file mode 100644 index 000000000..f07082ebf --- /dev/null +++ b/default.nix @@ -0,0 +1,27 @@ +# Nix development environment +# +# build: +# nix-build -I "BBPpkgs=https://github.com/BlueBrain/bbp-nixpkgs/archive/master.tar.gz" default.nix +# +# build and test: +# nix-build -I "BBPpkgs=https://github.com/BlueBrain/bbp-nixpkgs/archive/master.tar.gz" --arg testExec true default.nix -j 4 +# +# dev shell: +# nix-shell -I "BBPpkgs=https://github.com/BlueBrain/bbp-nixpkgs/archive/master.tar.gz" default.nix +# +with import { }; + +{ testExec ? false} : { + + + func = stdenv.lib.overrideDerivation coreneuron (oldAttrs: { + name = "coreneuron-DEVLOCAL"; + src = ./.; + + buildInputs = oldAttrs.buildInputs ++ [ bbptestdata ]; + + }); + +} + + diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e7774e36b..6dc1f8928 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + include(FindSLURM) include(TestHelpers) @@ -42,12 +43,13 @@ add_subdirectory(unit/mechbuild) add_subdirectory(unit/omp) add_subdirectory(unit/cmdline_interface) add_subdirectory(unit/alignment) +add_subdirectory(unit/queueing) endif() if(FUNCTIONAL_TESTS) add_subdirectory(integration) +add_subdirectory(regression) endif() - diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 2b6426b9e..e3fe76851 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -25,15 +25,21 @@ # THE POSSIBILITY OF SUCH DAMAGE. + + # List of tests with arguments set(TEST_CASE_ARGS "ring!--datpath=${CMAKE_CURRENT_SOURCE_DIR}/ring --outpath=${CMAKE_CURRENT_BINARY_DIR}/ring --celsius=6.3 --tstop=100. -mpi" "ring_IClamp!--datpath=${CMAKE_CURRENT_SOURCE_DIR}/ring_IClamp --outpath=${CMAKE_CURRENT_BINARY_DIR}/ring_IClamp --celsius=6.3 --tstop=100. -mpi") + + #Configure test scripts foreach (args_line ${TEST_CASE_ARGS}) string(REPLACE "!" 
";" string_line ${args_line}) +if(MPI_FOUND) string(REPLACE ";" " " SRUN_PREFIX "${TEST_MPI_EXEC_BIN};-n;1") +endif() list(GET string_line 0 TEST_NAME) list(GET string_line 1 TEST_ARGS) set(SIM_NAME ${TEST_NAME}) @@ -42,3 +48,31 @@ foreach (args_line ${TEST_CASE_ARGS}) COMMAND "/bin/sh" ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}/integration_test.sh WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}") endforeach() + +# This test is for coreneuron having all the mechanisms integrated into the +# library. Runs when the corresponding condition is utilized plus the +# executable is built. +# Set of 10Cells tests +if(ADDITIONAL_MECHS AND CORENEURON_MAIN) + + function(exec_test TEST_NAME SUBTEST_NAME) +if(MPI_FOUND) + string(REPLACE ";" " " SRUN_PREFIX "${TEST_MPI_EXEC_BIN};-n;2") +endif() + set(SIM_NAME "${TEST_NAME}/${SUBTEST_NAME}") + if(${SUBTEST_NAME} MATCHES "generic") + set(TEST_ARGS "--datpath=${CMAKE_CURRENT_SOURCE_DIR}/${TEST_NAME} --outpath=${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME} --tstop=100. -mpi") + elseif(${SUBTEST_NAME} MATCHES "forwardskip") + set(TEST_ARGS "--datpath=${CMAKE_CURRENT_SOURCE_DIR}/${TEST_NAME} --outpath=${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME} --forwardskip=5000. --tstop=100. -mpi") + elseif(${SUBTEST_NAME} MATCHES "spikereplay") + set(TEST_ARGS "--datpath=${CMAKE_CURRENT_SOURCE_DIR}/${TEST_NAME} --outpath=${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME} --pattern=${CMAKE_CURRENT_SOURCE_DIR}/${SIM_NAME}/out.std --tstop=100. -mpi") + endif() + + configure_file(integration_test.sh.in ${SIM_NAME}/integration_test.sh @ONLY) + + add_test(NAME ${TEST_NAME}_${SUBTEST_NAME}_TEST + COMMAND "/bin/sh" ${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME}/integration_test.sh + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${SIM_NAME}") + endfunction(exec_test) + +endif() diff --git a/tests/integration/README b/tests/integration/README new file mode 100644 index 000000000..9a97644f4 --- /dev/null +++ b/tests/integration/README @@ -0,0 +1 @@ +Integration tests diff --git a/tests/integration/ring/ring_ref_solution.h b/tests/integration/ring/ring_ref_solution.h index 8e6fb4c71..774d5b10a 100644 --- a/tests/integration/ring/ring_ref_solution.h +++ b/tests/integration/ring/ring_ref_solution.h @@ -26,6 +26,7 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + #include #include diff --git a/tests/regression/CMakeLists.txt b/tests/regression/CMakeLists.txt new file mode 100644 index 000000000..9b15a9447 --- /dev/null +++ b/tests/regression/CMakeLists.txt @@ -0,0 +1,32 @@ +# Copyright (c) 2016, Blue Brain Project +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. + + +# TEST_LIBRARIES variable is used by the CommonCTest.cmake script to link against the given libraries +set(TEST_LIBRARIES ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} coreneuron) + +# CommonCTest, in the current folder recursively compiles targets for *.cpp files using TEST_LIBRARIES +#include(CommonCTest) diff --git a/tests/regression/README b/tests/regression/README new file mode 100644 index 000000000..4968806e5 --- /dev/null +++ b/tests/regression/README @@ -0,0 +1 @@ +Regression tests diff --git a/tests/regression/test_header.hpp b/tests/regression/test_header.hpp new file mode 100644 index 000000000..b101640a6 --- /dev/null +++ b/tests/regression/test_header.hpp @@ -0,0 +1,20 @@ +#ifndef CORENEURON_TEST_HEADER_HPP +#define CORENEURON_TEST_HEADER_HPP + +#define BOOST_TEST_MODULE math +#define BOOST_TEST_MAIN +//#include + +#include +#include +#include + +namespace coreneuron { + namespace test { + + //something later + + } // end namespace test +} // end namespace CORENEURON + +#endif // CORENEURON_TEST_HEADER_HPP diff --git a/tests/unit/alignment/CMakeLists.txt b/tests/unit/alignment/CMakeLists.txt index 73ee81333..99af0eefe 100644 --- a/tests/unit/alignment/CMakeLists.txt +++ b/tests/unit/alignment/CMakeLists.txt @@ -24,6 +24,8 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + + include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS}) FILE(GLOB alignment_test_src "*.cpp") diff --git a/tests/unit/alignment/alignment.cpp b/tests/unit/alignment/alignment.cpp index fe28bc88f..cb4e007ac 100644 --- a/tests/unit/alignment/alignment.cpp +++ b/tests/unit/alignment/alignment.cpp @@ -25,6 +25,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + + #define BOOST_TEST_MODULE PaddingCheck #define BOOST_TEST_MAIN diff --git a/tests/unit/cmdline_interface/CMakeLists.txt b/tests/unit/cmdline_interface/CMakeLists.txt index b5b513516..496c57eea 100644 --- a/tests/unit/cmdline_interface/CMakeLists.txt +++ b/tests/unit/cmdline_interface/CMakeLists.txt @@ -25,10 +25,12 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
+ + FILE(GLOB cmd_interface_test_src "*.cpp") add_executable(cmd_interface_test_bin ${cmd_interface_test_src}) -target_link_libraries(cmd_interface_test_bin ${MPI_C_LIBRARIES} ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} coreneuron) +target_link_libraries(cmd_interface_test_bin ${MPI_C_LIBRARIES} ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} coreneuron ${reportinglib_LIBRARY}) add_test(NAME cmd_interface_test COMMAND ${TEST_EXEC_PREFIX} ${CMAKE_CURRENT_BINARY_DIR}/cmd_interface_test_bin) diff --git a/tests/unit/cmdline_interface/test_cmdline_interface.cpp b/tests/unit/cmdline_interface/test_cmdline_interface.cpp index 8d8f2c43c..e1b00c1a6 100644 --- a/tests/unit/cmdline_interface/test_cmdline_interface.cpp +++ b/tests/unit/cmdline_interface/test_cmdline_interface.cpp @@ -25,17 +25,19 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#define BOOST_TEST_MODULE cmdline_interface + + +#define BOOST_TEST_MODULE cmdline_interface #define BOOST_TEST_MAIN #include #include "nrniv/nrnoptarg.h" #include -BOOST_AUTO_TEST_CASE(cmdline_interface) +BOOST_AUTO_TEST_CASE(cmdline_interface) { cn_input_params input_params; - int argc = 14; + int argc = 18; char ** argv = new char*[argc]; argv[0] = (char*)"executable"; argv[1] = (char*)"--tstart=0.001"; @@ -51,6 +53,10 @@ BOOST_AUTO_TEST_CASE(cmdline_interface) argv[11] = (char*)"--filesdat=/this/is/the/path"; argv[12] = (char*)"--outpath=/this/is/the/path"; argv[13] = (char*)"--forwardskip=0.02"; + argv[14] = (char*)"--multiple=3"; + argv[15] = (char*)"--extracon=10"; + argv[16] = (char*)"--dt_report=0.25"; + argv[17] = (char*)"--report=2"; input_params.read_cb_opts(argc,argv); @@ -67,7 +73,13 @@ BOOST_AUTO_TEST_CASE(cmdline_interface) BOOST_CHECK(!strcmp(input_params.patternstim,(char*)"filespike.dat")); BOOST_CHECK(!strcmp(input_params.filesdat,(char*)"/this/is/the/path")); BOOST_CHECK(input_params.spikebuf==100); + BOOST_CHECK(!strcmp(input_params.outpath,(char*)"/this/is/the/path")); + BOOST_CHECK_CLOSE(input_params.forwardskip,0.02,DBL_EPSILON); + BOOST_CHECK(input_params.multiple==3); + BOOST_CHECK(input_params.extracon==10); + BOOST_CHECK_CLOSE(input_params.dt_report,0.25,DBL_EPSILON); + BOOST_CHECK(input_params.report==2); delete [] argv; } - + diff --git a/tests/unit/endian/CMakeLists.txt b/tests/unit/endian/CMakeLists.txt index b319009bb..c888d213c 100644 --- a/tests/unit/endian/CMakeLists.txt +++ b/tests/unit/endian/CMakeLists.txt @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + # Need to test the endian code against aggressive compiler optimization set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_${COMPILER_LANGUAGE}_OPT_AGGRESSIVE}") diff --git a/tests/unit/endian/endianness_test.cpp b/tests/unit/endian/endianness_test.cpp index 4cd96bd62..80f8e9494 100644 --- a/tests/unit/endian/endianness_test.cpp +++ b/tests/unit/endian/endianness_test.cpp @@ -25,6 +25,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ + + /* confirm that reported native endianness is big- or little-endian, and that * this corresponds with our expectations across 16-, 32- and 64-bit integers, * and floats and doubles (assumes IEEE 4- and 8- byte representation) */ diff --git a/tests/unit/endian/swap_endian_default.cpp b/tests/unit/endian/swap_endian_default.cpp index 5a1d5a088..ddb53fd86 100644 --- a/tests/unit/endian/swap_endian_default.cpp +++ b/tests/unit/endian/swap_endian_default.cpp @@ -25,6 +25,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + + /* Run endian tests with default configuration of swap_endian.h */ #define SWAP_ENDIAN_CONFIG Default diff --git a/tests/unit/endian/swap_endian_noasm.cpp b/tests/unit/endian/swap_endian_noasm.cpp index 697c5a752..5b5c27caa 100644 --- a/tests/unit/endian/swap_endian_noasm.cpp +++ b/tests/unit/endian/swap_endian_noasm.cpp @@ -25,6 +25,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + + /* Run endian tests with assembly and intrinsics disabled. */ diff --git a/tests/unit/endian/swap_endian_nounroll.cpp b/tests/unit/endian/swap_endian_nounroll.cpp index 6b445235d..3b35c6d1e 100644 --- a/tests/unit/endian/swap_endian_nounroll.cpp +++ b/tests/unit/endian/swap_endian_nounroll.cpp @@ -25,6 +25,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + + /* Run endian tests with unroll factor forced to be one. */ diff --git a/tests/unit/endian/swap_endian_oddunroll.cpp b/tests/unit/endian/swap_endian_oddunroll.cpp index f0d03bac5..91fc49e15 100644 --- a/tests/unit/endian/swap_endian_oddunroll.cpp +++ b/tests/unit/endian/swap_endian_oddunroll.cpp @@ -25,6 +25,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + + /* Run endian tests with an unusual, odd unroll factor. * * In practice, unroll should be left at the default value or set diff --git a/tests/unit/mechbuild/CMakeLists.txt b/tests/unit/mechbuild/CMakeLists.txt index d1d184b28..ac31a184d 100644 --- a/tests/unit/mechbuild/CMakeLists.txt +++ b/tests/unit/mechbuild/CMakeLists.txt @@ -24,6 +24,7 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. + include(TestScriptUtils) set(TEST_LABEL unit modinclude) diff --git a/tests/unit/omp/CMakeLists.txt b/tests/unit/omp/CMakeLists.txt index bd18a2ed7..dfde63d4c 100644 --- a/tests/unit/omp/CMakeLists.txt +++ b/tests/unit/omp/CMakeLists.txt @@ -24,10 +24,11 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
+ FILE(GLOB omp_test_src "*.cpp") add_executable(omp_test_bin ${omp_test_src}) -target_link_libraries(omp_test_bin ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} coreneuron) +target_link_libraries(omp_test_bin ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} coreneuron ${reportinglib_LIBRARY}) add_test(NAME omp_test COMMAND ${TEST_EXEC_PREFIX} ${CMAKE_CURRENT_BINARY_DIR}/omp_test_bin) diff --git a/tests/unit/omp/test_omp.cpp b/tests/unit/omp/test_omp.cpp index ef26f10b2..1574b8f51 100644 --- a/tests/unit/omp/test_omp.cpp +++ b/tests/unit/omp/test_omp.cpp @@ -25,6 +25,8 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + + #define BOOST_TEST_MODULE omp #define BOOST_TEST_MAIN diff --git a/tests/unit/queueing/CMakeLists.txt b/tests/unit/queueing/CMakeLists.txt new file mode 100644 index 000000000..ac6fb0bab --- /dev/null +++ b/tests/unit/queueing/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2016, Blue Brain Project +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. + + +FILE(GLOB queuing_test_src "*.cpp") + +add_executable(queuing_test_bin ${queuing_test_src}) +target_link_libraries(queuing_test_bin ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} ${Boost_SYSTEM_LIBRARY} coreneuron ${reportinglib_LIBRARY}) + +add_test(NAME queuing_test COMMAND ${TEST_EXEC_PREFIX} ${CMAKE_CURRENT_BINARY_DIR}/queuing_test_bin) diff --git a/tests/unit/queueing/test_header.hpp b/tests/unit/queueing/test_header.hpp new file mode 100644 index 000000000..801311a6d --- /dev/null +++ b/tests/unit/queueing/test_header.hpp @@ -0,0 +1,37 @@ +/* + * * Miniapp - test_header.hpp, Copyright (c), 2016, + * * Kai Langen - Swiss Federal Institute of technology in Lausanne, + * * kai.langen@epfl.ch, + * * All rights reserved. 
+ * * + * * This library is free software; you can redistribute it and/or + * * modify it under the terms of the GNU Lesser General Public + * * License as published by the Free Software Foundation; either + * * version 3.0 of the License, or (at your option) any later version. + * * + * * This library is distributed in the hope that it will be useful, + * * but WITHOUT ANY WARRANTY; without even the implied warranty of + * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * * Lesser General Public License for more details. + * * + * * You should have received a copy of the GNU Lesser General Public + * * License along with this library. + * */ + +#ifndef test_header_hpp +#define test_header_hpp + +#include +#include "coreneuron/nrniv/sptbinq.h" + +template +struct data{ + static const container cont = C; +}; + +typedef boost::mpl::list< + data, + data + > full_test_types; + +#endif diff --git a/tests/unit/queueing/test_queueing.cpp b/tests/unit/queueing/test_queueing.cpp new file mode 100644 index 000000000..51a3611ab --- /dev/null +++ b/tests/unit/queueing/test_queueing.cpp @@ -0,0 +1,159 @@ +/* +Copyright (c) 2016, Blue Brain Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +/* + * Neuromapp - test.cpp, Copyright (c), 2015, + * Kai Langen - Swiss Federal Institute of technology in Lausanne, + * kai.langen@epfl.ch, + * All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3.0 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. + */ +/** + * @file neuromapp/test/queuing/test.cpp + * Test on the Queueing Miniapp. 
+ */ + +#define BOOST_TEST_MODULE QueueingTest +#define TYPE T::cont + +#include +#include +#include + +#include +#include +#include +//#include "test/unit/queueing/test_header.hpp" +#include "coreneuron/nrniv/netcvode.h" +#include "coreneuron/nrniv/tqueue.h" + +namespace bfs = ::boost::filesystem; + +//UNIT TESTS +BOOST_AUTO_TEST_CASE(priority_queue_nq_dq){ + TQueue tq = TQueue(); + const int num = 8; + int cnter = 0; + //enqueue 8 items with increasing time + for(int i = 0; i < num; ++i) + tq.insert(static_cast(i), NULL); + + BOOST_CHECK(tq.pq_que_.size() == (num - 1)); + + // dequeue items with time <= 5.0. Should be 6 events: from 0. to 5. + TQItem *item = NULL; + while((item = tq.atomic_dq(5.0)) != NULL){ + ++cnter; + delete item; + } + BOOST_CHECK(cnter == 6); + BOOST_CHECK(tq.pq_que_.size() == (num - 6 - 1)); + + //dequeue the rest + while((item = tq.atomic_dq(8.0)) != NULL){ + ++cnter; + delete item; + } + + BOOST_CHECK(cnter == num); + BOOST_CHECK(tq.pq_que_.size() == 0); + BOOST_CHECK(tq.least() == NULL); +} + +BOOST_AUTO_TEST_CASE(tqueue_ordered_test){ + TQueue tq = TQueue(); + const int num = 10; + int cnter = 0; + double time = double(); + + //insert N items with time < N + for(int i = 0; i < num; ++i){ + time = static_cast(rand() % num); + tq.insert(time, NULL); + } + + time = 0.0; + TQItem *item = NULL; + //dequeue all items and check that previous item time <= current item time + while((item = tq.atomic_dq(10.0)) != NULL){ + BOOST_CHECK(time <= item->t_); + ++cnter; + time = item->t_; + delete item; + } + BOOST_CHECK(cnter == num); + BOOST_CHECK(tq.pq_que_.size() == 0); + BOOST_CHECK(tq.least() == NULL); +} + +BOOST_AUTO_TEST_CASE(tqueue_move_nolock){ +} + +BOOST_AUTO_TEST_CASE(tqueue_remove){ +} + +BOOST_AUTO_TEST_CASE(threaddata_interthread_send){ + NetCvodeThreadData nt = NetCvodeThreadData(); + const size_t num = 6; + for(size_t i = 0; i < num; ++i) + nt.interthread_send(static_cast(i), NULL, NULL); + + BOOST_CHECK(nt.inter_thread_events_.size() == num); +} +/* +BOOST_AUTO_TEST_CASE(threaddata_enqueue){ + NetCvode n = NetCvode(); + const int num = 6; + for(int i = 0; i < num; ++i) + n.p[1].interthread_send(static_cast(i), NULL, NULL); + + BOOST_CHECK(n.p[1].inter_thread_events_.size() == num); + + //enqueue the inter_thread_events_ + n.p[1].enqueue(&n, &(n.p[1])); + BOOST_CHECK(n.p[1].inter_thread_events_.size() == 0); + BOOST_CHECK(n.p[1].tqe_->pq_que_.size() == num); + + //cleanup priority queue + TQItem* item = NULL; + while((item = n.p[1].tqe_->atomic_dq(6.0)) != NULL) + delete item; +}*/ diff --git a/tests/unit/sdprintf/CMakeLists.txt b/tests/unit/sdprintf/CMakeLists.txt index 030adb487..575bc97e9 100644 --- a/tests/unit/sdprintf/CMakeLists.txt +++ b/tests/unit/sdprintf/CMakeLists.txt @@ -24,11 +24,13 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
+ + include_directories(${CMAKE_SOURCE_DIR}/coreneuron ${Boost_INCLUDE_DIRS}) FILE(GLOB sdprintf_test_src "*.cpp") add_executable(sdprintf_test_bin ${sdprintf_test_src}) -target_link_libraries(sdprintf_test_bin ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} coreneuron) +target_link_libraries(sdprintf_test_bin ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} coreneuron ${reportinglib_LIBRARY}) add_test(NAME sdprintf_test COMMAND ${TEST_EXEC_PREFIX} ${CMAKE_CURRENT_BINARY_DIR}/sdprintf_test_bin) diff --git a/tests/unit/sdprintf/test_sdprintf.cpp b/tests/unit/sdprintf/test_sdprintf.cpp index 9ecba7630..8ca41f754 100644 --- a/tests/unit/sdprintf/test_sdprintf.cpp +++ b/tests/unit/sdprintf/test_sdprintf.cpp @@ -26,6 +26,7 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + #define BOOST_TEST_MODULE sdprintf #define BOOST_TEST_MAIN