From e8378a929a924c4b218c86d3cfebdf204254f4f1 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Thu, 7 Jul 2022 12:45:08 -0500 Subject: [PATCH] Initial public release of rocHPL --- .clang-format | 90 ++ .gitignore | 47 + CMakeLists.txt | 217 ++++ LICENSE | 71 ++ README.md | 151 +++ cmake/Dependencies.cmake | 152 +++ include/hpl.hpp | 64 + include/hpl_auxil.hpp | 90 ++ include/hpl_blas.hpp | 266 ++++ include/hpl_comm.hpp | 94 ++ include/hpl_grid.hpp | 106 ++ include/hpl_misc.hpp | 66 + include/hpl_panel.hpp | 146 +++ include/hpl_pauxil.hpp | 287 +++++ include/hpl_pfact.hpp | 199 +++ include/hpl_pgesv.hpp | 150 +++ include/hpl_pmatgen.hpp | 73 ++ include/hpl_pmisc.hpp | 29 + include/hpl_ptest.hpp | 118 ++ include/hpl_ptimer.hpp | 71 ++ include/hpl_version.hpp.in | 24 + install.sh | 390 ++++++ scripts/HPL.dat | 31 + scripts/mpirun_rochpl.in | 198 +++ scripts/run_rochpl.in | 410 +++++++ src/HPL_InitGPU.cpp | 119 ++ src/HPL_pddriver.cpp | 285 +++++ src/HPL_pdinfo.cpp | 1557 ++++++++++++++++++++++++ src/HPL_pdtest.cpp | 501 ++++++++ src/auxil/HPL_abort.cpp | 74 ++ src/auxil/HPL_dlacpy.cpp | 68 ++ src/auxil/HPL_dlamch.cpp | 763 ++++++++++++ src/auxil/HPL_dlange.cpp | 132 ++ src/auxil/HPL_dlaprnt.cpp | 76 ++ src/auxil/HPL_dlatcpy.cpp | 68 ++ src/auxil/HPL_dlatcpy_device.cpp | 113 ++ src/auxil/HPL_fprintf.cpp | 53 + src/auxil/HPL_warn.cpp | 80 ++ src/blas/HPL_daxpy.cpp | 43 + src/blas/HPL_dgemm.cpp | 65 + src/blas/HPL_dgemv.cpp | 60 + src/blas/HPL_dger.cpp | 47 + src/blas/HPL_dscal.cpp | 41 + src/blas/HPL_idamax.cpp | 64 + src/comm/HPL_all_reduce.cpp | 59 + src/comm/HPL_all_reduce_dmxswp.cpp | 298 +++++ src/comm/HPL_allgatherv.cpp | 128 ++ src/comm/HPL_barrier.cpp | 40 + src/comm/HPL_bcast.cpp | 82 ++ src/comm/HPL_bcast_1rinM.cpp | 109 ++ src/comm/HPL_bcast_1ring.cpp | 99 ++ src/comm/HPL_bcast_2rinM.cpp | 165 +++ src/comm/HPL_bcast_2ring.cpp | 153 +++ src/comm/HPL_bcast_blonM.cpp | 185 +++ src/comm/HPL_bcast_blong.cpp | 161 +++ src/comm/HPL_broadcast.cpp | 58 + src/comm/HPL_recv.cpp | 63 + src/comm/HPL_reduce.cpp | 74 ++ src/comm/HPL_scatterv.cpp | 125 ++ src/comm/HPL_sdrv.cpp | 91 ++ src/comm/HPL_send.cpp | 60 + src/grid/HPL_grid_exit.cpp | 58 + src/grid/HPL_grid_info.cpp | 66 + src/grid/HPL_grid_init.cpp | 190 +++ src/matgen/HPL_pdmatgen.cpp | 262 ++++ src/matgen/HPL_pdrandmat_device.cpp | 201 +++ src/matgen/HPL_xjumpm.cpp | 92 ++ src/panel/HPL_pdpanel_SendToDevice.cpp | 216 ++++ src/panel/HPL_pdpanel_SendToHost.cpp | 28 + src/panel/HPL_pdpanel_bcast.cpp | 56 + src/panel/HPL_pdpanel_disp.cpp | 48 + src/panel/HPL_pdpanel_free.cpp | 56 + src/panel/HPL_pdpanel_init.cpp | 475 ++++++++ src/panel/HPL_pdpanel_new.cpp | 105 ++ src/panel/HPL_pdpanel_wait.cpp | 22 + src/pauxil/HPL_dlaswp00N_device.cpp | 111 ++ src/pauxil/HPL_dlaswp01T_device.cpp | 135 ++ src/pauxil/HPL_dlaswp02T_device.cpp | 106 ++ src/pauxil/HPL_dlaswp03T_device.cpp | 133 ++ src/pauxil/HPL_dlaswp04T_device.cpp | 128 ++ src/pauxil/HPL_dlaswp10N_device.cpp | 91 ++ src/pauxil/HPL_indxg2l.cpp | 96 ++ src/pauxil/HPL_indxg2lp.cpp | 116 ++ src/pauxil/HPL_indxg2p.cpp | 74 ++ src/pauxil/HPL_indxl2g.cpp | 105 ++ src/pauxil/HPL_infog2l.cpp | 280 +++++ src/pauxil/HPL_numroc.cpp | 67 + src/pauxil/HPL_numrocI.cpp | 185 +++ src/pauxil/HPL_pabort.cpp | 85 ++ src/pauxil/HPL_pdlamch.cpp | 87 ++ src/pauxil/HPL_pdlange_device.cpp | 302 +++++ src/pauxil/HPL_pwarn.cpp | 89 ++ src/pfact/HPL_dlocmax.cpp | 110 ++ src/pfact/HPL_dlocswpN.cpp | 150 +++ src/pfact/HPL_dlocswpT.cpp | 150 +++ src/pfact/HPL_pdfact.cpp | 109 ++ src/pfact/HPL_pdmxswp.cpp | 132 ++ 
src/pfact/HPL_pdpancrN.cpp | 233 ++++ src/pfact/HPL_pdpancrT.cpp | 232 ++++ src/pfact/HPL_pdpanllN.cpp | 224 ++++ src/pfact/HPL_pdpanllT.cpp | 223 ++++ src/pfact/HPL_pdpanrlN.cpp | 228 ++++ src/pfact/HPL_pdpanrlT.cpp | 224 ++++ src/pfact/HPL_pdrpancrN.cpp | 214 ++++ src/pfact/HPL_pdrpancrT.cpp | 213 ++++ src/pfact/HPL_pdrpanllN.cpp | 193 +++ src/pfact/HPL_pdrpanllT.cpp | 193 +++ src/pfact/HPL_pdrpanrlN.cpp | 198 +++ src/pfact/HPL_pdrpanrlT.cpp | 198 +++ src/pgesv/HPL_pdgesv.cpp | 409 +++++++ src/pgesv/HPL_pdlaswp.cpp | 532 ++++++++ src/pgesv/HPL_pdtrsv_device.cpp | 352 ++++++ src/pgesv/HPL_pdupdateNT.cpp | 169 +++ src/pgesv/HPL_pdupdateTT.cpp | 168 +++ src/pgesv/HPL_perm.cpp | 89 ++ src/pgesv/HPL_pipid.cpp | 164 +++ src/pgesv/HPL_piplen.cpp | 58 + src/pgesv/HPL_plindx.cpp | 238 ++++ src/timer/HPL_ptimer.cpp | 262 ++++ src/timer/HPL_ptimer_cputime.cpp | 45 + src/timer/HPL_ptimer_walltime.cpp | 29 + 121 files changed, 19503 insertions(+) create mode 100644 .clang-format create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 README.md create mode 100644 cmake/Dependencies.cmake create mode 100644 include/hpl.hpp create mode 100644 include/hpl_auxil.hpp create mode 100644 include/hpl_blas.hpp create mode 100644 include/hpl_comm.hpp create mode 100644 include/hpl_grid.hpp create mode 100644 include/hpl_misc.hpp create mode 100644 include/hpl_panel.hpp create mode 100644 include/hpl_pauxil.hpp create mode 100644 include/hpl_pfact.hpp create mode 100644 include/hpl_pgesv.hpp create mode 100644 include/hpl_pmatgen.hpp create mode 100644 include/hpl_pmisc.hpp create mode 100644 include/hpl_ptest.hpp create mode 100644 include/hpl_ptimer.hpp create mode 100644 include/hpl_version.hpp.in create mode 100755 install.sh create mode 100644 scripts/HPL.dat create mode 100755 scripts/mpirun_rochpl.in create mode 100755 scripts/run_rochpl.in create mode 100644 src/HPL_InitGPU.cpp create mode 100644 src/HPL_pddriver.cpp create mode 100644 src/HPL_pdinfo.cpp create mode 100644 src/HPL_pdtest.cpp create mode 100644 src/auxil/HPL_abort.cpp create mode 100644 src/auxil/HPL_dlacpy.cpp create mode 100644 src/auxil/HPL_dlamch.cpp create mode 100644 src/auxil/HPL_dlange.cpp create mode 100644 src/auxil/HPL_dlaprnt.cpp create mode 100644 src/auxil/HPL_dlatcpy.cpp create mode 100644 src/auxil/HPL_dlatcpy_device.cpp create mode 100644 src/auxil/HPL_fprintf.cpp create mode 100644 src/auxil/HPL_warn.cpp create mode 100644 src/blas/HPL_daxpy.cpp create mode 100644 src/blas/HPL_dgemm.cpp create mode 100644 src/blas/HPL_dgemv.cpp create mode 100644 src/blas/HPL_dger.cpp create mode 100644 src/blas/HPL_dscal.cpp create mode 100644 src/blas/HPL_idamax.cpp create mode 100644 src/comm/HPL_all_reduce.cpp create mode 100644 src/comm/HPL_all_reduce_dmxswp.cpp create mode 100644 src/comm/HPL_allgatherv.cpp create mode 100644 src/comm/HPL_barrier.cpp create mode 100644 src/comm/HPL_bcast.cpp create mode 100644 src/comm/HPL_bcast_1rinM.cpp create mode 100644 src/comm/HPL_bcast_1ring.cpp create mode 100644 src/comm/HPL_bcast_2rinM.cpp create mode 100644 src/comm/HPL_bcast_2ring.cpp create mode 100644 src/comm/HPL_bcast_blonM.cpp create mode 100644 src/comm/HPL_bcast_blong.cpp create mode 100644 src/comm/HPL_broadcast.cpp create mode 100644 src/comm/HPL_recv.cpp create mode 100644 src/comm/HPL_reduce.cpp create mode 100644 src/comm/HPL_scatterv.cpp create mode 100644 src/comm/HPL_sdrv.cpp create mode 100644 src/comm/HPL_send.cpp create mode 100644 src/grid/HPL_grid_exit.cpp create 
mode 100644 src/grid/HPL_grid_info.cpp create mode 100644 src/grid/HPL_grid_init.cpp create mode 100644 src/matgen/HPL_pdmatgen.cpp create mode 100644 src/matgen/HPL_pdrandmat_device.cpp create mode 100644 src/matgen/HPL_xjumpm.cpp create mode 100644 src/panel/HPL_pdpanel_SendToDevice.cpp create mode 100644 src/panel/HPL_pdpanel_SendToHost.cpp create mode 100644 src/panel/HPL_pdpanel_bcast.cpp create mode 100644 src/panel/HPL_pdpanel_disp.cpp create mode 100644 src/panel/HPL_pdpanel_free.cpp create mode 100644 src/panel/HPL_pdpanel_init.cpp create mode 100644 src/panel/HPL_pdpanel_new.cpp create mode 100644 src/panel/HPL_pdpanel_wait.cpp create mode 100644 src/pauxil/HPL_dlaswp00N_device.cpp create mode 100644 src/pauxil/HPL_dlaswp01T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp02T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp03T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp04T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp10N_device.cpp create mode 100644 src/pauxil/HPL_indxg2l.cpp create mode 100644 src/pauxil/HPL_indxg2lp.cpp create mode 100644 src/pauxil/HPL_indxg2p.cpp create mode 100644 src/pauxil/HPL_indxl2g.cpp create mode 100644 src/pauxil/HPL_infog2l.cpp create mode 100644 src/pauxil/HPL_numroc.cpp create mode 100644 src/pauxil/HPL_numrocI.cpp create mode 100644 src/pauxil/HPL_pabort.cpp create mode 100644 src/pauxil/HPL_pdlamch.cpp create mode 100644 src/pauxil/HPL_pdlange_device.cpp create mode 100644 src/pauxil/HPL_pwarn.cpp create mode 100644 src/pfact/HPL_dlocmax.cpp create mode 100644 src/pfact/HPL_dlocswpN.cpp create mode 100644 src/pfact/HPL_dlocswpT.cpp create mode 100644 src/pfact/HPL_pdfact.cpp create mode 100644 src/pfact/HPL_pdmxswp.cpp create mode 100644 src/pfact/HPL_pdpancrN.cpp create mode 100644 src/pfact/HPL_pdpancrT.cpp create mode 100644 src/pfact/HPL_pdpanllN.cpp create mode 100644 src/pfact/HPL_pdpanllT.cpp create mode 100644 src/pfact/HPL_pdpanrlN.cpp create mode 100644 src/pfact/HPL_pdpanrlT.cpp create mode 100644 src/pfact/HPL_pdrpancrN.cpp create mode 100644 src/pfact/HPL_pdrpancrT.cpp create mode 100644 src/pfact/HPL_pdrpanllN.cpp create mode 100644 src/pfact/HPL_pdrpanllT.cpp create mode 100644 src/pfact/HPL_pdrpanrlN.cpp create mode 100644 src/pfact/HPL_pdrpanrlT.cpp create mode 100644 src/pgesv/HPL_pdgesv.cpp create mode 100644 src/pgesv/HPL_pdlaswp.cpp create mode 100644 src/pgesv/HPL_pdtrsv_device.cpp create mode 100644 src/pgesv/HPL_pdupdateNT.cpp create mode 100644 src/pgesv/HPL_pdupdateTT.cpp create mode 100644 src/pgesv/HPL_perm.cpp create mode 100644 src/pgesv/HPL_pipid.cpp create mode 100644 src/pgesv/HPL_piplen.cpp create mode 100644 src/pgesv/HPL_plindx.cpp create mode 100644 src/timer/HPL_ptimer.cpp create mode 100644 src/timer/HPL_ptimer_cputime.cpp create mode 100644 src/timer/HPL_ptimer_walltime.cpp diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..d659c15 --- /dev/null +++ b/.clang-format @@ -0,0 +1,90 @@ +--- +Language: Cpp +AccessModifierOffset: 0 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: true +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortFunctionsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: 
false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 1000 +PointerAlignment: Left +ReflowComments: true +SortIncludes: false +SpaceAfterCStyleCast: false +# SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: Never +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 2 +UseTab: Never +... + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cdc7121 --- /dev/null +++ b/.gitignore @@ -0,0 +1,47 @@ +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# vim tags +tags +.tags +.*.swp + +# Editors +.vscode + +# build-in-source directory +build + +# doc directory +docBin +_build + +#third-party software +tpl/ +ltmain.sh \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..645594b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,217 @@ +# Modifications (c) 2018-2022 Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+
+option(HPL_VERBOSE_PRINT "Enable printing to terminal during run" OFF)
+option(HPL_PROGRESS_REPORT "Enable printing progress report to terminal during run" OFF)
+option(HPL_DETAILED_TIMING "Enable detailed timers during run" OFF)
+
+option(ROCM_PATH "Path to ROCm install" /opt/rocm)
+option(HPL_BLAS_DIR "Path to CPU BLAS library" ${CMAKE_CURRENT_SOURCE_DIR}/tpl/openblas)
+option(HPL_MPI_DIR "Path to MPI library" ${CMAKE_CURRENT_SOURCE_DIR}/tpl/openmpi)
+
+option(HPL_OPENMPI_UCX "Compile WITH OpenMPI+UCX support." OFF)
+
+set(CMAKE_INSTALL_PREFIX "rocHPL" CACHE PATH "Install path prefix, prepended onto install directories")
+
+# CMake modules
+list(APPEND CMAKE_MODULE_PATH
+     ${CMAKE_CURRENT_SOURCE_DIR}/cmake
+     ${ROCM_PATH}/hip/cmake)
+
+# Set a default build type if none was specified
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to 'Release' as none was specified.")
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+# Honor per-config flags in try_compile() source-file signature.
cmake v3.7 and up +if(POLICY CMP0066) + cmake_policy(SET CMP0066 NEW) +endif() + +# rocHPL project +project(rochpl LANGUAGES CXX) + +# Build flags +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +# Build options +option(HPL_DEBUG "Compile with modest debugging turned on" OFF) +option(HPL_DETAILED_DEBUG "Compile with voluminous debugging information turned on" OFF) +option(HPL_DETAILED_TIMING "Enable detail timers" OFF) +option(HPL_REFERENCE "Build reference mode" OFF) +option(BUILD_TEST "Build rocHPL single-node test" OFF) + +# Dependencies +include(cmake/Dependencies.cmake) + +# Setup version +rocm_setup_version(VERSION 6.0.0) + +# This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on all the time +# This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# HPL sources +file(GLOB_RECURSE rochpl_source RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "src/*.cpp") + +# HPL device sources +file(GLOB_RECURSE rochpl_device_source RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "src/*_device.cpp") +list(REMOVE_ITEM rochpl_source ${rochpl_device_source}) + +# Flag source files as hip source files +foreach(i ${rochpl_device_source}) + set_source_files_properties(${i} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) +endforeach() + +# HIP flags workaround while target_compile_options does not work +list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -fPIE") +list(APPEND CMAKE_HOST_FLAGS "") + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HIPCC_FLAGS "-g -ggdb") + list(APPEND CMAKE_HOST_FLAGS "-O0;-g") +else() + list(APPEND HIP_HIPCC_FLAGS "-O3 -march=native -ffp-contract=fast -ffast-math -funsafe-math-optimizations") + list(APPEND CMAKE_HOST_FLAGS "-O3;-march=native") +endif() + + +# Availability of rocm_check_target_ids command assures that we can also build +# for gfx90a target +if(COMMAND rocm_check_target_ids) + set(DEFAULT_AMDGPU_TARGETS "gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx908:xnack+;gfx90a:xnack-;gfx90a:xnack+") +else() + set(DEFAULT_AMDGPU_TARGETS "gfx900;gfx906;gfx908;gfx908") +endif() +set(TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") + +# AMD targets +foreach(target ${TARGETS}) + list(APPEND HIP_HIPCC_FLAGS "--amdgpu-target=${target}") +endforeach() + +# Target executable +hip_add_executable(rochpl ${rochpl_source} ${rochpl_device_source}) + +target_compile_options(rochpl PRIVATE ${CMAKE_HOST_FLAGS}) + +if(HPL_DEBUG) + target_compile_definitions(rochpl PRIVATE HPL_DEBUG) +endif() + +if(HPL_VERBOSE_PRINT) + target_compile_definitions(rochpl PRIVATE HPL_VERBOSE_PRINT) +endif() + +if(HPL_DETAILED_TIMING) + target_compile_definitions(rochpl PRIVATE HPL_DETAILED_TIMING) +endif() + +if(HPL_PROGRESS_REPORT) + target_compile_definitions(rochpl PRIVATE HPL_PROGRESS_REPORT) +endif() + +# Target include directories +target_include_directories(rochpl + PRIVATE + $ + $ + $ + $ + $) + +#HIP +target_link_libraries(rochpl PRIVATE hip::host) + +# MPI +target_link_libraries(rochpl PRIVATE MPI::MPI_CXX) + +# OpenMP +target_link_libraries(rochpl PRIVATE OpenMP::OpenMP_CXX) + +# Target link libraries +target_link_libraries(rochpl PRIVATE BLAS::BLAS) +target_link_libraries(rochpl PRIVATE roc::rocblas) +target_link_libraries(rochpl PRIVATE roc::roctracer) +target_link_libraries(rochpl PRIVATE roc::roctx) + +# Target properties +set_target_properties(rochpl PROPERTIES 
VERSION ${rochpl_VERSION}) +set_target_properties(rochpl PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + +set_target_properties(rochpl PROPERTIES LINKER_LANGUAGE CXX) + +target_link_options(rochpl PRIVATE "-fopenmp") + +set_target_properties(rochpl PROPERTIES HIP_ARCHITECTURES "${DEFAULT_AMDGPU_TARGETS}") + +# Configure a header file to pass the rocHPL version +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/include/hpl_version.hpp.in" + "${PROJECT_BINARY_DIR}/include/hpl_version.hpp") + +# Configure run scripts +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_rochpl.in" + "${CMAKE_BINARY_DIR}/run_rochpl" + @ONLY) +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/scripts/mpirun_rochpl.in" + "${CMAKE_BINARY_DIR}/mpirun_rochpl" + @ONLY) + +#move input file +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/HPL.dat + DESTINATION ${CMAKE_BINARY_DIR}) + +# Install targets +rocm_install_targets(TARGETS rochpl) + +install(PROGRAMS ${CMAKE_BINARY_DIR}/run_rochpl ${CMAKE_BINARY_DIR}/mpirun_rochpl + DESTINATION ${CMAKE_INSTALL_PREFIX}) +install(FILES ${CMAKE_BINARY_DIR}/HPL.dat + DESTINATION ${CMAKE_INSTALL_PREFIX}) + +# Package specific CPACK vars +set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 3.5.0)") +set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 3.5.0") +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") + +if(NOT CPACK_PACKAGING_INSTALL_PREFIX) + set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") +endif() + +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" "\${CPACK_PACKAGING_INSTALL_PREFIX}/include") + +# Package name +set(package_name rochpl) + +rocm_create_package( + NAME ${package_name} + DESCRIPTION "Radeon Open Compute HPL application" + MAINTAINER "Noel Chalmers") diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f603960 --- /dev/null +++ b/LICENSE @@ -0,0 +1,71 @@ +====================================================================== + -- High Performance Computing Linpack Benchmark (HPL) + HPL - 2.2 - February 24, 2016 + Antoine P. Petitet + University of Tennessee, Knoxville + Innovative Computing Laboratory + (C) Copyright 2000-2008 All Rights Reserved + + -- Copyright notice and Licensing terms: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. All advertising materials mentioning features or use of this + software must display the following acknowledgement: + This product includes software developed at the University of + Tennessee, Knoxville, Innovative Computing Laboratory. + + 4. The name of the University, the name of the Laboratory, or the + names of its contributors may not be used to endorse or promote + products derived from this software without specific written + permission. + + -- Disclaimer: + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +====================================================================== + +Modifications (c) 2018-2022 Advanced Micro Devices, Inc. +Modified by: Noel Chalmers + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..bad2d62 --- /dev/null +++ b/README.md @@ -0,0 +1,151 @@ +# rocHPL +rocHPL is a benchmark based on the [HPL][] benchmark application, implemented on top of AMD's Radeon Open Compute [ROCm][] Platform, runtime, and toolchains. rocHPL is created using the [HIP][] programming language and optimized for AMD's latest discrete GPUs. 
+
+## Requirements
+* Git
+* CMake (3.10 or later)
+* MPI (Optional)
+* AMD [ROCm] platform (3.5 or later)
+* [rocBLAS][]
+
+## Quickstart rocHPL build and install
+
+#### Install script
+You can build rocHPL using the `install.sh` script:
+```
+# Clone rocHPL using git
+git clone https://github.com/ROCmSoftwarePlatform/rocHPL.git
+
+# Go to rocHPL directory
+cd rocHPL
+
+# Run install.sh script
+# Command line options:
+# -h|--help         - prints this help message
+# -g|--debug        - Set build type to Debug (otherwise build Release)
+# --prefix=         - Path to rocHPL install location (Default: build/rocHPL)
+# --with-rocm=      - Path to ROCm install (Default: /opt/rocm)
+# --with-rocblas=   - Path to rocBLAS library (Default: /opt/rocm/rocblas)
+# --with-cpublas=   - Path to external CPU BLAS library (Default: clone+build AMD BLIS)
+# --with-mpi=       - Path to external MPI install (Default: clone+build OpenMPI)
+# --verbose-print   - Verbose output during HPL setup (Default: true)
+# --progress-report - Print progress report to terminal during HPL run (Default: true)
+# --detailed-timing - Record detailed timers during HPL run (Default: true)
+./install.sh
+```
+By default, [BLIS] v3.1, [UCX] v1.12.1, and [OpenMPI] v4.1.4 will be cloned and built in rocHPL/tpl. After building, the `rochpl` executable is placed in build/rochpl-install.
+
+## Running rocHPL benchmark application
+rocHPL provides some helpful wrapper scripts. A wrapper script for launching via `mpirun` is provided in `mpirun_rochpl`. This script has two distinct run modes. The first takes the problem size and grid parameters directly on the command line:
+```
+mpirun_rochpl -P <P> -Q <Q> -N <N> --NB <NB> -f <frac>
+# where
+# P    - is the number of rows in the MPI grid
+# Q    - is the number of columns in the MPI grid
+# N    - is the total number of rows/columns of the global matrix
+# NB   - is the panel size in the blocking algorithm
+# frac - is the split-update fraction (important for hiding some MPI communication)
+```
+This run script will launch a total of np=PxQ MPI processes.
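+
+For example, a single-node run on four GPUs arranged in a 2 x 2 grid (using the 4 GPU problem size from the Performance evaluation section below; the split-update fraction of 0.6 is only an illustrative value, not a tuned recommendation):
+```
+mpirun_rochpl -P 2 -Q 2 -N 126976 --NB 512 -f 0.6
+```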

+
+The second run mode takes an input file together with a number of MPI processes:
+```
+mpirun_rochpl -P <P> -Q <Q> -i <input> -f <frac>
+# where
+# P     - is the number of rows in the MPI grid
+# Q     - is the number of columns in the MPI grid
+# input - is the input filename (default HPL.dat)
+# frac  - is the split-update fraction (important for hiding some MPI communication)
+```
+
+The input file accepted by the `rochpl` executable follows the format below:
+```
+HPLinpack benchmark input file
+Innovative Computing Laboratory, University of Tennessee
+HPL.out      output file name (if any)
+0            device out (6=stdout,7=stderr,file)
+1            # of problems sizes (N)
+45312        Ns
+1            # of NBs
+384          NBs
+1            PMAP process mapping (0=Row-,1=Column-major)
+1            # of process grids (P x Q)
+1            Ps
+1            Qs
+16.0         threshold
+1            # of panel fact
+2            PFACTs (0=left, 1=Crout, 2=Right)
+1            # of recursive stopping criterium
+2            NBMINs (>= 1)
+1            # of panels in recursion
+2            NDIVs
+1            # of recursive panel fact.
+2            RFACTs (0=left, 1=Crout, 2=Right)
+1            # of broadcast
+6            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=Ibcast)
+1            # of lookahead depth
+1            DEPTHs (>=0)
+1            SWAP (0=bin-exch,1=long,2=mix)
+64           swapping threshold
+1            L1 in (0=transposed,1=no-transposed) form
+0            U in (0=transposed,1=no-transposed) form
+0            Equilibration (0=no,1=yes)
+8            memory alignment in double (> 0)
+```
+
+The `mpirun_rochpl` script wraps a second script, `run_rochpl`, wherein some CPU core bindings are determined automatically based on the node-local MPI grid. Users wishing to launch rocHPL via a workload manager such as slurm may directly use this run script. For example,
+```
+srun -N 2 -n 16 run_rochpl -P 4 -Q 4 -N 128000 --NB 512
+```
+When launching to multiple compute nodes, it can be useful to specify the local MPI grid layout on each node. To specify this, the `-p` and `-q` input parameters are used. For example, the srun line above launches to two compute nodes, each with 8 GPUs. The local MPI grid layout can be specified as either:
+```
+srun -N 2 -n 16 run_rochpl -P 4 -Q 4 -p 2 -q 4 -N 128000 --NB 512
+```
+or
+```
+srun -N 2 -n 16 run_rochpl -P 4 -Q 4 -p 4 -q 2 -N 128000 --NB 512
+```
+This helps to control where, and how much, inter-node communication is occurring.
+
+## Performance evaluation
+rocHPL is typically weak scaled so that the global matrix fills all available VRAM on all GPUs. The matrix size N is usually selected to be a multiple of the blocksize NB. Some sample runs on 32GB MI100 GPUs include:
+* 1 MI100: `mpirun_rochpl -P 1 -Q 1 -N 64512 --NB 512`
+* 2 MI100: `mpirun_rochpl -P 1 -Q 2 -N 90112 --NB 512`
+* 4 MI100: `mpirun_rochpl -P 2 -Q 2 -N 126976 --NB 512`
+* 8 MI100: `mpirun_rochpl -P 2 -Q 4 -N 180224 --NB 512`
+
+Overall performance of the benchmark is measured in 64-bit floating point operations (FLOPs) per second. Performance is reported at the end of the run to the user's specified output (by default, performance is printed to stdout and to a results file, HPL.out).
+
+See [the Wiki](../../wiki/Common-rocHPL-run-configurations) for some common run configurations for various AMD Instinct GPUs.
+
+## Testing rocHPL
+At the end of each benchmark run, a residual error check is computed, and PASS or FAIL is printed to output.
+
+The simplest suite of tests should run configurations from 1 to 4 GPUs to exercise different communication code paths. For example, the tests:
+```
+mpirun_rochpl -P 1 -Q 1 -N 45312
+mpirun_rochpl -P 1 -Q 2 -N 45312
+mpirun_rochpl -P 2 -Q 1 -N 45312
+mpirun_rochpl -P 2 -Q 2 -N 45312
+```
+should all report PASSED.
+
+Please note that for successful testing, a device with at least 16GB of device memory is required.
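+
+These four configurations can also be scripted. The loop below is a minimal sketch: it assumes the `mpirun_rochpl` wrapper sits in the current directory and that the verdict line of the output contains the word PASSED or FAILED.
+```
+#!/bin/bash
+# Run the 1-4 GPU test grids from above and print only the residual-check verdict
+for PQ in "1 1" "1 2" "2 1" "2 2"; do
+  set -- $PQ   # split "P Q" into positional parameters $1 and $2
+  echo "Testing P=$1 Q=$2"
+  ./mpirun_rochpl -P $1 -Q $2 -N 45312 | grep -E "PASSED|FAILED"
+done
+```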
+
+## Support
+Please use [the issue tracker][] for bugs and feature requests.
+
+## License
+The [license file][] can be found in the main repository.
+
+[HPL]: http://icl.utk.edu/hpl/
+[ROCm]: https://github.com/RadeonOpenCompute/ROCm
+[HIP]: https://github.com/ROCm-Developer-Tools/HIP
+[rocBLAS]: https://github.com/ROCmSoftwarePlatform/rocBLAS
+[BLIS]: https://github.com/amd/blis
+[OpenMPI]: https://github.com/open-mpi/ompi
+[UCX]: https://github.com/openucx/ucx
+[the issue tracker]: https://github.com/ROCmSoftwarePlatform/rocHPL/issues
+[license file]: https://github.com/ROCmSoftwarePlatform/rocHPL
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
new file mode 100644
index 0000000..7cbc012
--- /dev/null
+++ b/cmake/Dependencies.cmake
@@ -0,0 +1,152 @@
+# Modifications (c) 2019-2022 Advanced Micro Devices, Inc.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# Dependencies
+
+# Git
+find_package(Git REQUIRED)
+
+# Look for a BLAS lib
+# For some reason cmake doesn't let us manually specify a search path in FindBLAS,
+# so let's add our own libraries
+get_filename_component(HPL_BLAS_DIR ${HPL_BLAS_DIR} ABSOLUTE)
+
+# Look for BLIS in the provided path
+find_library(BLAS_LIBRARIES NAMES blis
+             PATHS ${HPL_BLAS_DIR}
+             NO_DEFAULT_PATH)
+
+if (NOT BLAS_LIBRARIES)
+  # If we don't find BLIS, look for openblas
+  find_library(BLAS_LIBRARIES NAMES openblas
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+endif()
+if (NOT BLAS_LIBRARIES)
+  # If we don't find BLIS or openBLAS, look for MKL
+  find_library(BLAS_LIBRARIES NAMES mkl_core
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+  find_library(BLAS_SEQ_LIBRARIES NAMES mkl_sequential
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+  find_library(BLAS_LP64_LIBRARIES NAMES mkl_intel_lp64
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+endif()
+
+if (BLAS_LIBRARIES)
+  message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
+else()
+  # If we still haven't found a blas library, maybe cmake will?
+ find_package(BLAS REQUIRED) +endif() +add_library(BLAS::BLAS IMPORTED INTERFACE) +set_property(TARGET BLAS::BLAS PROPERTY INTERFACE_LINK_LIBRARIES "${BLAS_LP64_LIBRARIES};${BLAS_SEQ_LIBRARIES};${BLAS_LIBRARIES}") + +# Find OpenMP package +find_package(OpenMP) +if (NOT OPENMP_FOUND) + message("-- OpenMP not found. Compiling WITHOUT OpenMP support.") +else() + option(HPL_OPENMP "Compile WITH OpenMP support." ON) +endif() + +# MPI +set(MPI_HOME ${HPL_MPI_DIR}) +find_package(MPI REQUIRED) + +# Add some paths +list(APPEND CMAKE_PREFIX_PATH ${ROCBLAS_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}) + +find_library(ROCTRACER NAMES roctracer64 + PATHS ${ROCM_PATH}/lib + NO_DEFAULT_PATH) +find_library(ROCTX NAMES roctx64 + PATHS ${ROCM_PATH}/lib + NO_DEFAULT_PATH) + +message("-- roctracer: ${ROCTRACER}") +message("-- roctx: ${ROCTX}") + +add_library(roc::roctracer SHARED IMPORTED) +set_target_properties(roc::roctracer PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" + INTERFACE_LINK_LIBRARIES "hip::host" + IMPORTED_LOCATION "${ROCTRACER}" + IMPORTED_SONAME "libroctracer.so") +add_library(roc::roctx SHARED IMPORTED) +set_target_properties(roc::roctx PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" + INTERFACE_LINK_LIBRARIES "hip::host" + IMPORTED_LOCATION "${ROCTX}" + IMPORTED_SONAME "libroctx64.so") + +# Find HIP package +find_package(HIP REQUIRED) + +# rocblas +find_package(rocblas REQUIRED) + +get_target_property(rocblas_LIBRARIES roc::rocblas IMPORTED_LOCATION_RELEASE) + +message("-- rocBLAS version: ${rocblas_VERSION}") +message("-- rocBLAS include dirs: ${rocblas_INCLUDE_DIRS}") +message("-- rocBLAS libraries: ${rocblas_LIBRARIES}") + +get_filename_component(ROCBLAS_LIB_PATH ${rocblas_LIBRARIES} DIRECTORY) + +# ROCm cmake package +find_package(ROCM QUIET CONFIG PATHS ${CMAKE_PREFIX_PATH}) +if(NOT ROCM_FOUND) + set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern) + set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") + file(DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip + ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log) + + list(GET status 0 status_code) + list(GET status 1 status_string) + + if(NOT status_code EQUAL 0) + message(FATAL_ERROR "error: downloading + 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed + status_code: ${status_code} + status_string: ${status_string} + log: ${log} + ") + endif() + + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip + WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) + + find_package(ROCM REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}) +endif() + +include(ROCMSetupVersion) +include(ROCMCreatePackage) +include(ROCMInstallTargets) +include(ROCMPackageConfigHelpers) +include(ROCMInstallSymlinks) +include(ROCMCheckTargetIds OPTIONAL) diff --git a/include/hpl.hpp b/include/hpl.hpp new file mode 100644 index 0000000..4c99804 --- /dev/null +++ b/include/hpl.hpp @@ -0,0 +1,64 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+#ifndef HPL_HPP
+#define HPL_HPP
+/*
+ * ---------------------------------------------------------------------
+ * HPL default compile options that can be overridden in the cmake build
+ * ---------------------------------------------------------------------
+ */
+#ifndef HPL_DETAILED_TIMING /* Do not enable detailed timings */
+#define HPL_NO_DETAILED_TIMING
+#endif
+
+#undef HPL_USE_COLLECTIVES
+//#define HPL_USE_COLLECTIVES
+
+/*
+ * ---------------------------------------------------------------------
+ * Include files
+ * ---------------------------------------------------------------------
+ */
+#include
+
+// NC: hipcc in ROCm 3.7 complains if __HIP_PLATFORM_HCC__ is defined in the
+// compile line
+#ifdef __HIPCC__
+#ifdef __HIP_PLATFORM_HCC__
+#undef __HIP_PLATFORM_HCC__
+#endif
+#endif
+#include "hip/hip_runtime_api.h"
+
+#include "hpl_version.hpp"
+#include "hpl_misc.hpp"
+#include "hpl_blas.hpp"
+#include "hpl_auxil.hpp"
+
+#include "hpl_pmisc.hpp"
+#include "hpl_pauxil.hpp"
+#include "hpl_panel.hpp"
+#include "hpl_pfact.hpp"
+#include "hpl_pgesv.hpp"
+
+#include "hpl_ptimer.hpp"
+#include "hpl_pmatgen.hpp"
+#include "hpl_ptest.hpp"
+
+#endif
+/*
+ * End of hpl.hpp
+ */
diff --git a/include/hpl_auxil.hpp b/include/hpl_auxil.hpp
new file mode 100644
index 0000000..537a3d0
--- /dev/null
+++ b/include/hpl_auxil.hpp
@@ -0,0 +1,90 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_AUXIL_HPP +#define HPL_AUXIL_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +/* + * --------------------------------------------------------------------- + * typedef definitions + * --------------------------------------------------------------------- + */ +typedef enum { + HPL_NORM_A = 800, + HPL_NORM_1 = 801, + HPL_NORM_I = 802 +} HPL_T_NORM; + +typedef enum { + HPL_MACH_EPS = 900, /* relative machine precision */ + HPL_MACH_SFMIN = 901, /* safe minimum st 1/sfmin does not overflow */ + HPL_MACH_BASE = 902, /* base = base of the machine */ + HPL_MACH_PREC = 903, /* prec = eps*base */ + HPL_MACH_MLEN = 904, /* number of (base) digits in the mantissa */ + HPL_MACH_RND = 905, /* 1.0 if rounding occurs in addition */ + HPL_MACH_EMIN = 906, /* min exponent before (gradual) underflow */ + HPL_MACH_RMIN = 907, /* underflow threshold base**(emin-1) */ + HPL_MACH_EMAX = 908, /* largest exponent before overflow */ + HPL_MACH_RMAX = 909 /* overflow threshold - (base**emax)*(1-eps) */ + +} HPL_T_MACH; +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_fprintf(FILE*, const char*, ...); +void HPL_warn(FILE*, int, const char*, const char*, ...); +void HPL_abort(int, const char*, const char*, ...); + +void HPL_dlacpy(const int, + const int, + const double*, + const int, + double*, + const int); + +void HPL_dlatcpy(const int, + const int, + const double*, + const int, + double*, + const int); + +void HPL_dlatcpy_gpu(const int, + const int, + const double*, + const int, + double*, + const int); + +double HPL_dlange(const HPL_T_NORM, + const int, + const int, + const double*, + const int); + +double HPL_dlamch(const HPL_T_MACH); + +#endif +/* + * End of hpl_auxil.hpp + */ diff --git a/include/hpl_blas.hpp b/include/hpl_blas.hpp new file mode 100644 index 0000000..56dce6c --- /dev/null +++ b/include/hpl_blas.hpp @@ -0,0 +1,266 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_BLAS_HPP +#define HPL_BLAS_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ + +#include "hpl_misc.hpp" +#include +#include +#include + +extern rocblas_handle handle; +extern hipStream_t computeStream; +extern hipStream_t dataStream; + +#if __cplusplus +extern "C" { +#endif + +/* + * --------------------------------------------------------------------- + * typedef definitions + * --------------------------------------------------------------------- + */ +enum HPL_ORDER { HplRowMajor = 101, HplColumnMajor = 102 }; +enum HPL_TRANS { HplNoTrans = 111, HplTrans = 112, HplConjTrans = 113 }; +enum HPL_UPLO { HplUpper = 121, HplLower = 122 }; +enum HPL_DIAG { HplNonUnit = 131, HplUnit = 132 }; +enum HPL_SIDE { HplLeft = 141, HplRight = 142 }; + +/* + * --------------------------------------------------------------------- + * Blocked OpenMP routines + * --------------------------------------------------------------------- + */ + +void HPL_idamax_omp(const int N, + const double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size, + int* max_index, + double* max_value); + +void HPL_dscal_omp(const int N, + const double ALPHA, + double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_daxpy_omp(const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_dger_omp(const enum HPL_ORDER ORDER, + const int M, + const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + double* A, + const int LDA, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_dgemv_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANS, + const int M, + const int N, + const double ALPHA, + const double* A, + const int LDA, + const double* X, + const int INCX, + const double BETA, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_dgemm_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANSA, + const enum HPL_TRANS TRANSB, + const int M, + const int N, + const int K, + const double ALPHA, + const double* A, + const int LDA, + const double* B, + const int LDB, + const double BETA, + double* C, + const int LDC, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define CBLAS_INDEX int + +#define CBLAS_ORDER HPL_ORDER +#define CblasRowMajor HplRowMajor +#define CblasColMajor HplColMajor + +#define CBLAS_TRANSPOSE HPL_TRANS +#define CblasNoTrans HplNoTrans +#define CblasTrans HplTrans +#define CblasConjTrans HplConjTrans + +#define CBLAS_UPLO HPL_UPLO +#define CblasUpper HplUpper +#define CblasLower HplLower + +#define CBLAS_DIAG HPL_DIAG +#define CblasNonUnit HplNonUnit +#define CblasUnit HplUnit + +#define CBLAS_SIDE HPL_SIDE +#define CblasLeft HplLeft +#define CblasRight HplRight +/* + * 
--------------------------------------------------------------------- + * CBLAS Function prototypes + * --------------------------------------------------------------------- + */ +CBLAS_INDEX cblas_idamax(const int, const double*, const int); +void cblas_dswap(const int, double*, const int, double*, const int); +void cblas_dcopy(const int, const double*, const int, double*, const int); + +void cblas_daxpy(const int, + const double, + const double*, + const int, + double*, + const int); + +void cblas_dscal(const int, const double, double*, const int); + +void cblas_dgemv(const enum CBLAS_ORDER, + const enum CBLAS_TRANSPOSE, + const int, + const int, + const double, + const double*, + const int, + const double*, + const int, + const double, + double*, + const int); + +void cblas_dger(const enum CBLAS_ORDER, + const int, + const int, + const double, + const double*, + const int, + const double*, + const int, + double*, + const int); + +void cblas_dtrsv(const enum CBLAS_ORDER, + const enum CBLAS_UPLO, + const enum CBLAS_TRANSPOSE, + const enum CBLAS_DIAG, + const int, + const double*, + const int, + double*, + const int); + +void cblas_dgemm(const enum CBLAS_ORDER, + const enum CBLAS_TRANSPOSE, + const enum CBLAS_TRANSPOSE, + const int, + const int, + const int, + const double, + const double*, + const int, + const double*, + const int, + const double, + double*, + const int); + +void cblas_dtrsm(const enum CBLAS_ORDER, + const enum CBLAS_SIDE, + const enum CBLAS_UPLO, + const enum CBLAS_TRANSPOSE, + const enum CBLAS_DIAG, + const int, + const int, + const double, + const double*, + const int, + double*, + const int); +/* + * --------------------------------------------------------------------- + * HPL C BLAS macro definition + * --------------------------------------------------------------------- + */ +#define HPL_dswap cblas_dswap +#define HPL_dcopy cblas_dcopy +#define HPL_daxpy cblas_daxpy +#define HPL_dscal cblas_dscal +#define HPL_idamax cblas_idamax + +#define HPL_dgemv cblas_dgemv +#define HPL_dtrsv cblas_dtrsv +#define HPL_dger cblas_dger + +#define HPL_dgemm cblas_dgemm +#define HPL_dtrsm cblas_dtrsm + +#if __cplusplus +} +#endif + +#endif +/* + * hpl_blas.hpp + */ diff --git a/include/hpl_comm.hpp b/include/hpl_comm.hpp new file mode 100644 index 0000000..f69c44e --- /dev/null +++ b/include/hpl_comm.hpp @@ -0,0 +1,94 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_COMM_HPP +#define HPL_COMM_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" +#include "hpl_panel.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef enum { + HPL_1RING = 401, /* Unidirectional ring */ + HPL_1RING_M = 402, /* Unidirectional ring (modified) */ + HPL_2RING = 403, /* Bidirectional ring */ + HPL_2RING_M = 404, /* Bidirectional ring (modified) */ + HPL_BLONG = 405, /* long broadcast */ + HPL_BLONG_M = 406, /* long broadcast (modified) */ +} HPL_T_TOP; + +typedef MPI_Op HPL_T_OP; + +#define HPL_SUM MPI_SUM +#define HPL_MAX MPI_MAX +#define HPL_MIN MPI_MIN + +extern MPI_Op HPL_DMXSWP; +extern MPI_Datatype PDFACT_ROW; +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_FAILURE 0 +#define HPL_SUCCESS 1 +/* + * --------------------------------------------------------------------- + * comm function prototypes + * --------------------------------------------------------------------- + */ +int HPL_send(double*, int, int, int, MPI_Comm); +int HPL_recv(double*, int, int, int, MPI_Comm); +int HPL_sdrv(double*, int, int, double*, int, int, int, MPI_Comm); +int HPL_bcast(double*, int, int, MPI_Comm, HPL_T_TOP top); +int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_scatterv(double*, const int*, const int*, const int, int, MPI_Comm); +int HPL_allgatherv(double*, const int, const int*, const int*, MPI_Comm); +int HPL_barrier(MPI_Comm); +int HPL_broadcast(void*, const int, const HPL_T_TYPE, const int, MPI_Comm); + +int HPL_reduce(void*, + const int, + const HPL_T_TYPE, + const HPL_T_OP, + const int, + MPI_Comm); + +int HPL_all_reduce(void*, + const int, + const HPL_T_TYPE, + const HPL_T_OP, + MPI_Comm); + +void HPL_dmxswp(void*, void*, int*, MPI_Datatype*); +void HPL_all_reduce_dmxswp(double*, const int, const int, MPI_Comm, double*); + +#endif +/* + * End of hpl_comm.hpp + */ diff --git a/include/hpl_grid.hpp b/include/hpl_grid.hpp new file mode 100644 index 0000000..9952df7 --- /dev/null +++ b/include/hpl_grid.hpp @@ -0,0 +1,106 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_GRID_H +#define HPL_GRID_H +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef enum { HPL_INT = 100, HPL_DOUBLE = 101 } HPL_T_TYPE; + +typedef enum { HPL_ROW_MAJOR = 201, HPL_COLUMN_MAJOR = 202 } HPL_T_ORDER; + +typedef struct HPL_S_grid { + MPI_Comm all_comm; /* grid communicator */ + MPI_Comm row_comm; /* row communicator */ + MPI_Comm col_comm; /* column communicator */ + HPL_T_ORDER order; /* ordering of the procs in the grid */ + int iam; /* my rank in the grid */ + int myrow; /* my row number in the grid */ + int mycol; /* my column number in the grid */ + int nprow; /* the total # of rows in the grid */ + int npcol; /* the total # of columns in the grid */ + int local_myrow; /* my row number in the node-local grid */ + int local_mycol; /* my column number in the node-local grid */ + int local_nprow; /* the total # of rows in the node-local grid */ + int local_npcol; /* the total # of columns in the node-local grid */ + int nprocs; /* the total # of procs in the grid */ + int row_ip2; /* largest power of two <= nprow */ + int row_hdim; /* row_ip2 procs hypercube dimension */ + int row_ip2m1; /* largest power of two <= nprow-1 */ + int row_mask; /* row_ip2m1 procs hypercube mask */ + int col_ip2; /* largest power of two <= npcol */ + int col_hdim; /* col_ip2 procs hypercube dimension */ + int col_ip2m1; /* largest power of two <= npcol-1 */ + int col_mask; /* col_ip2m1 procs hypercube mask */ +} HPL_T_grid; + +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +#define HPL_2_MPI_TYPE(typ) ((typ == HPL_INT ? MPI_INT : MPI_DOUBLE)) +/* + * The following macros perform common modulo operations; All functions + * except MPosMod assume arguments are < d (i.e., arguments are themsel- + * ves within modulo range). + */ +/* increment with mod */ +#define MModInc(I, d) \ + if(++(I) == (d)) (I) = 0 +/* decrement with mod */ +#define MModDec(I, d) \ + if(--(I) == -1) (I) = (d)-1 +/* positive modulo */ +#define MPosMod(I, d) ((I) - ((I) / (d)) * (d)) +/* add two numbers */ +#define MModAdd(I1, I2, d) \ + (((I1) + (I2) < (d)) ? (I1) + (I2) : (I1) + (I2) - (d)) +/* add 1 to # */ +#define MModAdd1(I, d) (((I) != (d)-1) ? (I) + 1 : 0) +/* subtract two numbers */ +#define MModSub(I1, I2, d) (((I1) < (I2)) ? (d) + (I1) - (I2) : (I1) - (I2)) +/* sub 1 from # */ +#define MModSub1(I, d) (((I) != 0) ? 
(I)-1 : (d)-1) +/* + * --------------------------------------------------------------------- + * grid function prototypes + * --------------------------------------------------------------------- + */ +int HPL_grid_init(MPI_Comm, + const HPL_T_ORDER, + const int, + const int, + const int, + const int, + HPL_T_grid*); + +int HPL_grid_exit(HPL_T_grid*); +int HPL_grid_info(const HPL_T_grid*, int*, int*, int*, int*); + +#endif +/* + * End of hpl_grid.hpp + */ diff --git a/include/hpl_misc.hpp b/include/hpl_misc.hpp new file mode 100644 index 0000000..75bd326 --- /dev/null +++ b/include/hpl_misc.hpp @@ -0,0 +1,66 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_MISC_HPP +#define HPL_MISC_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_rone 1.0 +#define HPL_rtwo 2.0 +#define HPL_rzero 0.0 +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +#define Mabs(a_) (((a_) < 0) ? -(a_) : (a_)) +#define Mmin(a_, b_) (((a_) < (b_)) ? (a_) : (b_)) +#define Mmax(a_, b_) (((a_) > (b_)) ? (a_) : (b_)) + +#define Mfloor(a, b) (((a) > 0) ? (((a) / (b))) : (-(((-(a)) + (b)-1) / (b)))) +#define Mceil(a, b) (((a) + (b)-1) / (b)) +#define Miceil(a, b) (((a) > 0) ? ((((a) + (b)-1) / (b))) : (-((-(a)) / (b)))) + +#define Mupcase(C) (((C) > 96 && (C) < 123) ? (C)&0xDF : (C)) +#define Mlowcase(C) (((C) > 64 && (C) < 91) ? (C) | 32 : (C)) +/* + * Mptr returns a pointer to a_( i_, j_ ) for readability reasons and + * also less silly errors ... + */ +#define Mptr(a_, i_, j_, lda_) \ + ((a_) + (size_t)(i_) + (size_t)(j_) * (size_t)(lda_)) +/* + * Align pointer + */ +#define HPL_PTR(ptr_, al_) ((((size_t)(ptr_) + (al_)-1) / (al_)) * (al_)) +#endif +/* + * End of hpl_misc.hpp + */ diff --git a/include/hpl_panel.hpp b/include/hpl_panel.hpp new file mode 100644 index 0000000..a8e5f44 --- /dev/null +++ b/include/hpl_panel.hpp @@ -0,0 +1,146 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PANEL_HPP +#define HPL_PANEL_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" +#include "hpl_grid.hpp" + +/* + * --------------------------------------------------------------------- + * Data Structures + * --------------------------------------------------------------------- + */ +typedef struct HPL_S_panel { + struct HPL_S_grid* grid; /* ptr to the process grid */ + struct HPL_S_palg* algo; /* ptr to the algo parameters */ + struct HPL_S_pmat* pmat; /* ptr to the local array info */ + double* A; /* ptr to trailing part of A */ + double* dA; /* ptr to trailing part of A */ + double* LWORK; /* L work space */ + double* dLWORK; /* L device-copy work space */ + double* UWORK; /* U work space */ + double* dUWORK; /* U device-copy work space */ + double* fWORK; /* pdfact work space */ + double* L2; /* ptr to L */ + double* L1; /* ptr to jb x jb upper block of A */ + double* dL2; /* ptr to L */ + double* dL1; /* ptr to jb x jb upper block of A */ + double* DINFO; /* ptr to replicated scalar info */ + double* dDINFO; /* ptr to replicated scalar info */ + int* ipiv; + int* dipiv; + int* lindxA; + int* dlindxA; + int* lindxAU; + int* dlindxAU; + int* lindxU; + int* dlindxU; + int* permU; + int* dpermU; + double* U; /* ptr to U */ + double* dU; /* ptr to U */ + double* W; /* ptr to W */ + double* dW; /* ptr to W */ + double* U1; /* ptr to U1 */ + double* dU1; /* ptr to U1 */ + double* W1; /* ptr to W1 */ + double* dW1; /* ptr to W1 */ + double* U2; /* ptr to U2 */ + double* dU2; /* ptr to U2 */ + double* W2; /* ptr to W2 */ + double* dW2; /* ptr to W2 */ + int nu0; + int nu1; + int nu2; + int ldu0; + int ldu1; + int ldu2; + int* IWORK; /* integer workspace for swapping */ + void* buffers[2]; /* buffers for panel bcast */ + int counts[2]; /* counts for panel bcast */ + MPI_Datatype dtypes[2]; /* data types for panel bcast */ + MPI_Request request[1]; /* requests for panel bcast */ + MPI_Status status[1]; /* status for panel bcast */ + int nb; /* distribution blocking factor */ + int jb; /* panel width */ + int m; /* global # of rows of trailing part of A */ + int n; /* global # of cols of trailing part of A */ + int ia; /* global row index of trailing part of A */ + int ja; /* global col index of trailing part of A */ + int mp; /* local # of rows of trailing part of A */ + int nq; /* local # of cols of trailing part of A */ + int ii; /* local row index of trailing part of A */ + int jj; /* local col index of trailing part of A */ + int lda; /* local leading dim of array A */ + int dlda; /* local leading dim of array A */ + int prow; /* proc. row owning 1st row of trail. A */ + int pcol; /* proc. col owning 1st col of trail. 
A */ + int msgid; /* message id for panel bcast */ + int ldl2; /* local leading dim of array L2 */ + int dldl2; /* local leading dim of array L2 */ + int len; /* length of the buffer to broadcast */ + unsigned int max_pinned_work_size; /* largest size of pinned A space */ + unsigned int max_lwork_size; /* largest size of WORK space */ + unsigned int max_uwork_size; /* largest size of WORK space */ + unsigned int max_iwork_size; /* largest size of IWORK space */ + unsigned int max_fwork_size; /* largest size of fWORK space */ + unsigned int free_work_now; /* should we deallocate */ +} HPL_T_panel; + +/* + * --------------------------------------------------------------------- + * panel function prototypes + * --------------------------------------------------------------------- + */ +#include "hpl_pgesv.hpp" + +void HPL_pdpanel_new(HPL_T_grid*, + HPL_T_palg*, + const int, + const int, + const int, + HPL_T_pmat*, + const int, + const int, + const int, + HPL_T_panel**); + +void HPL_pdpanel_init(HPL_T_grid*, + HPL_T_palg*, + const int, + const int, + const int, + HPL_T_pmat*, + const int, + const int, + const int, + HPL_T_panel*); + +int HPL_pdpanel_disp(HPL_T_panel**); +int HPL_pdpanel_free(HPL_T_panel*); +void HPL_pdpanel_SendToHost(HPL_T_panel*); +void HPL_pdpanel_SendToDevice(HPL_T_panel*); +void HPL_pdpanel_Wait(HPL_T_panel* PANEL); +int HPL_pdpanel_bcast(HPL_T_panel*); +#endif +/* + * End of hpl_panel.hpp + */ diff --git a/include/hpl_pauxil.hpp b/include/hpl_pauxil.hpp new file mode 100644 index 0000000..7730000 --- /dev/null +++ b/include/hpl_pauxil.hpp @@ -0,0 +1,287 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PAUXIL_HPP +#define HPL_PAUXIL_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +#include "hpl_auxil.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_grid.hpp" + +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +/* + * Mindxg2p returns the process coodinate owning the entry globally in- + * dexed by ig_. + */ +#define Mindxg2p(ig_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((ig_) >= (inb_)) && ((src_) >= 0) && ((nprocs_) > 1)) { \ + proc_ = (src_) + 1 + ((ig_) - (inb_)) / (nb_); \ + proc_ -= (proc_ / (nprocs_)) * (nprocs_); \ + } else { \ + proc_ = (src_); \ + } \ + } + +#define Mindxg2l(il_, ig_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((ig_) < (inb_)) || ((src_) == -1) || ((nprocs_) == 1)) { \ + il_ = (ig_); \ + } else { \ + int i__, j__; \ + j__ = (i__ = ((ig_) - (inb_)) / (nb_)) / (nprocs_); \ + il_ = (nb_) * (j__ - i__) + \ + ((i__ + 1 - (j__ + 1) * (nprocs_)) ? 
(ig_) - (inb_) : (ig_)); \ + } \ + } + +#define Mindxg2lp(il_, proc_, ig_, inb_, nb_, src_, nprocs_) \ + { \ + if(((ig_) < (inb_)) || ((src_) == -1) || ((nprocs_) == 1)) { \ + il_ = (ig_); \ + proc_ = (src_); \ + } else { \ + int i__, j__; \ + j__ = (i__ = ((ig_) - (inb_)) / (nb_)) / (nprocs_); \ + il_ = (nb_) * (j__ - i__) + \ + ((i__ + 1 - (j__ + 1) * (nprocs_)) ? (ig_) - (inb_) : (ig_)); \ + proc_ = (src_) + 1 + i__; \ + proc_ -= (proc_ / (nprocs_)) * (nprocs_); \ + } \ + } +/* + * Mindxl2g computes the global index ig_ corresponding to the local + * index il_ in process proc_. + */ +#define Mindxl2g(ig_, il_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((src_) >= 0) && ((nprocs_) > 1)) { \ + if((proc_) == (src_)) { \ + if((il_) < (inb_)) \ + ig_ = (il_); \ + else \ + ig_ = \ + (il_) + (nb_) * ((nprocs_)-1) * (((il_) - (inb_)) / (nb_) + 1); \ + } else if((proc_) < (src_)) { \ + ig_ = (il_) + (inb_) + \ + (nb_) * (((nprocs_)-1) * ((il_) / (nb_)) + (proc_) - (src_)-1 + \ + (nprocs_)); \ + } else { \ + ig_ = (il_) + (inb_) + \ + (nb_) * (((nprocs_)-1) * ((il_) / (nb_)) + (proc_) - (src_)-1); \ + } \ + } else { \ + ig_ = (il_); \ + } \ + } +/* + * MnumrocI computes the # of local indexes np_ residing in the process + * of coordinate proc_ corresponding to the interval of global indexes + * i_:i_+n_-1 assuming that the global index 0 resides in the process + * src_, and that the indexes are distributed from src_ using the para- + * meters inb_, nb_ and nprocs_. + */ +#define MnumrocI(np_, n_, i_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((src_) >= 0) && ((nprocs_) > 1)) { \ + int inb__, mydist__, n__, nblk__, quot__, src__; \ + if((inb__ = (inb_) - (i_)) <= 0) { \ + nblk__ = (-inb__) / (nb_) + 1; \ + src__ = (src_) + nblk__; \ + src__ -= (src__ / (nprocs_)) * (nprocs_); \ + inb__ += nblk__ * (nb_); \ + if((n__ = (n_)-inb__) <= 0) { \ + if((proc_) == src__) \ + np_ = (n_); \ + else \ + np_ = 0; \ + } else { \ + if((mydist__ = (proc_)-src__) < 0) mydist__ += (nprocs_); \ + nblk__ = n__ / (nb_) + 1; \ + mydist__ -= nblk__ - (quot__ = (nblk__ / (nprocs_))) * (nprocs_); \ + if(mydist__ < 0) { \ + if((proc_) != src__) \ + np_ = (nb_) + (nb_)*quot__; \ + else \ + np_ = inb__ + (nb_)*quot__; \ + } else if(mydist__ > 0) { \ + np_ = (nb_)*quot__; \ + } else { \ + if((proc_) != src__) \ + np_ = n__ + (nb_) + (nb_) * (quot__ - nblk__); \ + else \ + np_ = (n_) + (nb_) * (quot__ - nblk__); \ + } \ + } \ + } else { \ + if((n__ = (n_)-inb__) <= 0) { \ + if((proc_) == (src_)) \ + np_ = (n_); \ + else \ + np_ = 0; \ + } else { \ + if((mydist__ = (proc_) - (src_)) < 0) mydist__ += (nprocs_); \ + nblk__ = n__ / (nb_) + 1; \ + mydist__ -= nblk__ - (quot__ = (nblk__ / (nprocs_))) * (nprocs_); \ + if(mydist__ < 0) { \ + if((proc_) != (src_)) \ + np_ = (nb_) + (nb_)*quot__; \ + else \ + np_ = inb__ + (nb_)*quot__; \ + } else if(mydist__ > 0) { \ + np_ = (nb_)*quot__; \ + } else { \ + if((proc_) != (src_)) \ + np_ = n__ + (nb_) + (nb_) * (quot__ - nblk__); \ + else \ + np_ = (n_) + (nb_) * (quot__ - nblk__); \ + } \ + } \ + } \ + } else { \ + np_ = (n_); \ + } \ + } + +#define Mnumroc(np_, n_, inb_, nb_, proc_, src_, nprocs_) \ + MnumrocI(np_, n_, 0, inb_, nb_, proc_, src_, nprocs_) +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_indxg2lp(int*, + int*, + const int, + const int, + const int, + const int, + const int); + +int HPL_indxg2l(const int, const int, const int, 
const int, const int); +int HPL_indxg2p(const int, const int, const int, const int, const int); + +int HPL_indxl2g(const int, + const int, + const int, + const int, + const int, + const int); + +void HPL_infog2l(int, + int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + int*, + int*, + int*, + int*); + +int HPL_numroc(const int, + const int, + const int, + const int, + const int, + const int); + +int HPL_numrocI(const int, + const int, + const int, + const int, + const int, + const int, + const int); + +void HPL_dlaswp00N(const int, const int, double*, const int, const int*); + +void HPL_dlaswp01T(const int, + const int, + double*, + const int, + double*, + const int, + const int*); + +void HPL_dlaswp02T(const int, + const int, + double*, + const int, + const int*, + const int*); + +void HPL_dlaswp03T(const int, + const int, + double*, + const int, + double*, + const int, + const int*); + +void HPL_dlaswp04T(const int, + const int, + double*, + const int, + double*, + const int, + const int*); + +void HPL_dlaswp10N(const int, const int, double*, const int, const int*); + +void HPL_pabort(int, const char*, const char*, ...); +void HPL_pwarn(FILE*, int, const char*, const char*, ...); + +void HPL_pdlaprnt(const HPL_T_grid*, + const int, + const int, + const int, + double*, + const int, + const int, + const int, + const char*); + +double HPL_pdlamch(MPI_Comm, const HPL_T_MACH); + +double HPL_pdlange(const HPL_T_grid*, + const HPL_T_NORM, + const int, + const int, + const int, + const double*, + const int); + +#endif +/* + * End of hpl_pauxil.hpp + */ diff --git a/include/hpl_pfact.hpp b/include/hpl_pfact.hpp new file mode 100644 index 0000000..edbd076 --- /dev/null +++ b/include/hpl_pfact.hpp @@ -0,0 +1,199 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PFACT_HPP +#define HPL_PFACT_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" + +#include "hpl_pgesv.hpp" +#include "hpl_pmisc.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_panel.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef void (*HPL_T_PFA_FUN)(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +typedef void (*HPL_T_RFA_FUN)(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_dlocmax(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + int*, + double*); + +void HPL_dlocswpN(HPL_T_panel*, const int, const int, double*); +void HPL_dlocswpT(HPL_T_panel*, const int, const int, double*); +void HPL_pdmxswp(HPL_T_panel*, const int, const int, const int, double*); + +void HPL_pdpancrN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpancrT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanllN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanllT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanrlN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanrlT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpancrN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpancrT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanllN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanllT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanrlN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanrlT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdfact(HPL_T_panel*); + +#endif +/* + * End of hpl_pfact.hpp + */ diff --git a/include/hpl_pgesv.hpp b/include/hpl_pgesv.hpp new file mode 100644 index 0000000..1acf860 --- /dev/null +++ b/include/hpl_pgesv.hpp @@ -0,0 +1,150 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. 
+ * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PGESV_HPP +#define HPL_PGESV_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +#include "hpl_auxil.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_grid.hpp" +#include "hpl_comm.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_panel.hpp" +#include "hpl_pfact.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef enum { + HPL_LEFT_LOOKING = 301, /* Left looking lu fact variant */ + HPL_CROUT = 302, /* Crout lu fact variant */ + HPL_RIGHT_LOOKING = 303 /* Right looking lu fact variant */ +} HPL_T_FACT; + +typedef enum { + HPL_SWAP00 = 451, /* Use HPL_pdlaswp00 */ + HPL_SWAP01 = 452, /* Use HPL_pdlaswp01 */ + HPL_SW_MIX = 453, /* Use HPL_pdlaswp00_ for small number of */ + /* columns, and HPL_pdlaswp01_ otherwise. */ + HPL_NO_SWP = 499 +} HPL_T_SWAP; + +typedef enum { + HPL_LOOK_AHEAD = 0, /* look-ahead update */ + HPL_UPD_1 = 1, /* first update */ + HPL_UPD_2 = 2, /* second update */ + + HPL_N_UPD = 3 +} HPL_T_UPD; + +typedef void (*HPL_T_UPD_FUN)(HPL_T_panel*, const HPL_T_UPD); + +typedef struct HPL_S_palg { + HPL_T_TOP btopo; /* row broadcast topology */ + int depth; /* look-ahead depth */ + int nbdiv; /* recursive division factor */ + int nbmin; /* recursion stopping criterium */ + HPL_T_FACT pfact; /* panel fact variant */ + HPL_T_FACT rfact; /* recursive fact variant */ + HPL_T_PFA_FUN pffun; /* panel fact function ptr */ + HPL_T_RFA_FUN rffun; /* recursive fact function ptr */ + HPL_T_UPD_FUN upfun; /* update function */ + HPL_T_SWAP fswap; /* Swapping algorithm */ + int fsthr; /* Swapping threshold */ + int equil; /* Equilibration */ + int align; /* data alignment constant */ + double frac; /* update split percentage */ +} HPL_T_palg; + +typedef struct HPL_S_pmat { + double* dA; /* pointer to local piece of A */ + double* dX; /* pointer to solution vector */ + int n; /* global problem size */ + int nb; /* blocking factor */ + int ld; /* local leading dimension */ + int mp; /* local number of rows */ + int nq; /* local number of columns */ + int info; /* computational flag */ + double* A; + double* W; + double* dW; +} HPL_T_pmat; + +extern hipEvent_t swapStartEvent[HPL_N_UPD], update[HPL_N_UPD]; +extern hipEvent_t swapUCopyEvent[HPL_N_UPD], swapWCopyEvent[HPL_N_UPD]; +extern hipEvent_t dgemmStart[HPL_N_UPD], dgemmStop[HPL_N_UPD]; + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define MSGID_BEGIN_PFACT 1001 /* message id ranges */ +#define MSGID_END_PFACT 2000 +#define MSGID_BEGIN_FACT 2001 +#define MSGID_END_FACT 3000 +#define MSGID_BEGIN_PTRSV 3001 +#define MSGID_END_PTRSV 4000 + +#define MSGID_BEGIN_COLL 9001 +#define MSGID_END_COLL 10000 +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +#define MNxtMgid(id_, beg_, end_) (((id_) + 1 > (end_) ? 
(beg_) : (id_) + 1)) +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ + +void HPL_pipid(HPL_T_panel*, int*, int*); +void HPL_piplen(HPL_T_panel*, const int, const int*, int*, int*); +void HPL_perm(const int, int*, int*, int*); + +void HPL_plindx(HPL_T_panel*, + const int, + const int*, + int*, + int*, + int*, + int*, + int*, + int*, + int*); + +void HPL_pdlaswp_start(HPL_T_panel* PANEL, const HPL_T_UPD UPD); +void HPL_pdlaswp_exchange(HPL_T_panel* PANEL, const HPL_T_UPD UPD); +void HPL_pdlaswp_end(HPL_T_panel* PANEL, const HPL_T_UPD UPD); +void HPL_pdupdateNT(HPL_T_panel*, const HPL_T_UPD); +void HPL_pdupdateTT(HPL_T_panel*, const HPL_T_UPD); +void HPL_pdgesv(HPL_T_grid*, HPL_T_palg*, HPL_T_pmat*); +void HPL_pdtrsv(HPL_T_grid*, HPL_T_pmat*); + +#endif +/* + * End of hpl_pgesv.hpp + */ diff --git a/include/hpl_pmatgen.hpp b/include/hpl_pmatgen.hpp new file mode 100644 index 0000000..603217f --- /dev/null +++ b/include/hpl_pmatgen.hpp @@ -0,0 +1,73 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PMATGEN_HPP +#define HPL_PMATGEN_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_pgesv.hpp" +#include "hpl_ptest.hpp" + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_MULT 6364136223846793005UL +#define HPL_IADD 1UL +#define HPL_DIVFAC 2147483648.0 +#define HPL_POW16 65536.0 +#define HPL_HALF 0.5 +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_xjumpm(const int JUMPM, + const uint64_t MULT, + const uint64_t IADD, + const uint64_t IRANN, + uint64_t& IRANM, + uint64_t& IAM, + uint64_t& ICM); + +void HPL_pdrandmat(const HPL_T_grid*, + const int, + const int, + const int, + double*, + const int, + const int); + +int HPL_pdmatgen(HPL_T_test*, + HPL_T_grid*, + HPL_T_palg*, + HPL_T_pmat*, + const int, + const int); + +void HPL_pdmatfree(HPL_T_pmat*); + +#endif +/* + * End of hpl_pmatgen.hpp + */ diff --git a/include/hpl_pmisc.hpp b/include/hpl_pmisc.hpp new file mode 100644 index 0000000..883d9c0 --- /dev/null +++ b/include/hpl_pmisc.hpp @@ -0,0 +1,29 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. 
+ * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PMISC_HPP +#define HPL_PMISC_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "mpi.h" + +#endif +/* + * End of hpl_pmisc.hpp + */ diff --git a/include/hpl_ptest.hpp b/include/hpl_ptest.hpp new file mode 100644 index 0000000..f3e93f9 --- /dev/null +++ b/include/hpl_ptest.hpp @@ -0,0 +1,118 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PTEST_HPP +#define HPL_PTEST_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +#include "hpl_auxil.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_panel.hpp" +#include "hpl_pgesv.hpp" + +#include "hpl_ptimer.hpp" +#include "hpl_pmatgen.hpp" + +/* + * --------------------------------------------------------------------- + * Data Structures + * --------------------------------------------------------------------- + */ +typedef struct HPL_S_test { + double epsil; /* epsilon machine */ + double thrsh; /* threshold */ + FILE* outfp; /* output stream (only in proc 0) */ + int kfail; /* # of tests failed */ + int kpass; /* # of tests passed */ + int kskip; /* # of tests skipped */ + int ktest; /* total number of tests */ +} HPL_T_test; + +/* + * --------------------------------------------------------------------- + * #define macro constants for testing only + * --------------------------------------------------------------------- + */ +#define HPL_LINE_MAX 256 +#define HPL_MAX_PARAM 20 +#define HPL_ISEED 100 +/* + * --------------------------------------------------------------------- + * global timers for timing analysis only + * --------------------------------------------------------------------- + */ +#define HPL_TIMING_BEG 11 /* timer 0 reserved, used by main */ +#define HPL_TIMING_N 8 /* number of timers defined below */ +#define HPL_TIMING_RPFACT 11 /* starting from here, contiguous */ +#define HPL_TIMING_PFACT 12 +#define HPL_TIMING_MXSWP 13 +#define HPL_TIMING_COPY 14 +#define HPL_TIMING_LBCAST 15 +#define HPL_TIMING_LASWP 16 +#define HPL_TIMING_UPDATE 17 +#define HPL_TIMING_PTRSV 18 +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_pdinfo(int ARGC, + char** ARGV, + HPL_T_test*, + int*, + int*, + int*, + int*, + HPL_T_ORDER*, + int*, + int*, + int*, + int*, + int*, + int*, + HPL_T_FACT*, + int*, + int*, + int*, + int*, + int*, + HPL_T_FACT*, + int*, + HPL_T_TOP*, + int*, + int*, + HPL_T_SWAP*, + int*, + int*, + int*, + int*, + int*, + double*); + 
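+/*
+ * A minimal usage sketch of the timing indices above (illustrative, not
+ * part of the API): HPL_ptimer(), declared in hpl_ptimer.hpp, toggles
+ * the HPL_NPTIMER slot named by its argument, starting the timer on the
+ * first call with a given index and stopping/accumulating it on the
+ * next. For example, assuming a panel pointer `panel` is in scope:
+ *
+ *   HPL_ptimer(HPL_TIMING_PFACT);   start slot 12
+ *   HPL_pdfact(panel);
+ *   HPL_ptimer(HPL_TIMING_PFACT);   stop and accumulate slot 12
+ *
+ * The accumulated wall time can then be read back with
+ * HPL_ptimer_inquire(HPL_WALL_PTIME, HPL_TIMING_PFACT).
+ */
+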
+void HPL_pdtest(HPL_T_test*, HPL_T_grid*, HPL_T_palg*, const int, const int); +void HPL_InitGPU(const HPL_T_grid* GRID); +void HPL_FreeGPU(); + +#endif +/* + * End of hpl_ptest.hpp + */ diff --git a/include/hpl_ptimer.hpp b/include/hpl_ptimer.hpp new file mode 100644 index 0000000..b6ebf98 --- /dev/null +++ b/include/hpl_ptimer.hpp @@ -0,0 +1,71 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PTIMER_HPP +#define HPL_PTIMER_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_NPTIMER 64 +#define HPL_PTIMER_STARTFLAG 5.0 +#define HPL_PTIMER_ERROR -1.0 +/* + * --------------------------------------------------------------------- + * type definitions + * --------------------------------------------------------------------- + */ +typedef enum { HPL_WALL_PTIME = 101, HPL_CPU_PTIME = 102 } HPL_T_PTIME; + +typedef enum { + HPL_AMAX_PTIME = 201, + HPL_AMIN_PTIME = 202, + HPL_SUM_PTIME = 203 +} HPL_T_PTIME_OP; +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +double HPL_ptimer_cputime(void); +double HPL_ptimer_walltime(void); +void HPL_ptimer(const int); +void HPL_ptimer_boot(void); + +void HPL_ptimer_combine(MPI_Comm comm, + const HPL_T_PTIME_OP, + const HPL_T_PTIME, + const int, + const int, + double*); + +void HPL_ptimer_disable(void); +void HPL_ptimer_enable(void); +double HPL_ptimer_inquire(const HPL_T_PTIME, const int); +void HPL_ptimer_stepReset(const int, const int); +double HPL_ptimer_getStep(const int); + +#endif +/* + * End of hpl_ptimer.hpp + */ diff --git a/include/hpl_version.hpp.in b/include/hpl_version.hpp.in new file mode 100644 index 0000000..08cbb5f --- /dev/null +++ b/include/hpl_version.hpp.in @@ -0,0 +1,24 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#ifndef HPL_VERSION_HPP +#define HPL_VERSION_HPP + +// clang-format off +#define __ROCHPL_VER_MAJOR @rochpl_VERSION_MAJOR@ +#define __ROCHPL_VER_MINOR @rochpl_VERSION_MINOR@ +#define __ROCHPL_VER_PATCH @rochpl_VERSION_PATCH@ +#define __ROCHPL_VER_TWEAK @rochpl_VERSION_TWEAK@ +// clang-format on + +#define __ROCHPL_VER \ + 10000 * __ROCHPL_VER_MAJOR + 100 * __ROCHPL_VER_MINOR + __ROCHPL_VER_PATCH + +#endif // VERSION_HPP diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..4409f22 --- /dev/null +++ b/install.sh @@ -0,0 +1,390 @@ +#!/usr/bin/env bash +# Author: Nico Trost +# Modified by: Noel Chalmers + +#set -x #echo on + +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "rocHPL build helper script" + echo "./install " + echo " [-h|--help] prints this help message" + echo " [-g|--debug] Set build type to Debug (otherwise build Release)" + echo " [--prefix] Path to rocHPL install location (Default: build/rocHPL)" + echo " [--with-rocm=
<dir>
] Path to ROCm install (Default: /opt/rocm)" + echo " [--with-rocblas=] Path to rocBLAS library (Default: /opt/rocm/rocblas)" + echo " [--with-cpublas=] Path to external CPU BLAS library (Default: clone+build BLIS)" + echo " [--with-mpi=] Path to external MPI install (Default: clone+build OpenMPI)" + echo " [--verbose-print] Verbose output during HPL setup (Default: true)" + echo " [--progress-report] Print progress report to terminal during HPL run (Default: true)" + echo " [--detailed-timing] Record detailed timers during HPL run (Default: true)" +} + +# prereq: ${ID} must be defined before calling +supported_distro( ) +{ + if [ -z ${ID+foo} ]; then + printf "supported_distro(): \$ID must be set\n" + exit 2 + fi + + case "${ID}" in + ubuntu|centos|rhel|fedora|sles) + true + ;; + *) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n" + exit 2 + ;; + esac +} + +exit_with_error( ) +{ + if (( $1 == 2 )); then + # Failure in some install step + # Print some message about needed dependencies + + # dependencies needed for executable to build + local library_dependencies_ubuntu=( "git" "make" "cmake" "libnuma-dev" "pkg-config" "autoconf" "libtool" "automake" "m4" "flex" "libgomp1") + local library_dependencies_centos=( "git" "make" "cmake3" "gcc-c++" "rpm-build" "epel-release" "numactl-libs" "autoconf" "libtool" "automake" "m4" "flex" "libgomp") + local library_dependencies_fedora=( "git" "make" "cmake" "gcc-c++" "libcxx-devel" "rpm-build" "numactl-libs" "autoconf" "libtool" "automake" "m4" "flex" "libgomp") + local library_dependencies_sles=( "git" "make" "cmake" "gcc-c++" "libcxxtools9" "rpm-build" "libnuma-devel" "autoconf" "libtool" "automake" "m4" "flex" "libgomp1") + + if [[ "${with_rocm}" == /opt/rocm ]]; then + library_dependencies_ubuntu+=("rocblas" "rocblas-dev") + library_dependencies_centos+=("rocblas" "rocblas-devel") + library_dependencies_fedora+=("rocblas" "rocblas-dev") + library_dependencies_sles+=("rocblas" "rocblas-devel") + fi + + printf "Installation failed. Some required packages may be missing.\n" + printf "The following package manager install command may be needed:\n" + case "${ID}" in + ubuntu) + printf "sudo apt install -y ${library_dependencies_ubuntu[*]}\n" + ;; + + centos|rhel) + printf "sudo yum -y --nogpgcheck install ${library_dependencies_centos[*]}\n" + ;; + + fedora) + printf "sudo dnf install -y ${library_dependencies_fedora[*]}\n" + ;; + + sles) + printf "sudo zypper -n --no-gpg-checks install ${library_dependencies_sles[*]}\n" + ;; + *) + exit 2 + ;; + esac + fi + + exit $1 +} + +check_exit_code( ) +{ + if (( $? != 0 )); then + exit $@ + fi +} + + +# Install BLIS in rochpl/tpl +install_blis( ) +{ + if [ ! -d "./tpl/blis" ]; then + mkdir -p tpl && cd tpl + git clone https://github.com/amd/blis --branch 3.1 + check_exit_code 2 + cd blis; ./configure --prefix=${PWD} --enable-cblas --disable-sup-handling auto; + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install -j$(nproc) + check_exit_code 2 + cd ../.. + elif [ ! -f "./tpl/blis/lib/libblis.so" ]; then + cd tpl/blis; ./configure --prefix=${PWD} --enable-cblas --disable-sup-handling auto; + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install -j$(nproc) + check_exit_code 2 + cd ../.. + fi + + # Check for successful build + if [ ! -f "./tpl/blis/lib/libblis.so" ]; then + echo "Error: BLIS install unsuccessful." 
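+    # exit code 2 makes exit_with_error print the distro-specific
+    # dependency hints assembled above before exiting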
+ exit_with_error 2 + fi +} + +# Clone and build OpenMPI+UCX in rochpl/tpl +install_openmpi( ) +{ + #OpenMPI and UCX install to one of these locations depending on OS + ucx_lib_folder=./tpl/ucx/lib + ompi_lib_folder=./tpl/openmpi/lib + ucx_lib64_folder=./tpl/ucx/lib64 + ompi_lib64_folder=./tpl/openmpi/lib64 + + if [ ! -d "./tpl/ucx" ]; then + mkdir -p tpl && cd tpl + git clone --branch master https://github.com/openucx/ucx.git ucx + check_exit_code 2 + cd ucx; + git checkout b38c71e94ccbbafbaa308f04ad2539425f345483 + ./autogen.sh; ./autogen.sh #why do we have to run this twice? + check_exit_code 2 + mkdir build; cd build + ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + elif ([ ! -f "${ucx_lib_folder}/libucm.so" ] || [ ! -f "${ucx_lib_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib_folder}/libucs.so" ] || [ ! -f "${ucx_lib_folder}/libuct.so" ]) && \ + ([ ! -f "${ucx_lib64_folder}/libucm.so" ] || [ ! -f "${ucx_lib64_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib64_folder}/libucs.so" ] || [ ! -f "${ucx_lib64_folder}/libuct.so" ]); then + cd tpl/ucx; + git checkout b38c71e94ccbbafbaa308f04ad2539425f345483 + ./autogen.sh; ./autogen.sh + check_exit_code 2 + mkdir build; cd build + ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + fi + + # Check for successful build + if ([ ! -f "${ucx_lib_folder}/libucm.so" ] || [ ! -f "${ucx_lib_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib_folder}/libucs.so" ] || [ ! -f "${ucx_lib_folder}/libuct.so" ]) && + ([ ! -f "${ucx_lib64_folder}/libucm.so" ] || [ ! -f "${ucx_lib64_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib64_folder}/libucs.so" ] || [ ! -f "${ucx_lib64_folder}/libuct.so" ]); then + echo "Error: UCX install unsuccessful." + exit 3 + fi + + if [ ! -d "./tpl/openmpi" ]; then + mkdir -p tpl && cd tpl + git clone --branch v4.1.4 https://github.com/open-mpi/ompi.git openmpi + check_exit_code 2 + cd openmpi; ./autogen.pl; + check_exit_code 2 + mkdir build; cd build + ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + elif [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then + cd tpl/openmpi; ./autogen.pl; + check_exit_code 2 + mkdir build; cd build + ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + fi + + # Check for successful build + if [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then + echo "Error: OpenMPI install unsuccessful." + exit_with_error 2 + fi +} + +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? 
-ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit_with_error 1 +fi + +# os-release file describes the system +if [[ -e "/etc/os-release" ]]; then + source /etc/os-release +else + echo "This script depends on the /etc/os-release file" + exit_with_error 1 +fi + +# The following function exits script if an unsupported distro is detected +supported_distro + +# ################################################# +# global variables +# ################################################# +install_prefix=rocHPL +build_release=true +with_rocm=/opt/rocm +with_mpi=tpl/openmpi +with_rocblas=/opt/rocm/rocblas +with_cpublas=tpl/blis/lib +openmpi_ucx=false +verbose_print=true +progress_report=true +detailed_timing=true + +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-rocblas:,with-cpublas:,verbose-print:,progress-report:,detailed-timing: --options hg -- "$@") +else + echo "Need a new version of getopt" + exit_with_error 1 +fi + +if [[ $? -ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit_with_error 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + -g|--debug) + build_release=false + shift ;; + --prefix) + install_prefix=${2} + shift 2 ;; + --with-rocm) + with_rocm=${2} + shift 2 ;; + --with-mpi) + with_mpi=${2} + shift 2 ;; + --with-rocblas) + with_rocblas=${2} + shift 2 ;; + --with-cpublas) + with_cpublas=${2} + shift 2 ;; + --verbose-print) + verbose_print=${2} + shift 2 ;; + --progress-report) + progress_report=${2} + shift 2 ;; + --detailed-timing) + detailed_timing=${2} + shift 2 ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit_with_error 1 + ;; + esac +done + +build_dir=./build +printf "\033[32mCreating project build directory in: \033[33m${build_dir}\033[0m\n" + +# ################################################# +# prep +# ################################################# +# ensure a clean build environment +rm -rf ${build_dir} + +# Default cmake executable is called cmake +cmake_executable=cmake + +# We append customary rocm path; if user provides custom rocm path in ${path}, our +# hard-coded path has lesser priority +export ROCM_PATH=${with_rocm} +export PATH=${PATH}:${ROCM_PATH}/bin + +pushd . 
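+  # Example invocations (illustrative; the paths below are placeholders):
+  #   ./install.sh                              # release build; BLIS and OpenMPI cloned+built in ./tpl
+  #   ./install.sh --with-rocm=/opt/rocm-5.2.0  # build against an existing ROCm install
+  #   ./install.sh -g --prefix=$HOME/rocHPL     # debug build with a custom install prefix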
+ # ################################################# + # BLAS + # ################################################# + if [[ "${with_cpublas}" == tpl/blis/lib ]]; then + + install_blis + + fi + + # ################################################# + # MPI + # ################################################# + if [[ "${with_mpi}" == tpl/openmpi ]]; then + + #gpu_aware_mpi=ON #turn on GPU-aware MPI when using internal MPI library + with_mpi=${PWD}/tpl/openmpi + openmpi_ucx=true + install_openmpi + + fi + + # ################################################# + # configure & build + # ################################################# + cmake_common_options="-DCMAKE_INSTALL_PREFIX=${install_prefix} -DHPL_BLAS_DIR=${with_cpublas} + -DHPL_MPI_DIR=${with_mpi} -DROCM_PATH=${with_rocm} -DROCBLAS_PATH=${with_rocblas}" + + # build type + if [[ "${build_release}" == true ]]; then + cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Release" + else + cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Debug" + fi + + shopt -s nocasematch + if [[ "${verbose_print}" == on || "${verbose_print}" == true || "${verbose_print}" == 1 || "${verbose_print}" == enabled ]]; then + cmake_common_options="${cmake_common_options} -DHPL_VERBOSE_PRINT=ON" + fi + if [[ "${progress_report}" == on || "${progress_report}" == true || "${progress_report}" == 1 || "${progress_report}" == enabled ]]; then + cmake_common_options="${cmake_common_options} -DHPL_PROGRESS_REPORT=ON" + fi + if [[ "${detailed_timing}" == on || "${detailed_timing}" == true || "${detailed_timing}" == 1 || "${detailed_timing}" == enabled ]]; then + cmake_common_options="${cmake_common_options} -DHPL_DETAILED_TIMING=ON" + fi + shopt -u nocasematch + + if [[ "${openmpi_ucx}" == true ]]; then + cmake_common_options="${cmake_common_options} -DHPL_OPENMPI_UCX=ON" + fi + + # Build library with AMD toolchain because of existence of device kernels + mkdir -p ${build_dir} && cd ${build_dir} + ${cmake_executable} ${cmake_common_options} .. + check_exit_code 2 + + make -j$(nproc) install + check_exit_code 2 + +popd diff --git a/scripts/HPL.dat b/scripts/HPL.dat new file mode 100644 index 0000000..3b7fec0 --- /dev/null +++ b/scripts/HPL.dat @@ -0,0 +1,31 @@ +HPLinpack benchmark input file +Innovative Computing Laboratory, University of Tennessee +HPL.out output file name (if any) +0 device out (6=stdout,7=stderr,file) +1 # of problems sizes (N) +45312 Ns +1 # of NBs +384 NBs +1 PMAP process mapping (0=Row-,1=Column-major) +1 # of process grids (P x Q) +1 Ps +1 Qs +16.0 threshold +1 # of panel fact +2 PFACTs (0=left, 1=Crout, 2=Right) +1 # of recursive stopping criterium +2 NBMINs (>= 1) +1 # of panels in recursion +2 NDIVs +1 # of recursive panel fact. 
+2 RFACTs (0=left, 1=Crout, 2=Right) +1 # of broadcast +6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) +1 # of lookahead depth +1 DEPTHs (>=0) +1 SWAP (0=bin-exch,1=long,2=mix) +64 swapping threshold +1 L1 in (0=transposed,1=no-transposed) form +0 U in (0=transposed,1=no-transposed) form +0 Equilibration (0=no,1=yes) +8 memory alignment in double (> 0) diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in new file mode 100755 index 0000000..8138238 --- /dev/null +++ b/scripts/mpirun_rochpl.in @@ -0,0 +1,198 @@ +#!/usr/bin/env bash +# Author: Noel Chalmers + +# set -x #echo on + +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "rocHPL MPI run helper script" + echo "./mpirun_rochpl " + echo " [-P] Specific MPI grid size: the number of " + echo " rows in MPI grid. " + echo " [-Q] Specific MPI grid size: the number of " + echo " columns in MPI grid. " + echo " [-p] Specific node-local MPI grid size: the number " + echo " of rows in node-local MPI grid. Must evenly " + echo " divide P. " + echo " [-q] Specific node-local MPI grid size: the number " + echo " of columns in node-local MPI grid. Must evenly" + echo " divide Q. " + echo " [-N] Specific matrix size: the number of " + echo " rows/columns in global matrix. " + echo " [--NB] Specific panel size: the number of " + echo " rows/columns in panels. " + echo " [-f] Specific split fraction: the percentange to " + echo " split the trailing submatrix. " + echo " [-i] Input file. When set, all other commnand " + echo " line parameters are ignored, and problem " + echo " parameters are read from input file. " + echo " [-h|--help] prints this help message " + echo " [--version] Print rocHPL version number. " +} + +# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root +# true is a system command that completes successfully, function returns success +# prereq: ${ID} must be defined before calling +supported_distro( ) +{ + if [ -z ${ID+foo} ]; then + printf "supported_distro(): \$ID must be set\n" + exit 2 + fi + + case "${ID}" in + ubuntu|centos|rhel|fedora|sles) + true + ;; + *) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n" + exit 2 + ;; + esac +} + +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? 
-ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit 1 +fi + +# os-release file describes the system +if [[ -e "/etc/os-release" ]]; then + source /etc/os-release +else + echo "This script depends on the /etc/os-release file" + exit 2 +fi + +# The following function exits script if an unsupported distro is detected +supported_distro + +# ################################################# +# global variables +# ################################################# +# Grab options from CMake config +rochpl_bin=@CMAKE_INSTALL_PREFIX@/bin/rochpl +mpi_bin=@MPIEXEC_EXECUTABLE@ +rochpl_runscript=$(dirname "$0")/run_rochpl #assume run_rochpl is in the same location +openmpi_ucx=@HPL_OPENMPI_UCX@ + +P=1 +Q=1 +p=-1 +q=-1 +N=45312 +NB=384 +frac=0.6 + +filename=HPL.dat +inputfile=false +cmdrun=false + +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,help,version, --options hP:Q:p:q:N:i:f: -- "$@") +else + echo "Need a new version of getopt" + exit 1 +fi + +if [[ $? -ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + --version) + ${mpi_bin} -np 1 ${rochpl_bin} --version + exit 0 + ;; + -P) + P=${2} + shift 2 ;; + -Q) + Q=${2} + shift 2 ;; + -p) + p=${2} + shift 2 ;; + -q) + q=${2} + shift 2 ;; + -N) + N=${2} + cmdrun=true + shift 2 ;; + --NB) + NB=${2} + cmdrun=true + shift 2 ;; + -f) + frac=${2} + shift 2 ;; + -i) + filename=${2} + inputfile=true + shift 2 ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit 1 + ;; + esac +done + +#if nothing but np and ppn parameters where given, default to running +# with default input file +if [[ "${inputfile}" == false && "${cmdrun}" == false ]]; then + inputfile=true +fi + +np=$(($P*$Q)) +if [[ "$np" -lt 1 ]]; then + echo "Invalid MPI grid parameters; aborting"; + exit 1 +fi + +# count the number of physical cores +num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') +num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') +total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) + +#Default MPI options +mpi_args="--map-by node:PE=${total_cpu_cores} --bind-to core:overload-allowed" + +if [[ ${openmpi_ucx} == ON ]]; then + # run with openmpi + ucx + mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}" +fi + +if [[ "${inputfile}" == true ]]; then + rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -i ${filename} -f ${frac}" +else + rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -N ${N} --NB ${NB} -f ${frac}" +fi + +#run +${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args} diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in new file mode 100755 index 0000000..5684932 --- /dev/null +++ b/scripts/run_rochpl.in @@ -0,0 +1,410 @@ +#!/usr/bin/env bash +# Author: Noel Chalmers + +# set -x #echo on + +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "rocHPL run helper script" + echo "./run_rochpl " + echo " [-P] Specific MPI grid size: the number of " + echo " rows in MPI grid. 
" + echo " [-Q] Specific MPI grid size: the number of " + echo " columns in MPI grid. " + echo " [-p] Specific node-local MPI grid size: the number " + echo " of rows in node-local MPI grid. Must evenly " + echo " divide P. " + echo " [-q] Specific node-local MPI grid size: the number " + echo " of columns in node-local MPI grid. Must evenly" + echo " divide Q. " + echo " [-N] Specific matrix size: the number of " + echo " rows/columns in global matrix. " + echo " [--NB] Specific panel size: the number of " + echo " rows/columns in panels. " + echo " [-f] Specific split fraction: the percentange to " + echo " split the trailing submatrix. " + echo " [-i] Input file. When set, all other commnand " + echo " line parameters are ignored, and problem " + echo " parameters are read from input file. " + echo " [-h|--help] prints this help message " + echo " [--version] Print rocHPL version number. " +} + +# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root +# true is a system command that completes successfully, function returns success +# prereq: ${ID} must be defined before calling +supported_distro( ) +{ + if [ -z ${ID+foo} ]; then + printf "supported_distro(): \$ID must be set\n" + exit 2 + fi + + case "${ID}" in + ubuntu|centos|rhel|fedora|sles) + true + ;; + *) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n" + exit 2 + ;; + esac +} + +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? -ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit 1 +fi + +# os-release file describes the system +if [[ -e "/etc/os-release" ]]; then + source /etc/os-release +else + echo "This script depends on the /etc/os-release file" + exit 2 +fi + +# The following function exits script if an unsupported distro is detected +supported_distro + +# ################################################# +# global variables +# ################################################# +# Grab options from CMake config +rochpl_bin=@CMAKE_INSTALL_PREFIX@/bin/rochpl +rocm_dir=@ROCM_PATH@ +rocblas_dir=@ROCBLAS_LIB_PATH@ +blas_dir=@HPL_BLAS_DIR@ + +P=1 +Q=1 +p=-1 +q=-1 +N=45312 +NB=384 +frac=0.6 + +filename=HPL.dat +inputfile=false +cmdrun=false + +export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH + +oversubscribe=true + +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,help,version, --options hP:Q:p:q:N:i:f: -- "$@") +else + echo "Need a new version of getopt" + exit 1 +fi + +if [[ $? 
-ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + --version) + ${rochpl_bin} --version + exit 0 + ;; + -P) + P=${2} + shift 2 ;; + -Q) + Q=${2} + shift 2 ;; + -p) + p=${2} + shift 2 ;; + -q) + q=${2} + shift 2 ;; + -N) + N=${2} + cmdrun=true + shift 2 ;; + --NB) + NB=${2} + cmdrun=true + shift 2 ;; + -f) + frac=${2} + shift 2 ;; + -i) + filename=${2} + inputfile=true + shift 2 ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit 1 + ;; + esac +done + +#if nothing but np and ppn parameters where given, default to running +# with default input file +if [[ "${inputfile}" == false && "${cmdrun}" == false ]]; then + inputfile=true +fi + +np=$(($P*$Q)) +if [[ "$np" -lt 1 ]]; then + echo "Invalid MPI grid parameters; aborting"; + exit 1 +fi + +####################################### +# Now figure out the CPU core mappings +####################################### + +# Get local process numbering +set +u +if [[ -n ${OMPI_COMM_WORLD_LOCAL_RANK+x} ]]; then + globalRank=$OMPI_COMM_WORLD_RANK + globalSize=$OMPI_COMM_WORLD_SIZE + rank=$OMPI_COMM_WORLD_LOCAL_RANK + size=$OMPI_COMM_WORLD_LOCAL_SIZE +elif [[ -n ${SLURM_LOCALID+x} ]]; then + globalRank=$SLURM_PROCID + globalSize=$SLURM_NTASKS + rank=$SLURM_LOCALID + size=$SLURM_TASKS_PER_NODE + #Slurm can return a string like "2(x2),1". Get the first number + size=$(echo $size | sed -r 's/^([^.]+).*$/\1/; s/^[^0-9]*([0-9]+).*$/\1/') +fi +set -u + +#Determing node-local grid size +if [[ "$p" -lt 1 && "$q" -lt 1 ]]; then + # no node-local grid was specified, pick defaults + q=$(( (Q<=size) ? Q : size)) + + if [[ $((size % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + p=$(( size/q )) + +elif [[ "$p" -lt 1 ]]; then + #q was specified + + if [[ $((size % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + p=$(( size/q )) + +elif [[ "$q" -lt 1 ]]; then + #p was specified + + if [[ $((size % p)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + q=$(( size/p )) + +else + #Both p and q were specified + if [[ $size -ne $((p*q)) ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi +fi + +# Check that the columns are evenly divided among nodes +if [[ $((P % p)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Must have the same number of P rows on every node; aborting"; + exit 1 +fi + +# Check that the rows are evenly divided among nodes +if [[ $((Q % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Must have the same number of Q columns on every node; aborting"; + exit 1 +fi + +# count the number of physical cores on node +num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') +num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') +total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) + +# Ranks in different processes rows will take distinct chunks of cores +row_stride=$((total_cpu_cores/p)) +col_stride=$((row_stride/q)) + +myp=$((rank%p)) +myq=$((rank/p)) + +#Although ranks are column-major order, we select GPUs in row-major order on node +mygpu=$((myq+myp*q)) + +# Try to detect special Bard-peak core mapping +if [[ -n ${HPL_PLATFORM+x} ]]; then + platform=$HPL_PLATFORM +else + platform=$(cat 
/sys/class/dmi/id/product_name) +fi + +if [[ "$platform" == "BardPeak" || "$platform" == "HPE_CRAY_EX235A" ]]; then + # Special core mapping for BardPeak + + # Debug + # if [[ $globalRank == 0 ]]; then + # echo "BardPeak platform detected" + # fi + + # Sanity check + if [[ $size -gt 8 ]]; then + echo "Unsupported number of ranks on BardPeak platform; aborting"; + exit 1 + fi + + # GCD0 cores="48-55" + # GCD1 cores="56-63" + # GCD2 cores="16-23" + # GCD3 cores="24-31" + # GCD4 cores="0-7" + # GCD5 cores="8-15" + # GCD6 cores="32-39" + # GCD7 cores="40-47" + + root_cores=(48 56 16 24 0 8 32 40) + root_core=${root_cores[mygpu]} + + # First omp place is the root core + omp_places="{$root_core}" + + # First assign the CCD + for i in $(seq $((root_core+1)) $((root_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=8 + + places="{$root_core-$((root_core+7))}" + + # Loop through unassigned CCDs + for c in $(seq $((mygpu+size)) $size 7) + do + iroot_core=${root_cores[c]} + for i in $(seq $((iroot_core)) $((iroot_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+8)) + places+=",{$iroot_core-$((iroot_core+7))}" + done + + if [[ "${oversubscribe}" == true ]]; then + # Add cores from different columns, without their root cores + for j in $(seq 0 $((q-1))) + do + if [[ "$j" == "$myq" ]]; then + continue + fi + for jj in $(seq 0 $size 7) + do + q_gpu=$((jj+j+myp*q)) + q_core=$((root_cores[q_gpu])) + offset=$(( (q_gpu>=size) ? 0 : 1)) + for i in $(seq $((q_core+offset)) $((q_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+8-offset)) + places+=",{$((q_core+offset))-$((q_core+7))}" + done + done + fi + +else + # Default core mapping + root_core=$((myp*row_stride + myq*col_stride)) + + omp_num_threads=${col_stride} + # First omp place is the root core + omp_places="{$root_core}" + + # Make contiuguous chunk of cores (to maximize L1/L2 locality) + for i in $(seq $((root_core+1)) $((root_core+col_stride-1))) + do + omp_places+=",{$i}" + done + + if [[ $col_stride -gt 1 ]]; then + places="{$root_core-$((root_core+col_stride-1))}" + else + places="{$root_core}" + fi + + if [[ "${oversubscribe}" == true ]]; then + # Add cores from different columns, without their root cores + for j in $(seq 0 $((q-1))) + do + if [[ "$j" == "$myq" ]]; then + continue + fi + q_core=$((myp*row_stride + j*col_stride)) + for i in $(seq $((q_core+1)) $((q_core+col_stride-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+col_stride-1)) + + if [[ $col_stride -gt 2 ]]; then + places+=",{$((q_core+1))-$((q_core+col_stride-1))}" + elif [[ $col_stride -gt 1 ]]; then + places+=",{$((q_core+1))}" + fi + + done + fi +fi + +# Export OpenMP config +export OMP_NUM_THREADS=${omp_num_threads} +export OMP_PLACES=${omp_places} +export OMP_PROC_BIND=true + +if [[ $globalRank -lt $size ]]; then + echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places" +fi + +rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac}" +if [[ "${inputfile}" == true ]]; then + rochpl_args+=" -i ${filename}" +else + rochpl_args+=" -N ${N} -NB ${NB}" +fi + +#run +${rochpl_bin} ${rochpl_args} diff --git a/src/HPL_InitGPU.cpp b/src/HPL_InitGPU.cpp new file mode 100644 index 0000000..f5b62b7 --- /dev/null +++ b/src/HPL_InitGPU.cpp @@ -0,0 +1,119 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, 
Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include + +rocblas_handle handle; + +hipStream_t computeStream, dataStream; + +hipEvent_t swapStartEvent[HPL_N_UPD], update[HPL_N_UPD]; +hipEvent_t dgemmStart[HPL_N_UPD], dgemmStop[HPL_N_UPD]; + +static char host_name[MPI_MAX_PROCESSOR_NAME]; + +/* + This function finds out how many MPI processes are running on the same node + and assigns a local rank that can be used to map a process to a device. + This function needs to be called by all the MPI processes. +*/ +void HPL_InitGPU(const HPL_T_grid* GRID) { + char host_name[MPI_MAX_PROCESSOR_NAME]; + + int i, n, namelen, rank, nprocs; + int dev; + + int nprow, npcol, myrow, mycol; + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + MPI_Get_processor_name(host_name, &namelen); + + int localRank = GRID->local_mycol + GRID->local_myrow * GRID->local_npcol; + int localSize = GRID->local_npcol * GRID->local_nprow; + + /* Find out how many GPUs are in the system and their device number */ + int deviceCount; + hipGetDeviceCount(&deviceCount); + + if(deviceCount < 1) { + if(localRank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_InitGPU", + "Node %s found no GPUs. Is the ROCm kernel module loaded?", + host_name); + MPI_Finalize(); + exit(1); + } + + dev = localRank % deviceCount; + +#ifdef HPL_VERBOSE_PRINT + if(rank < localSize) { + hipDeviceProp_t props; + hipGetDeviceProperties(&props, dev); + + printf("GPU Binding: Process %d [(p,q)=(%d,%d)] GPU: %d, pciBusID %x \n", + rank, + GRID->local_myrow, + GRID->local_mycol, + dev, + props.pciBusID); + } +#endif + + /* Assign device to MPI process, initialize BLAS and probe device properties + */ + hipSetDevice(dev); + + hipStreamCreate(&computeStream); + hipStreamCreate(&dataStream); + + hipEventCreate(swapStartEvent + HPL_LOOK_AHEAD); + hipEventCreate(swapStartEvent + HPL_UPD_1); + hipEventCreate(swapStartEvent + HPL_UPD_2); + + hipEventCreate(update + HPL_LOOK_AHEAD); + hipEventCreate(update + HPL_UPD_1); + hipEventCreate(update + HPL_UPD_2); + + hipEventCreate(dgemmStart + HPL_LOOK_AHEAD); + hipEventCreate(dgemmStart + HPL_UPD_1); + hipEventCreate(dgemmStart + HPL_UPD_2); + + hipEventCreate(dgemmStop + HPL_LOOK_AHEAD); + hipEventCreate(dgemmStop + HPL_UPD_1); + hipEventCreate(dgemmStop + HPL_UPD_2); +} + +void HPL_FreeGPU() { + hipEventDestroy(swapStartEvent[HPL_LOOK_AHEAD]); + hipEventDestroy(swapStartEvent[HPL_UPD_1]); + hipEventDestroy(swapStartEvent[HPL_UPD_2]); + + hipEventDestroy(update[HPL_LOOK_AHEAD]); + hipEventDestroy(update[HPL_UPD_1]); + hipEventDestroy(update[HPL_UPD_2]); + + hipEventDestroy(dgemmStart[HPL_LOOK_AHEAD]); + hipEventDestroy(dgemmStart[HPL_UPD_1]); + hipEventDestroy(dgemmStart[HPL_UPD_2]); + + hipEventDestroy(dgemmStop[HPL_LOOK_AHEAD]); + hipEventDestroy(dgemmStop[HPL_UPD_1]); + hipEventDestroy(dgemmStop[HPL_UPD_2]); + + hipStreamDestroy(dataStream); + hipStreamDestroy(computeStream); +} diff --git a/src/HPL_pddriver.cpp b/src/HPL_pddriver.cpp new file mode 100644 index 0000000..8b65d4d --- /dev/null +++ b/src/HPL_pddriver.cpp @@ -0,0 +1,285 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int main(int ARGC, char** ARGV) { + /* + * Purpose + * ======= + * + * main is the main driver program for testing the HPL routines. + * This program is driven by a short data file named "HPL.dat". + * + * --------------------------------------------------------------------- + */ + + int nval[HPL_MAX_PARAM], nbval[HPL_MAX_PARAM], pval[HPL_MAX_PARAM], + qval[HPL_MAX_PARAM], nbmval[HPL_MAX_PARAM], ndvval[HPL_MAX_PARAM], + ndhval[HPL_MAX_PARAM]; + + HPL_T_FACT pfaval[HPL_MAX_PARAM], rfaval[HPL_MAX_PARAM]; + + HPL_T_TOP topval[HPL_MAX_PARAM]; + + HPL_T_grid grid; + HPL_T_palg algo; + HPL_T_test test; + int L1notran, Unotran, align, equil, in, inb, inbm, indh, indv, ipfa, ipq, + irfa, itop, mycol, myrow, ns, nbs, nbms, ndhs, ndvs, npcol, npfs, npqs, + nprow, nrfs, ntps, rank, size, tswap; + HPL_T_ORDER pmapping; + HPL_T_FACT rpfa; + HPL_T_SWAP fswap; + double frac; + int p, q; + + MPI_Init(&ARGC, &ARGV); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + MPI_Op_create(HPL_dmxswp, true, &HPL_DMXSWP); + + /* + * Read and check validity of test parameters from input file + * + * HPL Version 1.0, Linpack benchmark input file + * Your message here + * HPL.out output file name (if any) + * 6 device out (6=stdout,7=stderr,file) + * 4 # of problems sizes (N) + * 29 30 34 35 Ns + * 4 # of NBs + * 1 2 3 4 NBs + * 0 PMAP process mapping (0=Row-,1=Column-major) + * 3 # of process grids (P x Q) + * 2 1 4 Ps + * 2 4 1 Qs + * 16.0 threshold + * 3 # of panel fact + * 0 1 2 PFACTs (0=left, 1=Crout, 2=Right) + * 2 # of recursive stopping criterium + * 2 4 NBMINs (>= 1) + * 1 # of panels in recursion + * 2 NDIVs + * 3 # of recursive panel fact. + * 0 1 2 RFACTs (0=left, 1=Crout, 2=Right) + * 1 # of broadcast + * 0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) + * 1 # of lookahead depth + * 0 DEPTHs (>=0) + * 2 SWAP (0=bin-exch,1=long,2=mix) + * 4 swapping threshold + * 0 L1 in (0=transposed,1=no-transposed) form + * 0 U in (0=transposed,1=no-transposed) form + * 1 Equilibration (0=no,1=yes) + * 8 memory alignment in double (> 0) + */ + HPL_pdinfo(ARGC, + ARGV, + &test, + &ns, + nval, + &nbs, + nbval, + &pmapping, + &npqs, + pval, + qval, + &p, + &q, + &npfs, + pfaval, + &nbms, + nbmval, + &ndvs, + ndvval, + &nrfs, + rfaval, + &ntps, + topval, + &ndhs, + ndhval, + &fswap, + &tswap, + &L1notran, + &Unotran, + &equil, + &align, + &frac); + + /* + * Loop over different process grids - Define process grid. Go to bottom + * of process grid loop if this case does not use my process. 
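+ *
+ * For example (using the sample input above: 4 Ns, 4 NBs, 1 DEPTH,
+ * 1 BCAST, 3 RFACTs, 3 PFACTs, 2 NBMINs, 1 NDIV), the nested loops
+ * below run 4 x 4 x 1 x 1 x 3 x 3 x 2 x 1 = 288 factorizations on
+ * each process grid that contains this process.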
+ */ + for(ipq = 0; ipq < npqs; ipq++) { + (void)HPL_grid_init( + MPI_COMM_WORLD, pmapping, pval[ipq], qval[ipq], p, q, &grid); + (void)HPL_grid_info(&grid, &nprow, &npcol, &myrow, &mycol); + + if((myrow < 0) || (myrow >= nprow) || (mycol < 0) || (mycol >= npcol)) + goto label_end_of_npqs; + + // Initialize GPU + HPL_InitGPU(&grid); + + for(in = 0; in < ns; in++) { /* Loop over various problem sizes */ + for(inb = 0; inb < nbs; inb++) { /* Loop over various blocking factors */ + for(indh = 0; indh < ndhs; + indh++) { /* Loop over various lookahead depths */ + for(itop = 0; itop < ntps; + itop++) { /* Loop over various broadcast topologies */ + for(irfa = 0; irfa < nrfs; + irfa++) { /* Loop over various recursive factorizations */ + for(ipfa = 0; ipfa < npfs; + ipfa++) { /* Loop over various panel factorizations */ + for(inbm = 0; inbm < nbms; + inbm++) { /* Loop over various recursive stopping criteria + */ + for(indv = 0; indv < ndvs; + indv++) { /* Loop over various # of panels in recursion */ + /* + * Set up the algorithm parameters + */ + algo.btopo = topval[itop]; + algo.depth = ndhval[indh]; + algo.nbmin = nbmval[inbm]; + algo.nbdiv = ndvval[indv]; + + algo.pfact = rpfa = pfaval[ipfa]; + + if(L1notran != 0) { + if(rpfa == HPL_LEFT_LOOKING) + algo.pffun = HPL_pdpanllN; + else if(rpfa == HPL_CROUT) + algo.pffun = HPL_pdpancrN; + else + algo.pffun = HPL_pdpanrlN; + + algo.rfact = rpfa = rfaval[irfa]; + if(rpfa == HPL_LEFT_LOOKING) + algo.rffun = HPL_pdrpanllN; + else if(rpfa == HPL_CROUT) + algo.rffun = HPL_pdrpancrN; + else + algo.rffun = HPL_pdrpanrlN; + + algo.upfun = HPL_pdupdateNT; + } else { + if(rpfa == HPL_LEFT_LOOKING) + algo.pffun = HPL_pdpanllT; + else if(rpfa == HPL_CROUT) + algo.pffun = HPL_pdpancrT; + else + algo.pffun = HPL_pdpanrlT; + + algo.rfact = rpfa = rfaval[irfa]; + if(rpfa == HPL_LEFT_LOOKING) + algo.rffun = HPL_pdrpanllT; + else if(rpfa == HPL_CROUT) + algo.rffun = HPL_pdrpancrT; + else + algo.rffun = HPL_pdrpanrlT; + + algo.upfun = HPL_pdupdateTT; + } + + algo.fswap = fswap; + algo.fsthr = tswap; + algo.equil = equil; + algo.align = align; + + algo.frac = frac; + + HPL_pdtest(&test, &grid, &algo, nval[in], nbval[inb]); + } + } + } + } + } + } + } + } + (void)HPL_grid_exit(&grid); + HPL_FreeGPU(); + + label_end_of_npqs:; + } + /* + * Print ending messages, close output file, exit. 
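+ *
+ * A representative summary block is sketched below; the counts are
+ * illustrative only:
+ *
+ * Finished 4 tests with the following results:
+ * 4 tests completed and passed residual checks,
+ * 0 tests completed and failed residual checks,
+ * 0 tests skipped because of illegal input values.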
+ */ + if(rank == 0) { + test.ktest = test.kpass + test.kfail + test.kskip; +#ifndef HPL_DETAILED_TIMING + HPL_fprintf(test.outfp, + "%s%s\n", + "========================================", + "========================================"); +#else + if(test.thrsh > HPL_rzero) + HPL_fprintf(test.outfp, + "%s%s\n", + "========================================", + "========================================"); +#endif + + HPL_fprintf(test.outfp, + "\n%s %6d %s\n", + "Finished", + test.ktest, + "tests with the following results:"); + if(test.thrsh > HPL_rzero) { + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kpass, + "tests completed and passed residual checks,"); + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kfail, + "tests completed and failed residual checks,"); + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kskip, + "tests skipped because of illegal input values."); + } else { + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kpass, + "tests completed without checking,"); + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kskip, + "tests skipped because of illegal input values."); + } + + HPL_fprintf(test.outfp, + "%s%s\n", + "----------------------------------------", + "----------------------------------------"); + HPL_fprintf(test.outfp, "\nEnd of Tests.\n"); + HPL_fprintf(test.outfp, + "%s%s\n", + "========================================", + "========================================"); + + if((test.outfp != stdout) && (test.outfp != stderr)) + (void)fclose(test.outfp); + } + + MPI_Finalize(); + + return (0); +} diff --git a/src/HPL_pdinfo.cpp b/src/HPL_pdinfo.cpp new file mode 100644 index 0000000..f04e79e --- /dev/null +++ b/src/HPL_pdinfo.cpp @@ -0,0 +1,1557 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include +#include +#include + +void HPL_pdinfo(int ARGC, + char** ARGV, + HPL_T_test* TEST, + int* NS, + int* N, + int* NBS, + int* NB, + HPL_T_ORDER* PMAPPIN, + int* NPQS, + int* P, + int* Q, + int* p, + int* q, + int* NPFS, + HPL_T_FACT* PF, + int* NBMS, + int* NBM, + int* NDVS, + int* NDV, + int* NRFS, + HPL_T_FACT* RF, + int* NTPS, + HPL_T_TOP* TP, + int* NDHS, + int* DH, + HPL_T_SWAP* FSWAP, + int* TSWAP, + int* L1NOTRAN, + int* UNOTRAN, + int* EQUIL, + int* ALIGN, + double* FRAC) { + /* + * Purpose + * ======= + * + * HPL_pdinfo reads the startup information for the various tests and + * transmits it to all processes. + * + * Arguments + * ========= + * + * TEST (global output) HPL_T_test * + * On entry, TEST points to a testing data structure. On exit, + * the fields of this data structure are initialized as follows: + * TEST->outfp specifies the output file where the results will + * be printed. It is only defined and used by the process 0 of + * the grid. TEST->thrsh specifies the threshhold value for the + * test ratio. TEST->epsil is the relative machine precision of + * the distributed computer. Finally the test counters, kfail, + * kpass, kskip, ktest are initialized to zero. 
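+ *
+ * As an illustration (values hypothetical), these parameters can be
+ * supplied in two ways:
+ * rochpl -P 2 -Q 4 -N 90112 -NB 384 (command line)
+ * rochpl -P 2 -Q 4 -i HPL.dat (input file)
+ * Both paths fill the same output arguments documented below.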
+ * + * NS (global output) int * + * On exit, NS specifies the number of different problem sizes + * to be tested. NS is less than or equal to HPL_MAX_PARAM. + * + * N (global output) int * + * On entry, N is an array of dimension HPL_MAX_PARAM. On exit, + * the first NS entries of this array contain the problem sizes + * to run the code with. + * + * NBS (global output) int * + * On exit, NBS specifies the number of different distribution + * blocking factors to be tested. NBS must be less than or equal + * to HPL_MAX_PARAM. + * + * NB (global output) int * + * On exit, PMAPPIN specifies the process mapping onto the no- + * des of the MPI machine configuration. PMAPPIN defaults to + * row-major ordering. + * + * PMAPPIN (global output) HPL_T_ORDER * + * On entry, NB is an array of dimension HPL_MAX_PARAM. On exit, + * the first NBS entries of this array contain the values of the + * various distribution blocking factors, to run the code with. + * + * NPQS (global output) int * + * On exit, NPQS specifies the number of different values that + * can be used for P and Q, i.e., the number of process grids to + * run the code with. NPQS must be less than or equal to + * HPL_MAX_PARAM. + * + * P (global output) int * + * On entry, P is an array of dimension HPL_MAX_PARAM. On exit, + * the first NPQS entries of this array contain the values of P, + * the number of process rows of the NPQS grids to run the code + * with. + * + * Q (global output) int * + * On entry, Q is an array of dimension HPL_MAX_PARAM. On exit, + * the first NPQS entries of this array contain the values of Q, + * the number of process columns of the NPQS grids to run the + * code with. + * + * p (global output) int * + * On exit, p specifies the number of rows in the node-local MPI + * grid + * + * q (global output) int * + * On exit, q specifies the number of columns in the node-local + * MPI grid + * + * NPFS (global output) int * + * On exit, NPFS specifies the number of different values that + * can be used for PF : the panel factorization algorithm to run + * the code with. NPFS is less than or equal to HPL_MAX_PARAM. + * + * PF (global output) HPL_T_FACT * + * On entry, PF is an array of dimension HPL_MAX_PARAM. On exit, + * the first NPFS entries of this array contain the various + * panel factorization algorithms to run the code with. + * + * NBMS (global output) int * + * On exit, NBMS specifies the number of various recursive + * stopping criteria to be tested. NBMS must be less than or + * equal to HPL_MAX_PARAM. + * + * NBM (global output) int * + * On entry, NBM is an array of dimension HPL_MAX_PARAM. On + * exit, the first NBMS entries of this array contain the values + * of the various recursive stopping criteria to be tested. + * + * NDVS (global output) int * + * On exit, NDVS specifies the number of various numbers of + * panels in recursion to be tested. NDVS is less than or equal + * to HPL_MAX_PARAM. + * + * NDV (global output) int * + * On entry, NDV is an array of dimension HPL_MAX_PARAM. On + * exit, the first NDVS entries of this array contain the values + * of the various numbers of panels in recursion to be tested. + * + * NRFS (global output) int * + * On exit, NRFS specifies the number of different values that + * can be used for RF : the recursive factorization algorithm to + * be tested. NRFS is less than or equal to HPL_MAX_PARAM. + * + * RF (global output) HPL_T_FACT * + * On entry, RF is an array of dimension HPL_MAX_PARAM. 
On exit, + * the first NRFS entries of this array contain the various + * recursive factorization algorithms to run the code with. + * + * NTPS (global output) int * + * On exit, NTPS specifies the number of different values that + * can be used for the broadcast topologies to be tested. NTPS + * is less than or equal to HPL_MAX_PARAM. + * + * TP (global output) HPL_T_TOP * + * On entry, TP is an array of dimension HPL_MAX_PARAM. On exit, + * the first NTPS entries of this array contain the various + * broadcast (along rows) topologies to run the code with. + * + * NDHS (global output) int * + * On exit, NDHS specifies the number of different values that + * can be used for the lookahead depths to be tested. NDHS is + * less than or equal to HPL_MAX_PARAM. + * + * DH (global output) int * + * On entry, DH is an array of dimension HPL_MAX_PARAM. On + * exit, the first NDHS entries of this array contain the values + * of lookahead depths to run the code with. Such a value is at + * least 0 (no-lookahead) or greater than zero. + * + * FSWAP (global output) HPL_T_SWAP * + * On exit, FSWAP specifies the swapping algorithm to be used in + * all tests. + * + * TSWAP (global output) int * + * On exit, TSWAP specifies the swapping threshold as a number + * of columns when the mixed swapping algorithm was chosen. + * + * L1NOTRA (global output) int * + * On exit, L1NOTRAN specifies whether the upper triangle of the + * panels of columns should be stored in no-transposed form + * (L1NOTRAN=1) or in transposed form (L1NOTRAN=0). + * + * UNOTRAN (global output) int * + * On exit, UNOTRAN specifies whether the panels of rows should + * be stored in no-transposed form (UNOTRAN=1) or transposed + * form (UNOTRAN=0) during their broadcast. + * + * EQUIL (global output) int * + * On exit, EQUIL specifies whether equilibration during the + * swap-broadcast of the panel of rows should be performed + * (EQUIL=1) or not (EQUIL=0). + * + * ALIGN (global output) int * + * On exit, ALIGN specifies the alignment of the dynamically + * allocated buffers in double precision words. ALIGN is greater + * than zero. + * + * FRAC (global output) double * + * On exit, FRAC specifies the percentage in which to split the + * the trailing update. + * + * --------------------------------------------------------------------- + */ + + char file[HPL_LINE_MAX], line[HPL_LINE_MAX], auth[HPL_LINE_MAX], + num[HPL_LINE_MAX]; + FILE* infp; + int* iwork = NULL; + char* lineptr; + int error = 0, fid, i, j, lwork, maxp, nprocs, rank, size; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + /* + * Initialize the TEST data structure with default values + */ + TEST->outfp = stderr; + TEST->epsil = 2.0e-16; + TEST->thrsh = 16.0; + TEST->kfail = TEST->kpass = TEST->kskip = TEST->ktest = 0; + + // parse settings + int _P = 1, _Q = 1, n = 45312, nb = 384; + int _p = -1, _q = -1; + bool cmdlinerun = false; + bool inputfile = false; + double frac = 0.6; + std::string inputFileName = "HPL.dat"; + + for(int i = 1; i < ARGC; i++) { + if(strcmp(ARGV[i], "-h") == 0 || strcmp(ARGV[i], "--help") == 0) { + if(rank == 0) { + std::cout + << "rocHPL client command line options: " + " \n" + "-P [ --ranksP ] arg (=1) Specific MPI grid " + "size: the number of \n" + " rows in MPI grid. " + " \n" + "-Q [ --ranksQ ] arg (=1) Specific MPI grid " + "size: the number of \n" + " columns in MPI grid. " + " \n" + "-N [ --sizeN ] arg (=45312) Specific matrix size: " + "the number of rows \n" + " /columns in global " + "matrix. 
\n" + "-NB [ --sizeNB ] arg (=384) Specific panel size: " + "the number of rows \n" + " /columns in panels. " + " \n" + "-f [ --frac ] arg (=0.6) Specific update split: " + "the percentage to \n" + " split the trailing " + "submatrix. \n" + "-i [ --input ] arg (=HPL.dat) Input file. When set, " + "all other commnand \n" + " line parameters are " + "ignored, and problem \n" + " parameters are read " + "from input file. \n" + "-h [ --help ] Produces this help " + "message \n" + "--version Prints the version " + "number \n"; + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); + exit(0); + } + + if(strcmp(ARGV[i], "--version") == 0) { + if(rank == 0) { + std::cout << "rocHPL version: " << __ROCHPL_VER_MAJOR << "." + << __ROCHPL_VER_MINOR << "." << __ROCHPL_VER_PATCH + << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); + exit(0); + } + + if(strcmp(ARGV[i], "-P") == 0 || strcmp(ARGV[i], "--ranksP") == 0) { + _P = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(_P < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for P. Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-Q") == 0 || strcmp(ARGV[i], "--ranksQ") == 0) { + _Q = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(_Q < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for Q. Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-p") == 0) { + _p = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + } + if(strcmp(ARGV[i], "-q") == 0) { + _q = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + } + + if(strcmp(ARGV[i], "-N") == 0 || strcmp(ARGV[i], "--sizeN") == 0) { + n = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(n < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for N. Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-NB") == 0 || strcmp(ARGV[i], "--sizeNB") == 0) { + nb = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(nb < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for NB. 
Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-f") == 0 || strcmp(ARGV[i], "--frac") == 0) { + frac = atof(ARGV[i + 1]); + i++; + } + if(strcmp(ARGV[i], "-i") == 0 || strcmp(ARGV[i], "--input") == 0) { + inputFileName = ARGV[i + 1]; + inputfile = true; + i++; + } + } + + /* + * Check for enough processes in machine configuration + */ + maxp = _P * _Q; + if(maxp > size) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Need at least %d processes for these tests", + maxp); + MPI_Finalize(); + exit(1); + } + + /* + * Split fraction + */ + *FRAC = frac; + + /*Node-local grid*/ + MPI_Comm nodeComm; + MPI_Comm_split_type( + MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &nodeComm); + + int localRank; + int localSize; + MPI_Comm_rank(nodeComm, &localRank); + MPI_Comm_size(nodeComm, &localSize); + + if(_p < 1 && _q < 1) { // Neither p nor q specified + _q = localSize; // Assume a 1xq node-local grid + _p = 1; + } else if(_p < 1) { // q specified + if(localSize % _q != 0) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Node-local MPI grid cannot be split into q=%d columns", + _q); + MPI_Finalize(); + exit(1); + } + _p = localSize / _q; + } else if(_q < 1) { // p specified + if(localSize % _p != 0) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Node-local MPI grid cannot be split into p=%d rows", + _p); + MPI_Finalize(); + exit(1); + } + _q = localSize / _p; + } else { + if(localSize != _p * _q) { + if(rank == 0) + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Invalid Node-local MPI grid"); + MPI_Finalize(); + exit(1); + } + } + + /*Check grid can be distributed to nodes*/ + if(_Q % _q != 0 || _P % _p != 0) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "MPI grid is not uniformly distributed amoung nodes, " + "(P,Q)=(%d,%d) and (p,q)=(%d,%d)", + _P, + _Q, + _p, + _q); + MPI_Finalize(); + exit(1); + } + MPI_Comm_free(&nodeComm); + /* + * Node-local Process grids, mapping + */ + *p = _p; + *q = _q; + + if(inputfile == false && cmdlinerun == true) { + // We were given run paramters via the cmd line so skip + // trying to read from an input file and just fill a + // TEST structure. 
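+ // Worked example (assuming 16 ranks as two 8-rank nodes): -P 4 -Q 4
+ // with -q 4 gives p = localSize / q = 8 / 4 = 2, and the checks
+ // above pass since P % p == 0 and Q % q == 0.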
+ + /* + * Problem size (>=0) (N) + */ + *NS = 1; + N[0] = n; + /* + * Block size (>=1) (NB) + */ + *NBS = 1; + NB[0] = nb; + /* + * Process grids, mapping, (>=1) (P, Q) + */ + *PMAPPIN = HPL_COLUMN_MAJOR; + *NPQS = 1; + P[0] = _P; + Q[0] = _Q; + /* + * Panel factorization algorithm (PF) + */ + *NPFS = 1; + PF[0] = HPL_RIGHT_LOOKING; // HPL_LEFT_LOOKING, HPL_CROUT; + /* + * Recursive stopping criterium (>=1) (NBM) + */ + *NBMS = 1; + NBM[0] = 16; + /* + * Number of panels in recursion (>=2) (NDV) + */ + *NDVS = 1; + NDV[0] = 2; + /* + * Recursive panel factorization (RF) + */ + *NRFS = 1; + RF[0] = HPL_RIGHT_LOOKING; // HPL_LEFT_LOOKING, HPL_CROUT; + /* + * Broadcast topology (TP) (0=rg, 1=2rg, 2=rgM, 3=2rgM, 4=L) + */ + *NTPS = 1; + TP[0] = HPL_1RING; + /* + * Lookahead depth (>=0) (NDH) + */ + *NDHS = 1; + DH[0] = 1; + /* + * Swapping algorithm (0,1 or 2) (FSWAP) + */ + *FSWAP = HPL_SWAP01; + /* + * Swapping threshold (>=0) (TSWAP) + */ + *TSWAP = 64; + /* + * L1 in (no-)transposed form (0 or 1) + */ + *L1NOTRAN = 1; + /* + * U in (no-)transposed form (0 or 1) + */ + *UNOTRAN = 0; + /* + * Equilibration (0=no, 1=yes) + */ + *EQUIL = 0; + /* + * Memory alignment in bytes (> 0) (ALIGN) + */ + *ALIGN = 8; + + /* + * Compute and broadcast machine epsilon + */ + TEST->epsil = HPL_pdlamch(MPI_COMM_WORLD, HPL_MACH_EPS); + + if(rank == 0) { + if((TEST->outfp = fopen("HPL.out", "w")) == NULL) { error = 1; } + } + (void)HPL_all_reduce((void*)(&error), 1, HPL_INT, HPL_MAX, MPI_COMM_WORLD); + if(error) { + if(rank == 0) + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "cannot open file HPL.out."); + MPI_Finalize(); + exit(1); + } + } else { + /* + * Process 0 reads the input data, broadcasts to other processes and + * writes needed information to TEST->outfp. + */ + char* status; + if(rank == 0) { + /* + * Open file and skip data file header + */ + if((infp = fopen(inputFileName.c_str(), "r")) == NULL) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "cannot open file %s", + inputFileName.c_str()); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + status = fgets(auth, HPL_LINE_MAX - 2, infp); + /* + * Read name and unit number for summary output file + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", file); + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + fid = atoi(num); + if(fid == 6) + TEST->outfp = stdout; + else if(fid == 7) + TEST->outfp = stderr; + else if((TEST->outfp = fopen(file, "w")) == NULL) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "cannot open file %s.", file); + error = 1; + goto label_error; + } + /* + * Read and check the parameter values for the tests. 
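+ * Each value below is read as the first whitespace-delimited token on
+ * its line (fgets followed by sscanf "%s"); the rest of the line is
+ * ignored, which is why entries such as
+ * "6 device out (6=stdout,7=stderr,file)" parse cleanly.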
+ * + * Problem size (>=0) (N) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NS = atoi(num); + if((*NS < 1) || (*NS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %d", + "Number of values of N is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((N[i] = atoi(num)) < 0) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of N less than 0"); + error = 1; + goto label_error; + } + } + /* + * Block size (>=1) (NB) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NBS = atoi(num); + if((*NBS < 1) || (*NBS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of NB is less than 1 or", + "greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NBS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((NB[i] = atoi(num)) < 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of NB less than 1"); + error = 1; + goto label_error; + } + } + /* + * Process grids, mapping, (>=1) (P, Q) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *PMAPPIN = (atoi(num) == 1 ? HPL_COLUMN_MAJOR : HPL_ROW_MAJOR); + + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NPQS = atoi(num); + if((*NPQS < 1) || (*NPQS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of grids is less", + "than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NPQS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((P[i] = atoi(num)) < 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of P less than 1"); + error = 1; + goto label_error; + } + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NPQS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((Q[i] = atoi(num)) < 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of Q less than 1"); + error = 1; + goto label_error; + } + } + /* + * Check for enough processes in machine configuration + */ + maxp = 0; + for(i = 0; i < *NPQS; i++) { + nprocs = P[i] * Q[i]; + maxp = Mmax(maxp, nprocs); + } + if(maxp > size) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Need at least %d processes for these tests", + maxp); + error = 1; + goto label_error; + } + /* + * Checking threshold value (TEST->thrsh) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + TEST->thrsh = atof(num); + /* + * Panel factorization algorithm (PF) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NPFS = atoi(num); + if((*NPFS < 1) || (*NPFS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "number of values of PFACT", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NPFS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + j = atoi(num); + if(j == 0) + PF[i] = 
HPL_LEFT_LOOKING; + else if(j == 1) + PF[i] = HPL_CROUT; + else if(j == 2) + PF[i] = HPL_RIGHT_LOOKING; + else + PF[i] = HPL_RIGHT_LOOKING; + } + /* + * Recursive stopping criterium (>=1) (NBM) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NBMS = atoi(num); + if((*NBMS < 1) || (*NBMS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of NBMIN", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NBMS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((NBM[i] = atoi(num)) < 1) { + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Value of NBMIN less than 1"); + error = 1; + goto label_error; + } + } + /* + * Number of panels in recursion (>=2) (NDV) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NDVS = atoi(num); + if((*NDVS < 1) || (*NDVS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of NDIV", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NDVS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((NDV[i] = atoi(num)) < 2) { + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Value of NDIV less than 2"); + error = 1; + goto label_error; + } + } + /* + * Recursive panel factorization (RF) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NRFS = atoi(num); + if((*NRFS < 1) || (*NRFS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of RFACT", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NRFS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + j = atoi(num); + if(j == 0) + RF[i] = HPL_LEFT_LOOKING; + else if(j == 1) + RF[i] = HPL_CROUT; + else if(j == 2) + RF[i] = HPL_RIGHT_LOOKING; + else + RF[i] = HPL_RIGHT_LOOKING; + } + /* + * Broadcast topology (TP) (0=rg, 1=2rg, 2=rgM, 3=2rgM, 4=L) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NTPS = atoi(num); + if((*NTPS < 1) || (*NTPS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of BCAST", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NTPS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + j = atoi(num); + if(j == 0) + TP[i] = HPL_1RING; + else if(j == 1) + TP[i] = HPL_1RING_M; + else if(j == 2) + TP[i] = HPL_2RING; + else if(j == 3) + TP[i] = HPL_2RING_M; + else if(j == 4) + TP[i] = HPL_BLONG; + else // if(j == 5) + TP[i] = HPL_BLONG_M; + } + /* + * Lookahead depth (>=0) (NDH) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NDHS = atoi(num); + if((*NDHS < 1) || (*NDHS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of DEPTH", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i 
= 0; i < *NDHS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((DH[i] = atoi(num)) < 0) { + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Value of DEPTH less than 0"); + error = 1; + goto label_error; + } + // NC: We require lookahead depth of 1 + if(DH[i] != 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of DEPTH must be 1"); + error = 1; + goto label_error; + } + } + /* + * Swapping algorithm (0,1 or 2) (FSWAP) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + j = atoi(num); + if(j == 0) + *FSWAP = HPL_SWAP00; + else if(j == 1) + *FSWAP = HPL_SWAP01; + else if(j == 2) + *FSWAP = HPL_SW_MIX; + else + *FSWAP = HPL_SWAP01; + // NC: Only one rowswapping algorithm implemented + if(*FSWAP != HPL_SWAP01) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of SWAP must be 1"); + error = 1; + goto label_error; + } + /* + * Swapping threshold (>=0) (TSWAP) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *TSWAP = atoi(num); + if(*TSWAP <= 0) *TSWAP = 0; + /* + * L1 in (no-)transposed form (0 or 1) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *L1NOTRAN = atoi(num); + if((*L1NOTRAN != 0) && (*L1NOTRAN != 1)) *L1NOTRAN = 0; + /* + * U in (no-)transposed form (0 or 1) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *UNOTRAN = atoi(num); + if((*UNOTRAN != 0) && (*UNOTRAN != 1)) *UNOTRAN = 0; + + // NC: We don't support holding U in no-transpose form anymore + if(*UNOTRAN != 0) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "U in no-transposed form unsupported"); + error = 1; + goto label_error; + } + /* + * Equilibration (0=no, 1=yes) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *EQUIL = atoi(num); + if((*EQUIL != 0) && (*EQUIL != 1)) *EQUIL = 1; + + // NC: We don't currently support Equilibration + if(*EQUIL != 0) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Equilibration currently unsupported"); + error = 1; + goto label_error; + } + /* + * Memory alignment in bytes (> 0) (ALIGN) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *ALIGN = atoi(num); + if(*ALIGN <= 0) *ALIGN = 4; + + /* + * Close input file + */ + label_error: + (void)fclose(infp); + } else { + TEST->outfp = NULL; + } + + /* + * Check for error on reading input file + */ + (void)HPL_all_reduce((void*)(&error), 1, HPL_INT, HPL_MAX, MPI_COMM_WORLD); + if(error) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal input in file HPL.dat. Exiting ..."); + MPI_Finalize(); + exit(1); + } + /* + * Compute and broadcast machine epsilon + */ + TEST->epsil = HPL_pdlamch(MPI_COMM_WORLD, HPL_MACH_EPS); + /* + * Pack information arrays and broadcast + */ + (void)HPL_broadcast( + (void*)(&(TEST->thrsh)), 1, HPL_DOUBLE, 0, MPI_COMM_WORLD); + /* + * Broadcast array sizes + */ + iwork = (int*)malloc((size_t)(15) * sizeof(int)); + if(rank == 0) { + iwork[0] = *NS; + iwork[1] = *NBS; + iwork[2] = (*PMAPPIN == HPL_ROW_MAJOR ? 0 : 1); + iwork[3] = *NPQS; + iwork[4] = *NPFS; + iwork[5] = *NBMS; + iwork[6] = *NDVS; + iwork[7] = *NRFS; + iwork[8] = *NTPS; + iwork[9] = *NDHS; + iwork[10] = *TSWAP; + iwork[11] = *L1NOTRAN; + iwork[12] = *UNOTRAN; + iwork[13] = *EQUIL; + iwork[14] = *ALIGN; + } + (void)HPL_broadcast((void*)iwork, 15, HPL_INT, 0, MPI_COMM_WORLD); + if(rank != 0) { + *NS = iwork[0]; + *NBS = iwork[1]; + *PMAPPIN = (iwork[2] == 0 ? 
HPL_ROW_MAJOR : HPL_COLUMN_MAJOR); + *NPQS = iwork[3]; + *NPFS = iwork[4]; + *NBMS = iwork[5]; + *NDVS = iwork[6]; + *NRFS = iwork[7]; + *NTPS = iwork[8]; + *NDHS = iwork[9]; + *TSWAP = iwork[10]; + *L1NOTRAN = iwork[11]; + *UNOTRAN = iwork[12]; + *EQUIL = iwork[13]; + *ALIGN = iwork[14]; + } + if(iwork) free(iwork); + /* + * Pack information arrays and broadcast + */ + lwork = (*NS) + (*NBS) + 2 * (*NPQS) + (*NPFS) + (*NBMS) + (*NDVS) + + (*NRFS) + (*NTPS) + (*NDHS) + 1; + iwork = (int*)malloc((size_t)(lwork) * sizeof(int)); + if(rank == 0) { + j = 0; + for(i = 0; i < *NS; i++) { + iwork[j] = N[i]; + j++; + } + for(i = 0; i < *NBS; i++) { + iwork[j] = NB[i]; + j++; + } + for(i = 0; i < *NPQS; i++) { + iwork[j] = P[i]; + j++; + } + for(i = 0; i < *NPQS; i++) { + iwork[j] = Q[i]; + j++; + } + for(i = 0; i < *NPFS; i++) { + if(PF[i] == HPL_LEFT_LOOKING) + iwork[j] = 0; + else if(PF[i] == HPL_CROUT) + iwork[j] = 1; + else if(PF[i] == HPL_RIGHT_LOOKING) + iwork[j] = 2; + j++; + } + for(i = 0; i < *NBMS; i++) { + iwork[j] = NBM[i]; + j++; + } + for(i = 0; i < *NDVS; i++) { + iwork[j] = NDV[i]; + j++; + } + for(i = 0; i < *NRFS; i++) { + if(RF[i] == HPL_LEFT_LOOKING) + iwork[j] = 0; + else if(RF[i] == HPL_CROUT) + iwork[j] = 1; + else if(RF[i] == HPL_RIGHT_LOOKING) + iwork[j] = 2; + j++; + } + for(i = 0; i < *NTPS; i++) { + if(TP[i] == HPL_1RING) + iwork[j] = 0; + else if(TP[i] == HPL_1RING_M) + iwork[j] = 1; + else if(TP[i] == HPL_2RING) + iwork[j] = 2; + else if(TP[i] == HPL_2RING_M) + iwork[j] = 3; + else if(TP[i] == HPL_BLONG) + iwork[j] = 4; + else if(TP[i] == HPL_BLONG_M) + iwork[j] = 5; + j++; + } + for(i = 0; i < *NDHS; i++) { + iwork[j] = DH[i]; + j++; + } + + if(*FSWAP == HPL_SWAP00) + iwork[j] = 0; + else if(*FSWAP == HPL_SWAP01) + iwork[j] = 1; + else if(*FSWAP == HPL_SW_MIX) + iwork[j] = 2; + j++; + } + (void)HPL_broadcast((void*)iwork, lwork, HPL_INT, 0, MPI_COMM_WORLD); + if(rank != 0) { + j = 0; + for(i = 0; i < *NS; i++) { + N[i] = iwork[j]; + j++; + } + for(i = 0; i < *NBS; i++) { + NB[i] = iwork[j]; + j++; + } + for(i = 0; i < *NPQS; i++) { + P[i] = iwork[j]; + j++; + } + for(i = 0; i < *NPQS; i++) { + Q[i] = iwork[j]; + j++; + } + + for(i = 0; i < *NPFS; i++) { + if(iwork[j] == 0) + PF[i] = HPL_LEFT_LOOKING; + else if(iwork[j] == 1) + PF[i] = HPL_CROUT; + else if(iwork[j] == 2) + PF[i] = HPL_RIGHT_LOOKING; + j++; + } + for(i = 0; i < *NBMS; i++) { + NBM[i] = iwork[j]; + j++; + } + for(i = 0; i < *NDVS; i++) { + NDV[i] = iwork[j]; + j++; + } + for(i = 0; i < *NRFS; i++) { + if(iwork[j] == 0) + RF[i] = HPL_LEFT_LOOKING; + else if(iwork[j] == 1) + RF[i] = HPL_CROUT; + else if(iwork[j] == 2) + RF[i] = HPL_RIGHT_LOOKING; + j++; + } + for(i = 0; i < *NTPS; i++) { + if(iwork[j] == 0) + TP[i] = HPL_1RING; + else if(iwork[j] == 1) + TP[i] = HPL_1RING_M; + else if(iwork[j] == 2) + TP[i] = HPL_2RING; + else if(iwork[j] == 3) + TP[i] = HPL_2RING_M; + else if(iwork[j] == 4) + TP[i] = HPL_BLONG; + else if(iwork[j] == 5) + TP[i] = HPL_BLONG_M; + j++; + } + for(i = 0; i < *NDHS; i++) { + DH[i] = iwork[j]; + j++; + } + + if(iwork[j] == 0) + *FSWAP = HPL_SWAP00; + else if(iwork[j] == 1) + *FSWAP = HPL_SWAP01; + else if(iwork[j] == 2) + *FSWAP = HPL_SW_MIX; + j++; + } + if(iwork) free(iwork); + } + + /* + * regurgitate input + */ + if(rank == 0) { + HPL_fprintf(TEST->outfp, + "%s%s\n", + "========================================", + "========================================"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "HPLinpack 2.2 -- High-Performance Linpack benchmark -- ", + " February 24, 
2016"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "Written by A. Petitet and R. Clint Whaley, ", + "Innovative Computing Laboratory, UTK"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "Modified by Piotr Luszczek, ", + "Innovative Computing Laboratory, UTK"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "Modified by Julien Langou, ", + "University of Colorado Denver"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "========================================", + "========================================"); + + HPL_fprintf(TEST->outfp, + "\n%s\n", + "An explanation of the input/output parameters follows:"); + HPL_fprintf(TEST->outfp, "%s\n", "T/V : Wall time / encoded variant."); + HPL_fprintf( + TEST->outfp, "%s\n", "N : The order of the coefficient matrix A."); + HPL_fprintf( + TEST->outfp, "%s\n", "NB : The partitioning blocking factor."); + HPL_fprintf(TEST->outfp, "%s\n", "P : The number of process rows."); + HPL_fprintf(TEST->outfp, "%s\n", "Q : The number of process columns."); + HPL_fprintf(TEST->outfp, + "%s\n", + "Time : Time in seconds to solve the linear system."); + HPL_fprintf(TEST->outfp, + "%s\n\n", + "Gflops : Rate of execution for solving the linear system."); + HPL_fprintf( + TEST->outfp, "%s\n", "The following parameter values will be used:"); + /* + * Problem size + */ + HPL_fprintf(TEST->outfp, "\nN :"); + for(i = 0; i < Mmin(8, *NS); i++) HPL_fprintf(TEST->outfp, "%8d ", N[i]); + if(*NS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NS); i++) HPL_fprintf(TEST->outfp, "%8d ", N[i]); + if(*NS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NS; i++) HPL_fprintf(TEST->outfp, "%8d ", N[i]); + } + } + /* + * Distribution blocking factor + */ + HPL_fprintf(TEST->outfp, "\nNB :"); + for(i = 0; i < Mmin(8, *NBS); i++) HPL_fprintf(TEST->outfp, "%8d ", NB[i]); + if(*NBS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NBS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NB[i]); + if(*NBS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NBS; i++) HPL_fprintf(TEST->outfp, "%8d ", NB[i]); + } + } + /* + * Process mapping + */ + HPL_fprintf(TEST->outfp, "\nPMAP :"); + if(*PMAPPIN == HPL_ROW_MAJOR) + HPL_fprintf(TEST->outfp, " Row-major process mapping"); + else if(*PMAPPIN == HPL_COLUMN_MAJOR) + HPL_fprintf(TEST->outfp, " Column-major process mapping"); + /* + * Process grid + */ + HPL_fprintf(TEST->outfp, "\nP :"); + for(i = 0; i < Mmin(8, *NPQS); i++) HPL_fprintf(TEST->outfp, "%8d ", P[i]); + if(*NPQS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NPQS); i++) + HPL_fprintf(TEST->outfp, "%8d ", P[i]); + if(*NPQS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NPQS; i++) HPL_fprintf(TEST->outfp, "%8d ", P[i]); + } + } + HPL_fprintf(TEST->outfp, "\nQ :"); + for(i = 0; i < Mmin(8, *NPQS); i++) HPL_fprintf(TEST->outfp, "%8d ", Q[i]); + if(*NPQS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NPQS); i++) + HPL_fprintf(TEST->outfp, "%8d ", Q[i]); + if(*NPQS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NPQS; i++) HPL_fprintf(TEST->outfp, "%8d ", Q[i]); + } + } + /* + * Panel Factorization + */ + HPL_fprintf(TEST->outfp, "\nPFACT :"); + for(i = 0; i < Mmin(8, *NPFS); i++) { + if(PF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(PF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(PF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NPFS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, 
*NPFS); i++) { + if(PF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(PF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(PF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NPFS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NPFS; i++) { + if(PF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(PF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(PF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + } + } + /* + * Recursive stopping criterium + */ + HPL_fprintf(TEST->outfp, "\nNBMIN :"); + for(i = 0; i < Mmin(8, *NBMS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NBM[i]); + if(*NBMS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NBMS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NBM[i]); + if(*NBMS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NBMS; i++) HPL_fprintf(TEST->outfp, "%8d ", NBM[i]); + } + } + /* + * Number of panels in recursion + */ + HPL_fprintf(TEST->outfp, "\nNDIV :"); + for(i = 0; i < Mmin(8, *NDVS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NDV[i]); + if(*NDVS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NDVS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NDV[i]); + if(*NDVS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NDVS; i++) HPL_fprintf(TEST->outfp, "%8d ", NDV[i]); + } + } + /* + * Recursive Factorization + */ + HPL_fprintf(TEST->outfp, "\nRFACT :"); + for(i = 0; i < Mmin(8, *NRFS); i++) { + if(RF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(RF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(RF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NRFS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NRFS); i++) { + if(RF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(RF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(RF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NRFS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NRFS; i++) { + if(RF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(RF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(RF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + } + } + /* + * Broadcast topology + */ + HPL_fprintf(TEST->outfp, "\nBCAST :"); + for(i = 0; i < Mmin(8, *NTPS); i++) { + if(TP[i] == HPL_1RING) + HPL_fprintf(TEST->outfp, " 1ring "); + else if(TP[i] == HPL_1RING_M) + HPL_fprintf(TEST->outfp, " 1ringM "); + else if(TP[i] == HPL_2RING) + HPL_fprintf(TEST->outfp, " 2ring "); + else if(TP[i] == HPL_2RING_M) + HPL_fprintf(TEST->outfp, " 2ringM "); + else if(TP[i] == HPL_BLONG) + HPL_fprintf(TEST->outfp, " Blong "); + else if(TP[i] == HPL_BLONG_M) + HPL_fprintf(TEST->outfp, " BlongM "); + } + if(*NTPS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NTPS); i++) { + if(TP[i] == HPL_1RING) + HPL_fprintf(TEST->outfp, " 1ring "); + else if(TP[i] == HPL_1RING_M) + HPL_fprintf(TEST->outfp, " 1ringM "); + else if(TP[i] == HPL_2RING) + HPL_fprintf(TEST->outfp, " 2ring "); + else if(TP[i] == HPL_2RING_M) + HPL_fprintf(TEST->outfp, " 2ringM "); + else if(TP[i] == HPL_BLONG) + HPL_fprintf(TEST->outfp, " Blong "); + else if(TP[i] == HPL_BLONG_M) + HPL_fprintf(TEST->outfp, " BlongM "); + } + if(*NTPS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NTPS; i++) { + if(TP[i] == HPL_1RING) + 
HPL_fprintf(TEST->outfp, " 1ring "); + else if(TP[i] == HPL_1RING_M) + HPL_fprintf(TEST->outfp, " 1ringM "); + else if(TP[i] == HPL_2RING) + HPL_fprintf(TEST->outfp, " 2ring "); + else if(TP[i] == HPL_2RING_M) + HPL_fprintf(TEST->outfp, " 2ringM "); + else if(TP[i] == HPL_BLONG) + HPL_fprintf(TEST->outfp, " Blong "); + else if(TP[i] == HPL_BLONG_M) + HPL_fprintf(TEST->outfp, " BlongM "); + } + } + } + /* + * Lookahead depths + */ + HPL_fprintf(TEST->outfp, "\nDEPTH :"); + for(i = 0; i < Mmin(8, *NDHS); i++) HPL_fprintf(TEST->outfp, "%8d ", DH[i]); + if(*NDHS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NDHS); i++) + HPL_fprintf(TEST->outfp, "%8d ", DH[i]); + if(*NDHS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NDHS; i++) HPL_fprintf(TEST->outfp, "%8d ", DH[i]); + } + } + /* + * Swapping algorithm + */ + HPL_fprintf(TEST->outfp, "\nSWAP :"); + if(*FSWAP == HPL_SWAP00) + HPL_fprintf(TEST->outfp, " Binary-exchange"); + else if(*FSWAP == HPL_SWAP01) + HPL_fprintf(TEST->outfp, " Spread-roll (long)"); + else if(*FSWAP == HPL_SW_MIX) + HPL_fprintf(TEST->outfp, " Mix (threshold = %d)", *TSWAP); + /* + * L1 storage form + */ + HPL_fprintf(TEST->outfp, "\nL1 :"); + if(*L1NOTRAN != 0) + HPL_fprintf(TEST->outfp, " no-transposed form"); + else + HPL_fprintf(TEST->outfp, " transposed form"); + /* + * U storage form + */ + HPL_fprintf(TEST->outfp, "\nU :"); + if(*UNOTRAN != 0) + HPL_fprintf(TEST->outfp, " no-transposed form"); + else + HPL_fprintf(TEST->outfp, " transposed form"); + /* + * Equilibration + */ + HPL_fprintf(TEST->outfp, "\nEQUIL :"); + if(*EQUIL != 0) + HPL_fprintf(TEST->outfp, " yes"); + else + HPL_fprintf(TEST->outfp, " no"); + /* + * Alignment + */ + HPL_fprintf(TEST->outfp, "\nALIGN : %d double precision words", *ALIGN); + + HPL_fprintf(TEST->outfp, "\n\n"); + /* + * For testing only + */ + if(TEST->thrsh > HPL_rzero) { + HPL_fprintf(TEST->outfp, + "%s%s\n\n", + "----------------------------------------", + "----------------------------------------"); + HPL_fprintf(TEST->outfp, + "%s\n", + "- The matrix A is randomly generated for each test."); + HPL_fprintf(TEST->outfp, + "%s\n", + "- The following scaled residual check will be computed:"); + HPL_fprintf(TEST->outfp, + "%s\n", + " ||Ax-b||_oo / ( eps * ( || x ||_oo * || A ||_oo + || " + "b ||_oo ) * N )"); + HPL_fprintf(TEST->outfp, + "%s %21.6e\n", + "- The relative machine precision (eps) is taken to be ", + TEST->epsil); + HPL_fprintf( + TEST->outfp, + "%s %11.1f\n\n", + "- Computational tests pass if scaled residuals are less than ", + TEST->thrsh); + } + } +} diff --git a/src/HPL_pdtest.cpp b/src/HPL_pdtest.cpp new file mode 100644 index 0000000..afdda39 --- /dev/null +++ b/src/HPL_pdtest.cpp @@ -0,0 +1,501 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include +#include "hpl.hpp" + +void HPL_pdtest(HPL_T_test* TEST, + HPL_T_grid* GRID, + HPL_T_palg* ALGO, + const int N, + const int NB) { +/* + * Purpose + * ======= + * + * HPL_pdtest performs one test given a set of parameters such as the + * process grid, the problem size, the distribution blocking factor ... + * This function generates the data, calls and times the linear system + * solver, checks the accuracy of the obtained vector solution and + * writes this information to the file pointed to by TEST->outfp. + * + * Arguments + * ========= + * + * TEST (global input) HPL_T_test * + * On entry, TEST points to a testing data structure: outfp + * specifies the output file where the results will be printed. + * It is only defined and used by the process 0 of the grid. + * thrsh specifies the threshhold value for the test ratio. + * Concretely, a test is declared "PASSED" if and only if the + * following inequality is satisfied: + * ||Ax-b||_oo / ( epsil * + * ( || x ||_oo * || A ||_oo + || b ||_oo ) * + * N ) < thrsh. + * epsil is the relative machine precision of the distributed + * computer. Finally the test counters, kfail, kpass, kskip and + * ktest are updated as follows: if the test passes, kpass is + * incremented by one; if the test fails, kfail is incremented + * by one; if the test is skipped, kskip is incremented by one. + * ktest is left unchanged. + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * ALGO (global input) HPL_T_palg * + * On entry, ALGO points to the data structure containing the + * algorithmic parameters to be used for this test. + * + * N (global input) const int + * On entry, N specifies the order of the coefficient matrix A. + * N must be at least zero. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * --------------------------------------------------------------------- + */ +/* + * .. Local Variables .. + */ +#ifdef HPL_DETAILED_TIMING + double HPL_w[HPL_TIMING_N]; +#endif + HPL_T_pmat mat; + double wtime[1]; + int ierr; + double Anorm1, AnormI, Gflops, Xnorm1, XnormI, BnormI, resid0, resid1; + double* Bptr; + double* dBptr; + static int first = 1; + int ii, ip2, mycol, myrow, npcol, nprow, nq; + char ctop, cpfact, crfact; + time_t current_time_start, current_time_end; + + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + + /* + * Allocate matrix, right-hand-side, and vector solution x. [ A | b ] is + * N by N+1. One column is added in every process column for the solve. + * The result however is stored in a 1 x N vector replicated in every + * process row. In every process, A is lda * (nq+1), x is 1 * nq and the + * workspace is mp. + */ + ierr = HPL_pdmatgen(TEST, GRID, ALGO, &mat, N, NB); + + if(ierr != HPL_SUCCESS) { + (TEST->kskip)++; + HPL_pdmatfree(&mat); + return; + } + + /* Create row-swapping data type */ + MPI_Type_contiguous(NB + 4, MPI_DOUBLE, &PDFACT_ROW); + MPI_Type_commit(&PDFACT_ROW); + + /* + * generate matrix and right-hand-side, [ A | b ] which is N by N+1. 
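+ *
+ * As a concrete illustration (assuming N = 45312, NB = 384 on a
+ * 2 x 2 process grid): each process holds mp = nq = 45312 / 2 = 22656
+ * rows and columns of [ A | b ], plus the one extra local column per
+ * process column allocated above for the solve.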
+ */
+ HPL_pdrandmat(GRID, N, N + 1, NB, mat.dA, mat.ld, HPL_ISEED);
+
+ /*
+ * Solve linear system
+ */
+ HPL_ptimer_boot();
+ (void)HPL_barrier(GRID->all_comm);
+ time(&current_time_start);
+ HPL_ptimer(0);
+ HPL_pdgesv(GRID, ALGO, &mat);
+ HPL_ptimer(0);
+ time(&current_time_end);
+
+ /*
+ * Gather max of all CPU and WALL clock timings and print timing results
+ */
+ HPL_ptimer_combine(
+ GRID->all_comm, HPL_AMAX_PTIME, HPL_WALL_PTIME, 1, 0, wtime);
+
+ if((myrow == 0) && (mycol == 0)) {
+ if(first) {
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "========================================",
+ "========================================");
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "T/V N NB P Q",
+ " Time Gflops");
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "----------------------------------------",
+ "----------------------------------------");
+ if(TEST->thrsh <= HPL_rzero) first = 0;
+ }
+ /*
+ * 2/3 N^3 - 1/2 N^2 flops for LU factorization + 2 N^2 flops for solve.
+ * Print WALL time
+ */
+ Gflops = (((double)(N) / 1.0e+9) * ((double)(N) / wtime[0])) *
+ ((2.0 / 3.0) * (double)(N) + (3.0 / 2.0));
+
+ cpfact = (((HPL_T_FACT)(ALGO->pfact) == (HPL_T_FACT)(HPL_LEFT_LOOKING))
+ ? (char)('L')
+ : (((HPL_T_FACT)(ALGO->pfact) == (HPL_T_FACT)(HPL_CROUT))
+ ? (char)('C')
+ : (char)('R')));
+ crfact = (((HPL_T_FACT)(ALGO->rfact) == (HPL_T_FACT)(HPL_LEFT_LOOKING))
+ ? (char)('L')
+ : (((HPL_T_FACT)(ALGO->rfact) == (HPL_T_FACT)(HPL_CROUT))
+ ? (char)('C')
+ : (char)('R')));
+
+ if(ALGO->btopo == HPL_1RING)
+ ctop = '0';
+ else if(ALGO->btopo == HPL_1RING_M)
+ ctop = '1';
+ else if(ALGO->btopo == HPL_2RING)
+ ctop = '2';
+ else if(ALGO->btopo == HPL_2RING_M)
+ ctop = '3';
+ else if(ALGO->btopo == HPL_BLONG)
+ ctop = '4';
+ else /* if( ALGO->btopo == HPL_BLONG_M ) */
+ ctop = '5';
+
+ if(wtime[0] > HPL_rzero) {
+ HPL_fprintf(TEST->outfp,
+ "W%c%1d%c%c%1d%c%1d%12d %5d %5d %5d %18.2f %18.3e\n",
+ (GRID->order == HPL_ROW_MAJOR ? 'R' : 'C'),
+ ALGO->depth,
+ ctop,
+ crfact,
+ ALGO->nbdiv,
+ cpfact,
+ ALGO->nbmin,
+ N,
+ NB,
+ nprow,
+ npcol,
+ wtime[0],
+ Gflops);
+ HPL_fprintf(TEST->outfp,
+ "HPL_pdgesv() start time %s\n",
+ ctime(&current_time_start));
+ HPL_fprintf(TEST->outfp,
+ "HPL_pdgesv() end time %s\n",
+ ctime(&current_time_end));
+ }
+#ifdef HPL_PROGRESS_REPORT
+ printf("Final Score: %7.4e GFLOPS \n", Gflops);
+#endif
+ }
+#ifdef HPL_DETAILED_TIMING
+ HPL_ptimer_combine(GRID->all_comm,
+ HPL_AMAX_PTIME,
+ HPL_WALL_PTIME,
+ HPL_TIMING_N,
+ HPL_TIMING_BEG,
+ HPL_w);
+ if((myrow == 0) && (mycol == 0)) {
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "--VVV--VVV--VVV--VVV--VVV--VVV--VVV--V",
+ "VV--VVV--VVV--VVV--VVV--VVV--VVV--VVV-");
+ /*
+ * Lbcast
+ */
+ if(HPL_w[HPL_TIMING_LBCAST - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "Max aggregated wall time bcast . . . : %18.2f\n",
+ HPL_w[HPL_TIMING_LBCAST - HPL_TIMING_BEG]);
+ /*
+ * Panel copy
+ */
+ if(HPL_w[HPL_TIMING_COPY - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "+ Max aggregated wall time panel copy: %18.2f\n",
+ HPL_w[HPL_TIMING_COPY - HPL_TIMING_BEG]);
+ /*
+ * Recursive panel factorization
+ */
+ if(HPL_w[HPL_TIMING_RPFACT - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "+ Max aggregated wall time rfact . . : %18.2f\n",
+ HPL_w[HPL_TIMING_RPFACT - HPL_TIMING_BEG]);
+ /*
+ * Panel factorization
+ */
+ if(HPL_w[HPL_TIMING_PFACT - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "+ + Max aggregated wall time pfact . 
: %18.2f\n", + HPL_w[HPL_TIMING_PFACT - HPL_TIMING_BEG]); + /* + * Panel factorization (swap) + */ + if(HPL_w[HPL_TIMING_MXSWP - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "+ + Max aggregated wall time mxswp . : %18.2f\n", + HPL_w[HPL_TIMING_MXSWP - HPL_TIMING_BEG]); + /* + * Update (swap) + */ + if(HPL_w[HPL_TIMING_LASWP - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "Max aggregated wall time laswp . . . : %18.2f\n", + HPL_w[HPL_TIMING_LASWP - HPL_TIMING_BEG]); + /* + * Update + */ + if(HPL_w[HPL_TIMING_UPDATE - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "Max aggregated wall time update . . : %18.2f\n", + HPL_w[HPL_TIMING_UPDATE - HPL_TIMING_BEG]); + /* + * Upper triangular system solve + */ + if(HPL_w[HPL_TIMING_PTRSV - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "Max aggregated wall time up tr sv . : %18.2f\n", + HPL_w[HPL_TIMING_PTRSV - HPL_TIMING_BEG]); + + if(TEST->thrsh <= HPL_rzero) + HPL_fprintf(TEST->outfp, + "%s%s\n", + "========================================", + "========================================"); + } +#endif + + /* Release row swapping datatype */ + MPI_Type_free(&PDFACT_ROW); + + /* + * Quick return, if I am not interested in checking the computations + */ + if(TEST->thrsh <= HPL_rzero) { + (TEST->kpass)++; + HPL_pdmatfree(&mat); + return; + } + /* + * Check info returned by solve + */ + if(mat.info != 0) { + if((myrow == 0) && (mycol == 0)) + HPL_pwarn(TEST->outfp, + __LINE__, + "HPL_pdtest", + "%s %d, %s", + "Error code returned by solve is", + mat.info, + "skip"); + (TEST->kskip)++; + HPL_pdmatfree(&mat); + return; + } + /* + * Check computation, re-generate [ A | b ], compute norm 1 and inf of A and + * x, and norm inf of b - A x. Display residual checks. + */ + HPL_pdrandmat(GRID, N, N + 1, NB, mat.dA, mat.ld, HPL_ISEED); + + Anorm1 = HPL_pdlange(GRID, HPL_NORM_1, N, N, NB, mat.dA, mat.ld); + AnormI = HPL_pdlange(GRID, HPL_NORM_I, N, N, NB, mat.dA, mat.ld); + /* + * Because x is distributed in process rows, switch the norms + */ + XnormI = HPL_pdlange(GRID, HPL_NORM_1, 1, N, NB, mat.dX, 1); + Xnorm1 = HPL_pdlange(GRID, HPL_NORM_I, 1, N, NB, mat.dX, 1); + /* + * If I am in the col that owns b, (1) compute local BnormI, (2) all_reduce to + * find the max (in the col). Then (3) broadcast along the rows so that every + * process has BnormI. Note that since we use a uniform distribution in + * [-0.5,0.5] for the entries of B, it is very likely that BnormI (<=,~) 0.5. 
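+   * Since b is a single column, || b ||_oo is simply max_i |b_i|, which is
+   * why a device idamax followed by an absolute value is all that is
+   * needed below.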
+   */
+
+  // Bptr = Mptr( mat.A , 0, nq, mat.ld );
+  size_t BptrBytes = Mmax(mat.nq, mat.ld) * sizeof(double);
+  Bptr             = (double*)malloc(BptrBytes);
+
+  nq    = HPL_numroc(N, NB, NB, mycol, 0, npcol);
+  dBptr = Mptr(mat.dA, 0, nq, mat.ld);
+  if(mycol == HPL_indxg2p(N, NB, NB, 0, npcol)) {
+    if(mat.mp > 0) {
+      // int id = HPL_idamax( mat.mp, Bptr, 1);
+      // BnormI = Bptr[id];
+      int id;
+      rocblas_idamax(handle, mat.mp, dBptr, 1, &id);
+
+      // Note: id is in Fortran indexing
+      hipMemcpy(
+          &BnormI, dBptr + id - 1, 1 * sizeof(double), hipMemcpyDeviceToHost);
+      BnormI = Mabs(BnormI);
+    } else {
+      BnormI = HPL_rzero;
+    }
+    (void)HPL_all_reduce(
+        (void*)(&BnormI), 1, HPL_DOUBLE, HPL_MAX, GRID->col_comm);
+  }
+  (void)HPL_broadcast((void*)(&BnormI),
+                      1,
+                      HPL_DOUBLE,
+                      HPL_indxg2p(N, NB, NB, 0, npcol),
+                      GRID->row_comm);
+  /*
+   * If I own b, compute ( b - A x ) and ( - A x ) otherwise
+   */
+
+  // rocBLAS < v4.2 has an integer overflow problem in dgemv, so
+  // chunk the nq columns to compute the full dgemv
+  const int nq_chunk = std::numeric_limits<int>::max() / (mat.ld);
+
+  if(mycol == HPL_indxg2p(N, NB, NB, 0, npcol)) {
+    const double one  = 1.0;
+    const double mone = -1.0;
+
+    for(int nn = 0; nn < nq; nn += nq_chunk) {
+      int nb = Mmin(nq - nn, nq_chunk);
+      rocblas_dgemv(handle,
+                    rocblas_operation_none,
+                    mat.mp,
+                    nb,
+                    &mone,
+                    Mptr(mat.dA, 0, nn, mat.ld),
+                    mat.ld,
+                    Mptr(mat.dX, 0, nn, 1),
+                    1,
+                    &one,
+                    dBptr,
+                    1);
+    }
+
+    hipMemcpy(Bptr, dBptr, mat.mp * sizeof(double), hipMemcpyDeviceToHost);
+  } else if(nq > 0) {
+    const double one  = 1.0;
+    const double zero = 0.0;
+    const double mone = -1.0;
+
+    int nb = Mmin(nq, nq_chunk);
+    rocblas_dgemv(handle,
+                  rocblas_operation_none,
+                  mat.mp,
+                  nb,
+                  &mone,
+                  Mptr(mat.dA, 0, 0, mat.ld),
+                  mat.ld,
+                  Mptr(mat.dX, 0, 0, 1),
+                  1,
+                  &zero,
+                  dBptr,
+                  1);
+
+    for(int nn = nb; nn < nq; nn += nq_chunk) {
+      int nb = Mmin(nq - nn, nq_chunk);
+      rocblas_dgemv(handle,
+                    rocblas_operation_none,
+                    mat.mp,
+                    nb,
+                    &mone,
+                    Mptr(mat.dA, 0, nn, mat.ld),
+                    mat.ld,
+                    Mptr(mat.dX, 0, nn, 1),
+                    1,
+                    &one,
+                    dBptr,
+                    1);
+    }
+
+    hipMemcpy(Bptr, dBptr, mat.mp * sizeof(double), hipMemcpyDeviceToHost);
+  } else {
+    for(ii = 0; ii < mat.mp; ii++) Bptr[ii] = HPL_rzero;
+  }
+  /*
+   * Reduce the distributed residual in process column 0
+   */
+  if(mat.mp > 0)
+    (void)HPL_reduce(Bptr, mat.mp, HPL_DOUBLE, HPL_SUM, 0, GRID->row_comm);
+
+  /*
+   * Compute || b - A x ||_oo
+   */
+  hipMemcpy(dBptr, Bptr, mat.mp * sizeof(double), hipMemcpyHostToDevice);
+  resid0 = HPL_pdlange(GRID, HPL_NORM_I, N, 1, NB, dBptr, mat.ld);
+  /*
+   * Computes and displays norms, residuals ...
+   */
+  if(N <= 0) {
+    resid1 = HPL_rzero;
+  } else {
+    resid1 = resid0 / (TEST->epsil * (AnormI * XnormI + BnormI) * (double)(N));
+  }
+
+  if(resid1 < TEST->thrsh)
+    (TEST->kpass)++;
+  else
+    (TEST->kfail)++;
+
+  if((myrow == 0) && (mycol == 0)) {
+    HPL_fprintf(TEST->outfp,
+                "%s%s\n",
+                "----------------------------------------",
+                "----------------------------------------");
+    HPL_fprintf(TEST->outfp,
+                "%s%16.7f%s%s\n",
+                "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ",
+                resid1,
+                " ...... ",
+                (resid1 < TEST->thrsh ? "PASSED" : "FAILED"));
+
+    if(resid1 >= TEST->thrsh) {
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||Ax-b||_oo . . . . . . . . . . . . . . . . . = ",
+                  resid0);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||A||_oo . . . . . . . . . . . . . . . . . . . = ",
+                  AnormI);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||A||_1 . . . . . . . . . . . . . . . . . . . 
= ",
+                  Anorm1);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||x||_oo . . . . . . . . . . . . . . . . . . . = ",
+                  XnormI);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||x||_1 . . . . . . . . . . . . . . . . . . . = ",
+                  Xnorm1);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||b||_oo . . . . . . . . . . . . . . . . . . . = ",
+                  BnormI);
+    }
+
+#ifdef HPL_PROGRESS_REPORT
+    if(resid1 < TEST->thrsh)
+      printf("Residual Check: PASSED \n");
+    else
+      printf("Residual Check: FAILED \n");
+#endif
+  }
+
+  if(Bptr) free(Bptr);
+  HPL_pdmatfree(&mat);
+}
diff --git a/src/auxil/HPL_abort.cpp b/src/auxil/HPL_abort.cpp
new file mode 100644
index 0000000..c83dc9e
--- /dev/null
+++ b/src/auxil/HPL_abort.cpp
@@ -0,0 +1,74 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_abort(int LINE, const char* SRNAME, const char* FORM, ...) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_abort displays an error message on stderr and halts execution.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * LINE    (local input)                 int
+   *         On entry, LINE specifies the line number in the file where
+   *         the error has occurred. When LINE is not a positive line
+   *         number, it is ignored.
+   *
+   * SRNAME  (local input)                 const char *
+   *         On entry, SRNAME should be the name of the routine calling
+   *         this error handler.
+   *
+   * FORM    (local input)                 const char *
+   *         On entry, FORM specifies the format, i.e., how the subsequent
+   *         arguments are converted for output.
+   *
+   *         (local input)                 ...
+   *         On entry, ... is the list of arguments to be printed within
+   *         the format string.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  va_list argptr;
+  char    cline[128];
+
+  va_start(argptr, FORM);
+  (void)vsprintf(cline, FORM, argptr);
+  va_end(argptr);
+  /*
+   * Display an error message
+   */
+  if(LINE <= 0)
+    HPL_fprintf(stderr,
+                "%s %s:\n>>> %s <<< Abort ...\n\n",
+                "HPL ERROR in function",
+                SRNAME,
+                cline);
+  else
+    HPL_fprintf(stderr,
+                "%s %d %s %s:\n>>> %s <<< Abort ...\n\n",
+                "HPL ERROR on line",
+                LINE,
+                "of function",
+                SRNAME,
+                cline);
+  exit(0);
+}
diff --git a/src/auxil/HPL_dlacpy.cpp b/src/auxil/HPL_dlacpy.cpp
new file mode 100644
index 0000000..7216618
--- /dev/null
+++ b/src/auxil/HPL_dlacpy.cpp
@@ -0,0 +1,68 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlacpy(const int M, + const int N, + const double* A, + const int LDA, + double* B, + const int LDB) { + /* + * Purpose + * ======= + * + * HPL_dlacpy copies an array A into an array B. + * + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of the arrays A and + * B. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of the arrays A + * and B. N must be at least zero. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,N). + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M). + * + * B (local output) double * + * On entry, B points to an array of dimension (LDB,N). On exit, + * B is overwritten with A. + * + * LDB (local input) const int + * On entry, LDB specifies the leading dimension of the array B. + * LDB must be at least MAX(1,M). + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + int j; + + if((M <= 0) || (N <= 0)) return; + + for(j = 0; j < N; j++, A += LDA, B += LDB) HPL_dcopy(M, A, 1, B, 1); +} diff --git a/src/auxil/HPL_dlamch.cpp b/src/auxil/HPL_dlamch.cpp new file mode 100644 index 0000000..9adc0f6 --- /dev/null +++ b/src/auxil/HPL_dlamch.cpp @@ -0,0 +1,763 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +/* + * --------------------------------------------------------------------- + * Static function prototypes + * --------------------------------------------------------------------- + */ +static void HPL_dlamc1(int*, int*, int*, int*); +static void HPL_dlamc2(int*, int*, int*, double*, int*, double*, int*, double*); +static double HPL_dlamc3(const double, const double); +static void HPL_dlamc4(int*, const double, const int); +static void HPL_dlamc5(const int, + const int, + const int, + const int, + int*, + double*); +static double HPL_dipow(const double, const int); + +double HPL_dlamch(const HPL_T_MACH CMACH) { + /* + * Purpose + * ======= + * + * HPL_dlamch determines machine-specific arithmetic constants such as + * the relative machine precision (eps), the safe minimum (sfmin) such + * that 1 / sfmin does not overflow, the base of the machine (base), the + * precision (prec), the number of (base) digits in the mantissa (t), + * whether rounding occurs in addition (rnd=1.0 and 0.0 otherwise), the + * minimum exponent before (gradual) underflow (emin), the underflow + * threshold (rmin) base**(emin-1), the largest exponent before overflow + * (emax), the overflow threshold (rmax) (base**emax)*(1-eps). 
+ * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamch.f (version 2.0 -- 1992), that was itself + * based on the function ENVRON by Malcolm and incorporated suggestions + * by Gentleman and Marovich. See + * + * Malcolm M. A., Algorithms to reveal properties of floating-point + * arithmetic., Comms. of the ACM, 15, 949-951 (1972). + * + * Gentleman W. M. and Marovich S. B., More on algorithms that reveal + * properties of floating point arithmetic units., Comms. of the ACM, + * 17, 276-277 (1974). + * + * Arguments + * ========= + * + * CMACH (local input) const HPL_T_MACH + * Specifies the value to be returned by HPL_dlamch + * = HPL_MACH_EPS, HPL_dlamch := eps (default) + * = HPL_MACH_SFMIN, HPL_dlamch := sfmin + * = HPL_MACH_BASE, HPL_dlamch := base + * = HPL_MACH_PREC, HPL_dlamch := eps*base + * = HPL_MACH_MLEN, HPL_dlamch := t + * = HPL_MACH_RND, HPL_dlamch := rnd + * = HPL_MACH_EMIN, HPL_dlamch := emin + * = HPL_MACH_RMIN, HPL_dlamch := rmin + * = HPL_MACH_EMAX, HPL_dlamch := emax + * = HPL_MACH_RMAX, HPL_dlamch := rmax + * + * where + * + * eps = relative machine precision, + * sfmin = safe minimum, + * base = base of the machine, + * prec = eps*base, + * t = number of digits in the mantissa, + * rnd = 1.0 if rounding occurs in addition, + * emin = minimum exponent before underflow, + * rmin = underflow threshold, + * emax = largest exponent before overflow, + * rmax = overflow threshold. + * + * --------------------------------------------------------------------- + */ + + static double eps, sfmin, base, t, rnd, emin, rmin, emax, rmax, prec; + double small; + static int first = 1; + int beta = 0, imax = 0, imin = 0, it = 0, lrnd = 0; + + if(first != 0) { + first = 0; + HPL_dlamc2(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); + base = (double)(beta); + t = (double)(it); + if(lrnd != 0) { + rnd = HPL_rone; + eps = HPL_dipow(base, 1 - it) / HPL_rtwo; + } else { + rnd = HPL_rzero; + eps = HPL_dipow(base, 1 - it); + } + prec = eps * base; + emin = (double)(imin); + emax = (double)(imax); + sfmin = rmin; + small = HPL_rone / rmax; + /* + * Use SMALL plus a bit, to avoid the possibility of rounding causing + * overflow when computing 1/sfmin. + */ + if(small >= sfmin) sfmin = small * (HPL_rone + eps); + } + + if(CMACH == HPL_MACH_EPS) return (eps); + if(CMACH == HPL_MACH_SFMIN) return (sfmin); + if(CMACH == HPL_MACH_BASE) return (base); + if(CMACH == HPL_MACH_PREC) return (prec); + if(CMACH == HPL_MACH_MLEN) return (t); + if(CMACH == HPL_MACH_RND) return (rnd); + if(CMACH == HPL_MACH_EMIN) return (emin); + if(CMACH == HPL_MACH_RMIN) return (rmin); + if(CMACH == HPL_MACH_EMAX) return (emax); + if(CMACH == HPL_MACH_RMAX) return (rmax); + + return (eps); +} + +static void HPL_dlamc1(int* BETA, int* T, int* RND, int* IEEE1) { + /* + * Purpose + * ======= + * + * HPL_dlamc1 determines the machine parameters given by BETA, T, RND, + * and IEEE1. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc1.f (version 2.0 -- 1992), that was itself + * based on the function ENVRON by Malcolm and incorporated suggestions + * by Gentleman and Marovich. See + * + * Malcolm M. A., Algorithms to reveal properties of floating-point + * arithmetic., Comms. of the ACM, 15, 949-951 (1972). + * + * Gentleman W. M. and Marovich S. B., More on algorithms that reveal + * properties of floating point arithmetic units., Comms. 
of the ACM, + * 17, 276-277 (1974). + * + * Arguments + * ========= + * + * BETA (local output) int * + * The base of the machine. + * + * T (local output) int * + * The number of ( BETA ) digits in the mantissa. + * + * RND (local output) int * + * Specifies whether proper rounding (RND=1) or chopping (RND=0) + * occurs in addition. This may not be a reliable guide to the + * way in which the machine performs its arithmetic. + * + * IEEE1 (local output) int * + * Specifies whether rounding appears to be done in the IEEE + * `round to nearest' style (IEEE1=1), (IEEE1=0) otherwise. + * + * --------------------------------------------------------------------- + */ + + double a, b, c, f, one, qtr, savec, t1, t2; + static int first = 1, lbeta, lieee1, lrnd, lt; + + if(first != 0) { + first = 0; + one = HPL_rone; + /* + * lbeta, lieee1, lt and lrnd are the local values of BETA, IEEE1, T and + * RND. Throughout this routine we use the function HPL_dlamc3 to ensure + * that relevant values are stored and not held in registers, or are not + * affected by optimizers. + * + * Compute a = 2.0**m with the smallest positive integer m such that + * fl( a + 1.0 ) == a. + */ + a = HPL_rone; + c = HPL_rone; + do { + a *= HPL_rtwo; + c = HPL_dlamc3(a, one); + c = HPL_dlamc3(c, -a); + } while(c == HPL_rone); + /* + * Now compute b = 2.0**m with the smallest positive integer m such that + * fl( a + b ) > a. + */ + b = HPL_rone; + c = HPL_dlamc3(a, b); + while(c == a) { + b *= HPL_rtwo; + c = HPL_dlamc3(a, b); + } + /* + * Now compute the base. a and c are neighbouring floating point num- + * bers in the interval ( BETA**T, BETA**( T + 1 ) ) and so their diffe- + * rence is BETA. Adding 0.25 to c is to ensure that it is truncated to + * BETA and not (BETA-1). + */ + qtr = one / 4.0; + savec = c; + c = HPL_dlamc3(c, -a); + lbeta = (int)(c + qtr); + /* + * Now determine whether rounding or chopping occurs, by adding a bit + * less than BETA/2 and a bit more than BETA/2 to a. + */ + b = (double)(lbeta); + f = HPL_dlamc3(b / HPL_rtwo, -b / 100.0); + c = HPL_dlamc3(f, a); + if(c == a) { + lrnd = 1; + } else { + lrnd = 0; + } + f = HPL_dlamc3(b / HPL_rtwo, b / 100.0); + c = HPL_dlamc3(f, a); + if((lrnd != 0) && (c == a)) lrnd = 0; + /* + * Try and decide whether rounding is done in the IEEE round to nea- + * rest style. b/2 is half a unit in the last place of the two numbers + * a and savec. Furthermore, a is even, i.e. has last bit zero, and sa- + * vec is odd. Thus adding b/2 to a should not change a, but adding b/2 + * to savec should change savec. + */ + t1 = HPL_dlamc3(b / HPL_rtwo, a); + t2 = HPL_dlamc3(b / HPL_rtwo, savec); + if((t1 == a) && (t2 > savec) && (lrnd != 0)) + lieee1 = 1; + else + lieee1 = 0; + /* + * Now find the mantissa, T. It should be the integer part of log to the + * base BETA of a, however it is safer to determine T by powering. So we + * find T as the smallest positive integer for which fl( beta**t + 1.0 ) + * is equal to 1.0. + */ + lt = 0; + a = HPL_rone; + c = HPL_rone; + + do { + lt++; + a *= (double)(lbeta); + c = HPL_dlamc3(a, one); + c = HPL_dlamc3(c, -a); + } while(c == HPL_rone); + } + + *BETA = lbeta; + *T = lt; + *RND = lrnd; + *IEEE1 = lieee1; +} + +static void HPL_dlamc2(int* BETA, + int* T, + int* RND, + double* EPS, + int* EMIN, + double* RMIN, + int* EMAX, + double* RMAX) { + /* + * Purpose + * ======= + * + * HPL_dlamc2 determines the machine parameters specified in its argu- + * ment list. 
+ *
+ * Notes
+ * =====
+ *
+ * This function has been manually translated from the Fortran 77 LAPACK
+ * auxiliary function dlamc2.f (version 2.0 -- 1992), that was itself
+ * based on a function PARANOIA by W. Kahan of the University of Cali-
+ * fornia at Berkeley for the computation of the relative machine epsi-
+ * lon eps.
+ *
+ * Arguments
+ * =========
+ *
+ * BETA    (local output)                int *
+ *         The base of the machine.
+ *
+ * T       (local output)                int *
+ *         The number of ( BETA ) digits in the mantissa.
+ *
+ * RND     (local output)                int *
+ *         Specifies whether proper rounding (RND=1) or chopping (RND=0)
+ *         occurs in addition. This may not be a reliable guide to the
+ *         way in which the machine performs its arithmetic.
+ *
+ * EPS     (local output)                double *
+ *         The smallest positive number such that fl( 1.0 - EPS ) < 1.0,
+ *         where fl denotes the computed value.
+ *
+ * EMIN    (local output)                int *
+ *         The minimum exponent before (gradual) underflow occurs.
+ *
+ * RMIN    (local output)                double *
+ *         The smallest normalized number for the machine, given by
+ *         BASE**( EMIN - 1 ), where BASE is the floating point value
+ *         of BETA.
+ *
+ * EMAX    (local output)                int *
+ *         The maximum exponent before overflow occurs.
+ *
+ * RMAX    (local output)                double *
+ *         The largest positive number for the machine, given by
+ *         BASE**EMAX * ( 1 - EPS ), where BASE is the floating point
+ *         value of BETA.
+ *
+ * ---------------------------------------------------------------------
+ */
+
+  static double leps, lrmax, lrmin;
+  double a, b, c, half, one, rbase, sixth, small, third, two, zero;
+  static int first = 1, iwarn = 0, lbeta = 0, lemax, lemin, lt = 0;
+  int gnmin = 0, gpmin = 0, i, ieee, lieee1 = 0, lrnd = 0, ngnmin = 0,
+      ngpmin = 0;
+
+  if(first != 0) {
+    first = 0;
+    zero  = HPL_rzero;
+    one   = HPL_rone;
+    two   = HPL_rtwo;
+    /*
+     * lbeta, lt, lrnd, leps, lemin and lrmin are the local values of BETA,
+     * T, RND, EPS, EMIN and RMIN.
+     *
+     * Throughout this routine we use the function HPL_dlamc3 to ensure that
+     * relevant values are stored and not held in registers, or are not af-
+     * fected by optimizers.
+     *
+     * HPL_dlamc1 returns the parameters lbeta, lt, lrnd and lieee1.
+     */
+    HPL_dlamc1(&lbeta, &lt, &lrnd, &lieee1);
+    /*
+     * Start to find eps.
+     */
+    b    = (double)(lbeta);
+    a    = HPL_dipow(b, -lt);
+    leps = a;
+    /*
+     * Try some tricks to see whether or not this is the correct EPS.
+     */
+    b     = two / 3.0;
+    half  = one / HPL_rtwo;
+    sixth = HPL_dlamc3(b, -half);
+    third = HPL_dlamc3(sixth, sixth);
+    b     = HPL_dlamc3(third, -half);
+    b     = HPL_dlamc3(b, sixth);
+    b     = Mabs(b);
+    if(b < leps) b = leps;
+
+    leps = HPL_rone;
+
+    while((leps > b) && (b > zero)) {
+      leps = b;
+      c    = HPL_dlamc3(half * leps, HPL_dipow(two, 5) * HPL_dipow(leps, 2));
+      c    = HPL_dlamc3(half, -c);
+      b    = HPL_dlamc3(half, c);
+      c    = HPL_dlamc3(half, -b);
+      b    = HPL_dlamc3(half, c);
+    }
+    if(a < leps) leps = a;
+    /*
+     * Computation of EPS complete.
+     *
+     * Now find EMIN. Let a = + or - 1, and + or - (1 + BASE**(-3)). Keep
+     * dividing a by BETA until (gradual) underflow occurs. This is detected
+     * when we cannot recover the previous a.
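+     * For reference (values not computed here): on IEEE-754 double
+     * precision hardware this procedure typically finds beta = 2, t = 53,
+     * rnd = 1.0, eps = 2^(-53), emin = -1021, rmin = 2^(-1022),
+     * emax = 1024 and rmax ~ 1.7977e+308.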
+ */ + rbase = one / (double)(lbeta); + small = one; + for(i = 0; i < 3; i++) small = HPL_dlamc3(small * rbase, zero); + a = HPL_dlamc3(one, small); + HPL_dlamc4(&ngpmin, one, lbeta); + HPL_dlamc4(&ngnmin, -one, lbeta); + HPL_dlamc4(&gpmin, a, lbeta); + HPL_dlamc4(&gnmin, -a, lbeta); + + ieee = 0; + + if((ngpmin == ngnmin) && (gpmin == gnmin)) { + if(ngpmin == gpmin) { + /* + * Non twos-complement machines, no gradual underflow; e.g., VAX ) + */ + lemin = ngpmin; + } else if((gpmin - ngpmin) == 3) { + /* + * Non twos-complement machines with gradual underflow; e.g., IEEE stan- + * dard followers + */ + lemin = ngpmin - 1 + lt; + ieee = 1; + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, gpmin); + iwarn = 1; + } + } else if((ngpmin == gpmin) && (ngnmin == gnmin)) { + if(Mabs(ngpmin - ngnmin) == 1) { + /* + * Twos-complement machines, no gradual underflow; e.g., CYBER 205 + */ + lemin = Mmax(ngpmin, ngnmin); + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, ngnmin); + iwarn = 1; + } + } else if((Mabs(ngpmin - ngnmin) == 1) && (gpmin == gnmin)) { + if((gpmin - Mmin(ngpmin, ngnmin)) == 3) { + /* + * Twos-complement machines with gradual underflow; no known machine + */ + lemin = Mmax(ngpmin, ngnmin) - 1 + lt; + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, ngnmin); + iwarn = 1; + } + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, ngnmin); + lemin = Mmin(lemin, gpmin); + lemin = Mmin(lemin, gnmin); + iwarn = 1; + } + /* + * Comment out this if block if EMIN is ok + */ + if(iwarn != 0) { + first = 1; + HPL_fprintf(stderr, + "\n %s %8d\n%s\n%s\n%s\n", + "WARNING. The value EMIN may be incorrect:- EMIN =", + lemin, + "If, after inspection, the value EMIN looks acceptable, " + "please comment ", + "out the if block as marked within the code of routine " + "HPL_dlamc2, ", + "otherwise supply EMIN explicitly."); + } + /* + * Assume IEEE arithmetic if we found denormalised numbers above, or if + * arithmetic seems to round in the IEEE style, determined in routine + * HPL_dlamc1. A true IEEE machine should have both things true; how- + * ever, faulty machines may have one or the other. + */ + if((ieee != 0) || (lieee1 != 0)) + ieee = 1; + else + ieee = 0; + /* + * Compute RMIN by successive division by BETA. We could compute RMIN + * as BASE**( EMIN - 1 ), but some machines underflow during this compu- + * tation. + */ + lrmin = HPL_rone; + for(i = 0; i < 1 - lemin; i++) lrmin = HPL_dlamc3(lrmin * rbase, zero); + /* + * Finally, call HPL_dlamc5 to compute emax and rmax. + */ + HPL_dlamc5(lbeta, lt, lemin, ieee, &lemax, &lrmax); + } + *BETA = lbeta; + *T = lt; + *RND = lrnd; + *EPS = leps; + *EMIN = lemin; + *RMIN = lrmin; + *EMAX = lemax; + *RMAX = lrmax; +} + +static double HPL_dlamc3(const double A, const double B) { + /* + * Purpose + * ======= + * + * HPL_dlamc3 is intended to force a and b to be stored prior to doing + * the addition of a and b, for use in situations where optimizers + * might hold one of these in a register. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc3.f (version 2.0 -- 1992). + * + * Arguments + * ========= + * + * A, B (local input) double + * The values a and b. 
+ * + * --------------------------------------------------------------------- + */ + + return (A + B); +} + +static void HPL_dlamc4(int* EMIN, const double START, const int BASE) { + /* + * Purpose + * ======= + * + * HPL_dlamc4 is a service function for HPL_dlamc2. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc4.f (version 2.0 -- 1992). + * + * Arguments + * ========= + * + * EMIN (local output) int * + * The minimum exponent before (gradual) underflow, computed by + * setting A = START and dividing by BASE until the previous A + * can not be recovered. + * + * START (local input) double + * The starting point for determining EMIN. + * + * BASE (local input) int + * The base of the machine. + * + * --------------------------------------------------------------------- + */ + + double a, b1, b2, c1, c2, d1, d2, one, rbase, zero; + int i; + + a = START; + one = HPL_rone; + rbase = one / (double)(BASE); + zero = HPL_rzero; + *EMIN = 1; + b1 = HPL_dlamc3(a * rbase, zero); + c1 = c2 = d1 = d2 = a; + + do { + (*EMIN)--; + a = b1; + b1 = HPL_dlamc3(a / BASE, zero); + c1 = HPL_dlamc3(b1 * BASE, zero); + d1 = zero; + for(i = 0; i < BASE; i++) d1 = d1 + b1; + b2 = HPL_dlamc3(a * rbase, zero); + c2 = HPL_dlamc3(b2 / rbase, zero); + d2 = zero; + for(i = 0; i < BASE; i++) d2 = d2 + b2; + } while((c1 == a) && (c2 == a) && (d1 == a) && (d2 == a)); +} + +static void HPL_dlamc5(const int BETA, + const int P, + const int EMIN, + const int IEEE, + int* EMAX, + double* RMAX) { + /* + * Purpose + * ======= + * + * HPL_dlamc5 attempts to compute RMAX, the largest machine floating- + * point number, without overflow. It assumes that EMAX + abs(EMIN) sum + * approximately to a power of 2. It will fail on machines where this + * assumption does not hold, for example, the Cyber 205 (EMIN = -28625, + * EMAX = 28718). It will also fail if the value supplied for EMIN is + * too large (i.e. too close to zero), probably with overflow. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc5.f (version 2.0 -- 1992). + * + * Arguments + * ========= + * + * BETA (local input) int + * The base of floating-point arithmetic. + * + * P (local input) int + * The number of base BETA digits in the mantissa of a floating- + * point value. + * + * EMIN (local input) int + * The minimum exponent before (gradual) underflow. + * + * IEEE (local input) int + * A logical flag specifying whether or not the arithmetic sys- + * tem is thought to comply with the IEEE standard. + * + * EMAX (local output) int * + * The largest exponent before overflow. + * + * RMAX (local output) double * + * The largest machine floating-point number. + * + * --------------------------------------------------------------------- + */ + + double oldy = HPL_rzero, recbas, y, z; + int exbits = 1, expsum, i, lexp = 1, nbits, ttry, uexp; +/* .. + * .. Executable Statements .. + */ +/* + * First compute lexp and uexp, two powers of 2 that bound abs(EMIN). + * We then assume that EMAX + abs( EMIN ) will sum approximately to the + * bound that is closest to abs( EMIN ). (EMAX is the exponent of the + * required number RMAX). 
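+ * Worked example (not computed here): for IEEE double precision inputs
+ * BETA = 2, P = 53, EMIN = -1021 and IEEE = 1, the loop below stops with
+ * lexp = 512 and then sets uexp = 1024 with exbits = 11, so expsum = 2048
+ * and EMAX is first set to 2048 - 1021 - 1 = 1026; the two decrements
+ * further down (implicit mantissa bit, reserved IEEE exponent) then
+ * leave EMAX = 1024.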
+ */ +l_10: + ttry = (int)((unsigned int)(lexp) << 1); + if(ttry <= (-EMIN)) { + lexp = ttry; + exbits++; + goto l_10; + } + + if(lexp == -EMIN) { + uexp = lexp; + } else { + uexp = ttry; + exbits++; + } + /* + * Now -lexp is less than or equal to EMIN, and -uexp is greater than or + * equal to EMIN. exbits is the number of bits needed to store the expo- + * nent. + */ + if((uexp + EMIN) > (-lexp - EMIN)) { + expsum = (int)((unsigned int)(lexp) << 1); + } else { + expsum = (int)((unsigned int)(uexp) << 1); + } + /* + * expsum is the exponent range, approximately equal to EMAX - EMIN + 1. + */ + *EMAX = expsum + EMIN - 1; + /* + * nbits is the total number of bits needed to store a floating-point + * number. + */ + nbits = 1 + exbits + P; + + if((nbits % 2 == 1) && (BETA == 2)) { + /* + * Either there are an odd number of bits used to store a floating-point + * number, which is unlikely, or some bits are not used in the represen- + * tation of numbers, which is possible, (e.g. Cray machines) or the + * mantissa has an implicit bit, (e.g. IEEE machines, Dec Vax machines), + * which is perhaps the most likely. We have to assume the last alterna- + * tive. If this is true, then we need to reduce EMAX by one because + * there must be some way of representing zero in an implicit-bit sys- + * tem. On machines like Cray we are reducing EMAX by one unnecessarily. + */ + (*EMAX)--; + } + + if(IEEE != 0) { + /* + * Assume we are on an IEEE machine which reserves one exponent for in- + * finity and NaN. + */ + (*EMAX)--; + } + /* + * Now create RMAX, the largest machine number, which should be equal to + * (1.0 - BETA**(-P)) * BETA**EMAX . First compute 1.0-BETA**(-P), being + * careful that the result is less than 1.0. + */ + recbas = HPL_rone / (double)(BETA); + z = (double)(BETA)-HPL_rone; + y = HPL_rzero; + + for(i = 0; i < P; i++) { + z *= recbas; + if(y < HPL_rone) oldy = y; + y = HPL_dlamc3(y, z); + } + + if(y >= HPL_rone) y = oldy; + /* + * Now multiply by BETA**EMAX to get RMAX. + */ + for(i = 0; i < *EMAX; i++) y = HPL_dlamc3(y * BETA, HPL_rzero); + + *RMAX = y; +} + +static double HPL_dipow(const double X, const int N) { + /* + * Purpose + * ======= + * + * HPL_dipow computes the integer n-th power of a real scalar x. + * + * Arguments + * ========= + * + * X (local input) const double + * The real scalar x. + * + * N (local input) const int + * The integer power to raise x to. + * + * --------------------------------------------------------------------- + */ + + double r, y = HPL_rone; + int k, n; + + if(X == HPL_rzero) return (HPL_rzero); + if(N < 0) { + n = -N; + r = HPL_rone / X; + } else { + n = N; + r = X; + } + for(k = 0; k < n; k++) y *= r; + + return (y); +} diff --git a/src/auxil/HPL_dlange.cpp b/src/auxil/HPL_dlange.cpp new file mode 100644 index 0000000..9f72ed4 --- /dev/null +++ b/src/auxil/HPL_dlange.cpp @@ -0,0 +1,132 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +double HPL_dlange(const HPL_T_NORM NORM, + const int M, + const int N, + const double* A, + const int LDA) { + /* + * Purpose + * ======= + * + * HPL_dlange returns the value of the one norm, or the infinity norm, + * or the element of largest absolute value of a matrix A: + * + * max(abs(A(i,j))) when NORM = HPL_NORM_A, + * norm1(A), when NORM = HPL_NORM_1, + * normI(A), when NORM = HPL_NORM_I, + * + * where norm1 denotes the one norm of a matrix (maximum column sum) and + * normI denotes the infinity norm of a matrix (maximum row sum). Note + * that max(abs(A(i,j))) is not a matrix norm. + * + * Arguments + * ========= + * + * NORM (local input) const HPL_T_NORM + * On entry, NORM specifies the value to be returned by this + * function as described above. + * + * M (local input) const int + * On entry, M specifies the number of rows of the matrix A. + * M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of the matrix A. + * N must be at least zero. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,N), that + * contains the matrix A. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,M). + * + * --------------------------------------------------------------------- + */ + + double s, v0 = HPL_rzero, *work = NULL; + int i, j; + + if((M <= 0) || (N <= 0)) return (HPL_rzero); + + if(NORM == HPL_NORM_A) { + /* + * max( abs( A ) ) + */ + for(j = 0; j < N; j++) { + for(i = 0; i < M; i++) { + v0 = Mmax(v0, Mabs(*A)); + A++; + } + A += LDA - M; + } + } else if(NORM == HPL_NORM_1) { + /* + * Find norm_1( A ). + */ + work = (double*)malloc((size_t)(N) * sizeof(double)); + if(work == NULL) { + HPL_abort(__LINE__, "HPL_dlange", "Memory allocation failed"); + } else { + for(j = 0; j < N; j++) { + s = HPL_rzero; + for(i = 0; i < M; i++) { + s += Mabs(*A); + A++; + } + work[j] = s; + A += LDA - M; + } + /* + * Find maximum sum of columns for 1-norm + */ + v0 = work[HPL_idamax(N, work, 1)]; + v0 = Mabs(v0); + if(work) free(work); + } + } else if(NORM == HPL_NORM_I) { + /* + * Find norm_inf( A ) + */ + work = (double*)malloc((size_t)(M) * sizeof(double)); + if(work == NULL) { + HPL_abort(__LINE__, "HPL_dlange", "Memory allocation failed"); + } else { + for(i = 0; i < M; i++) { work[i] = HPL_rzero; } + + for(j = 0; j < N; j++) { + for(i = 0; i < M; i++) { + work[i] += Mabs(*A); + A++; + } + A += LDA - M; + } + /* + * Find maximum sum of rows for inf-norm + */ + v0 = work[HPL_idamax(M, work, 1)]; + v0 = Mabs(v0); + if(work) free(work); + } + } + + return (v0); +} diff --git a/src/auxil/HPL_dlaprnt.cpp b/src/auxil/HPL_dlaprnt.cpp new file mode 100644 index 0000000..ae8d7f5 --- /dev/null +++ b/src/auxil/HPL_dlaprnt.cpp @@ -0,0 +1,76 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlaprnt(const int M, + const int N, + double* A, + const int IA, + const int JA, + const int LDA, + const char* CMATNM) { + /* + * Purpose + * ======= + * + * HPL_dlaprnt prints to standard error an M-by-N matrix A. + * + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A. M must be at + * least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of A. N must be + * at least zero. + * + * A (local input) double * + * On entry, A points to an array of dimension (LDA,N). + * + * IA (local input) const int + * On entry, IA specifies the starting row index to be printed. + * + * JA (local input) const int + * On entry, JA specifies the starting column index to be + * printed. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,M). + * + * CMATNM (local input) const char * + * On entry, CMATNM is the name of the matrix to be printed. + * + * --------------------------------------------------------------------- + */ + + int i, j; + + for(j = 0; j < N; j++) { + for(i = 0; i < M; i++) { + HPL_fprintf(stderr, + "%s(%6d,%6d)=%30.18f\n", + CMATNM, + IA + i, + JA + j, + *(Mptr(A, i, j, LDA))); + } + } +} diff --git a/src/auxil/HPL_dlatcpy.cpp b/src/auxil/HPL_dlatcpy.cpp new file mode 100644 index 0000000..da62278 --- /dev/null +++ b/src/auxil/HPL_dlatcpy.cpp @@ -0,0 +1,68 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlatcpy(const int M, + const int N, + const double* A, + const int LDA, + double* B, + const int LDB) { + /* + * Purpose + * ======= + * + * HPL_dlatcpy copies the transpose of an array A into an array B. + * + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of the array B and + * the number of columns of A. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of rows of the array A and + * the number of columns of B. N must be at least zero. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,M). + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,N). + * + * B (local output) double * + * On entry, B points to an array of dimension (LDB,N). On exit, + * B is overwritten with the transpose of A. + * + * LDB (local input) const int + * On entry, LDB specifies the leading dimension of the array B. + * LDB must be at least MAX(1,M). + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. 
+   */
+  int j;
+
+  if((M <= 0) || (N <= 0)) return;
+
+  for(j = 0; j < N; j++, B += LDB) HPL_dcopy(M, A + j, LDA, B, 1);
+}
diff --git a/src/auxil/HPL_dlatcpy_device.cpp b/src/auxil/HPL_dlatcpy_device.cpp
new file mode 100644
index 0000000..c41caab
--- /dev/null
+++ b/src/auxil/HPL_dlatcpy_device.cpp
@@ -0,0 +1,113 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define TILE_DIM 64
+#define BLOCK_ROWS 16
+
+__global__ void dlatcpy_gpu(const int M,
+                            const int N,
+                            const double* __restrict__ A,
+                            const int LDA,
+                            double* __restrict__ B,
+                            const int LDB) {
+
+  __shared__ double s_tile[TILE_DIM][TILE_DIM + 1];
+
+  int I = blockIdx.x * TILE_DIM + threadIdx.y;
+  int J = blockIdx.y * TILE_DIM + threadIdx.x;
+
+  if(J < N) {
+    if(I + 0 < M)
+      s_tile[threadIdx.y + 0][threadIdx.x] = A[((size_t)I + 0) * LDA + J];
+    if(I + 16 < M)
+      s_tile[threadIdx.y + 16][threadIdx.x] = A[((size_t)I + 16) * LDA + J];
+    if(I + 32 < M)
+      s_tile[threadIdx.y + 32][threadIdx.x] = A[((size_t)I + 32) * LDA + J];
+    if(I + 48 < M)
+      s_tile[threadIdx.y + 48][threadIdx.x] = A[((size_t)I + 48) * LDA + J];
+  }
+
+  I = blockIdx.x * TILE_DIM + threadIdx.x;
+  J = blockIdx.y * TILE_DIM + threadIdx.y;
+
+  __syncthreads();
+
+  if(I < M) {
+    if(J + 0 < N)
+      B[I + ((size_t)J + 0) * LDB] = s_tile[threadIdx.x][threadIdx.y + 0];
+    if(J + 16 < N)
+      B[I + ((size_t)J + 16) * LDB] = s_tile[threadIdx.x][threadIdx.y + 16];
+    if(J + 32 < N)
+      B[I + ((size_t)J + 32) * LDB] = s_tile[threadIdx.x][threadIdx.y + 32];
+    if(J + 48 < N)
+      B[I + ((size_t)J + 48) * LDB] = s_tile[threadIdx.x][threadIdx.y + 48];
+  }
+}
+
+void HPL_dlatcpy_gpu(const int M,
+                     const int N,
+                     const double* A,
+                     const int LDA,
+                     double* B,
+                     const int LDB) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_dlatcpy_gpu copies the transpose of an array A into an array B.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * M       (local input)                 const int
+   *         On entry, M specifies the number of rows of the array B and
+   *         the number of columns of A. M must be at least zero.
+   *
+   * N       (local input)                 const int
+   *         On entry, N specifies the number of rows of the array A and
+   *         the number of columns of B. N must be at least zero.
+   *
+   * A       (local input)                 const double *
+   *         On entry, A points to an array of dimension (LDA,M).
+   *
+   * LDA     (local input)                 const int
+   *         On entry, LDA specifies the leading dimension of the array A.
+   *         LDA must be at least MAX(1,N).
+   *
+   * B       (local output)                double *
+   *         On entry, B points to an array of dimension (LDB,N). On exit,
+   *         B is overwritten with the transpose of A.
+   *
+   * LDB     (local input)                 const int
+   *         On entry, LDB specifies the leading dimension of the array B.
+   *         LDB must be at least MAX(1,M).
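+   *
+   * Implementation note: each thread block stages one TILE_DIM x TILE_DIM
+   * (64 x 64) tile of A through shared memory using a 64 x 16 thread
+   * block, four rows per thread; s_tile is padded to TILE_DIM + 1 columns
+   * so the transposed reads do not generate shared memory bank conflicts.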
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  if((M <= 0) || (N <= 0)) return;
+
+  hipStream_t stream;
+  rocblas_get_stream(handle, &stream);
+
+  dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
+  dim3 block_size(TILE_DIM, BLOCK_ROWS);
+  dlatcpy_gpu<<<grid_size, block_size, 0, stream>>>(M, N, A, LDA, B, LDB);
+}
diff --git a/src/auxil/HPL_fprintf.cpp b/src/auxil/HPL_fprintf.cpp
new file mode 100644
index 0000000..d0ee2f0
--- /dev/null
+++ b/src/auxil/HPL_fprintf.cpp
@@ -0,0 +1,53 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_fprintf(FILE* STREAM, const char* FORM, ...) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_fprintf is a wrapper around fprintf flushing the output stream.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * STREAM  (local input)                 FILE *
+   *         On entry, STREAM specifies the output stream.
+   *
+   * FORM    (local input)                 const char *
+   *         On entry, FORM specifies the format, i.e., how the subsequent
+   *         arguments are converted for output.
+   *
+   *         (local input)                 ...
+   *         On entry, ... is the list of arguments to be printed within
+   *         the format string.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  va_list argptr;
+  char    cline[256];
+
+  va_start(argptr, FORM);
+  (void)vsprintf(cline, FORM, argptr);
+  va_end(argptr);
+
+  (void)fprintf(STREAM, "%s", cline);
+  (void)fflush(STREAM);
+}
diff --git a/src/auxil/HPL_warn.cpp b/src/auxil/HPL_warn.cpp
new file mode 100644
index 0000000..d6e66ab
--- /dev/null
+++ b/src/auxil/HPL_warn.cpp
@@ -0,0 +1,80 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_warn(FILE* STREAM,
+              int LINE,
+              const char* SRNAME,
+              const char* FORM,
+              ...) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_warn displays an error message.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * STREAM  (local input)                 FILE *
+   *         On entry, STREAM specifies the output stream.
+   *
+   * LINE    (local input)                 int
+   *         On entry, LINE specifies the line number in the file where
+   *         the error has occurred. When LINE is not a positive line
+   *         number, it is ignored.
+   *
+   * SRNAME  (local input)                 const char *
+   *         On entry, SRNAME should be the name of the routine calling
+   *         this error handler.
+   *
+   * FORM    (local input)                 const char *
+   *         On entry, FORM specifies the format, i.e., how the subsequent
+   *         arguments are converted for output.
+   *
+   *         (local input)                 ...
+   *         On entry, ... is the list of arguments to be printed within
+   *         the format string.
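+   *
+   * A typical (hypothetical) call would be
+   *   HPL_warn(stderr, __LINE__, "HPL_foo", "invalid value %d", n);
+   * which prints the message and returns; unlike HPL_abort, HPL_warn
+   * does not terminate the program.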
+ * + * --------------------------------------------------------------------- + */ + + va_list argptr; + char cline[128]; + + va_start(argptr, FORM); + (void)vsprintf(cline, FORM, argptr); + va_end(argptr); + /* + * Display an error message + */ + if(LINE <= 0) + HPL_fprintf(STREAM, + "%s %s:\n>>> %s <<<\n\n", + "HPL ERROR in function", + SRNAME, + cline); + else + HPL_fprintf(STREAM, + "%s %d %s %s:\n>>> %s <<<\n\n", + "HPL ERROR on line", + LINE, + "of function", + SRNAME, + cline); +} diff --git a/src/blas/HPL_daxpy.cpp b/src/blas/HPL_daxpy.cpp new file mode 100644 index 0000000..b1bc697 --- /dev/null +++ b/src/blas/HPL_daxpy.cpp @@ -0,0 +1,43 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_daxpy_omp(const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB - II, N); + HPL_daxpy(nn, ALPHA, X, INCX, Y, INCY); + } + ++tile; + int i = NB - II; + for(; i < N; i += NB) { + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB, N - i); + HPL_daxpy(nn, ALPHA, X + i * INCX, INCX, Y + i * INCY, INCY); + } + ++tile; + } +} diff --git a/src/blas/HPL_dgemm.cpp b/src/blas/HPL_dgemm.cpp new file mode 100644 index 0000000..1bd824f --- /dev/null +++ b/src/blas/HPL_dgemm.cpp @@ -0,0 +1,65 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dgemm_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANSA, + const enum HPL_TRANS TRANSB, + const int M, + const int N, + const int K, + const double ALPHA, + const double* A, + const int LDA, + const double* B, + const int LDB, + const double BETA, + double* C, + const int LDC, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB - II, M); + HPL_dgemm( + ORDER, TRANSA, TRANSB, mm, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC); + } + ++tile; + int i = NB - II; + for(; i < M; i += NB) { + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB, M - i); + HPL_dgemm(ORDER, + TRANSA, + TRANSB, + mm, + N, + K, + ALPHA, + A + i, + LDA, + B, + LDB, + BETA, + C + i, + LDC); + } + ++tile; + } +} diff --git a/src/blas/HPL_dgemv.cpp b/src/blas/HPL_dgemv.cpp new file mode 100644 index 0000000..19631cb --- /dev/null +++ b/src/blas/HPL_dgemv.cpp @@ -0,0 +1,60 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dgemv_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANS, + const int M, + const int N, + const double ALPHA, + const double* A, + const int LDA, + const double* X, + const int INCX, + const double BETA, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB - II, M); + HPL_dgemv(ORDER, TRANS, mm, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); + } + ++tile; + int i = NB - II; + for(; i < M; i += NB) { + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB, M - i); + HPL_dgemv(ORDER, + TRANS, + mm, + N, + ALPHA, + A + i, + LDA, + X, + INCX, + BETA, + Y + i * INCY, + INCY); + } + ++tile; + } +} diff --git a/src/blas/HPL_dger.cpp b/src/blas/HPL_dger.cpp new file mode 100644 index 0000000..23b2d69 --- /dev/null +++ b/src/blas/HPL_dger.cpp @@ -0,0 +1,47 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dger_omp(const enum HPL_ORDER ORDER, + const int M, + const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + double* A, + const int LDA, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB - II, M); + HPL_dger(ORDER, mm, N, ALPHA, X, INCX, Y, INCY, A, LDA); + } + ++tile; + int i = NB - II; + for(; i < M; i += NB) { + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB, M - i); + HPL_dger(ORDER, mm, N, ALPHA, X + i * INCX, INCX, Y, INCY, A + i, LDA); + } + ++tile; + } +} diff --git a/src/blas/HPL_dscal.cpp b/src/blas/HPL_dscal.cpp new file mode 100644 index 0000000..fa26fcb --- /dev/null +++ b/src/blas/HPL_dscal.cpp @@ -0,0 +1,41 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dscal_omp(const int N, + const double ALPHA, + double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB - II, N); + HPL_dscal(nn, ALPHA, X, INCX); + } + ++tile; + int i = NB - II; + for(; i < N; i += NB) { + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB, N - i); + HPL_dscal(nn, ALPHA, X + i * INCX, INCX); + } + ++tile; + } +} diff --git a/src/blas/HPL_idamax.cpp b/src/blas/HPL_idamax.cpp new file mode 100644 index 0000000..b9eeb63 --- /dev/null +++ b/src/blas/HPL_idamax.cpp @@ -0,0 +1,64 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_idamax_omp(const int N, + const double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size, + int* max_index, + double* max_value) { + + max_index[thread_rank] = 0; + max_value[thread_rank] = 0.0; + + if(N < 1) return; + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB - II, N); + max_index[thread_rank] = HPL_idamax(nn, X, INCX); + max_value[thread_rank] = X[max_index[thread_rank] * INCX]; + } + ++tile; + int i = NB - II; + for(; i < N; i += NB) { + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB, N - i); + const int idm = HPL_idamax(nn, X + i * INCX, INCX); + if(abs(X[(idm + i) * INCX]) > abs(max_value[thread_rank])) { + max_value[thread_rank] = X[(idm + i) * INCX]; + max_index[thread_rank] = idm + i; + } + } + ++tile; + } + +#pragma omp barrier + + // finish reduction + if(thread_rank == 0) { + for(int rank = 1; rank < thread_size; ++rank) { + if(abs(max_value[rank]) > abs(max_value[0])) { + max_value[0] = max_value[rank]; + max_index[0] = max_index[rank]; + } + } + } +} diff --git a/src/comm/HPL_all_reduce.cpp b/src/comm/HPL_all_reduce.cpp new file mode 100644 index 0000000..10ed5cb --- /dev/null +++ b/src/comm/HPL_all_reduce.cpp @@ -0,0 +1,59 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_all_reduce(void* BUFFER, + const int COUNT, + const HPL_T_TYPE DTYPE, + const HPL_T_OP OP, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_all_reduce performs a global reduce operation across all + * processes of a group leaving the results on all processes. + * + * Arguments + * ========= + * + * BUFFER (local input/global output) void * + * On entry, BUFFER points to the buffer to be combined. On + * exit, this array contains the combined data and is identical + * on all processes in the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be at least zero. + * + * DTYPE (global input) const HPL_T_TYPE + * On entry, DTYPE specifies the type of the buffers operands. + * + * OP (global input) const HPL_T_OP + * On entry, OP is a pointer to the local combine function. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr = MPI_Allreduce( + MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, COMM); + + return ((ierr == MPI_SUCCESS ? 
HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_all_reduce_dmxswp.cpp b/src/comm/HPL_all_reduce_dmxswp.cpp new file mode 100644 index 0000000..8c473b4 --- /dev/null +++ b/src/comm/HPL_all_reduce_dmxswp.cpp @@ -0,0 +1,298 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <cassert> + +/* MPI_Op_create is called in main to bind HPL_dmxswp to this MPI_Op */ +MPI_Op HPL_DMXSWP; +MPI_Datatype PDFACT_ROW; + +/* Swap-broadcast comparison function usable in MPI_Allreduce */ +void HPL_dmxswp(void* invec, void* inoutvec, int* len, MPI_Datatype* datatype) { + + assert(*datatype == PDFACT_ROW); + assert(*len == 1); + + int N; + MPI_Type_size(PDFACT_ROW, &N); + + double* Wwork = static_cast<double*>(invec); + double* WORK = static_cast<double*>(inoutvec); + + const int jb = (N / sizeof(double)) - 4; + + // check max column value and overwrite row if new max is found + const double gmax = Mabs(WORK[0]); + const double tmp1 = Mabs(Wwork[0]); + if((tmp1 > gmax) || ((tmp1 == gmax) && (Wwork[3] < WORK[3]))) { + HPL_dcopy(jb + 4, Wwork, 1, WORK, 1); + } +} + +void HPL_all_reduce_dmxswp(double* BUFFER, + const int COUNT, + const int ROOT, + MPI_Comm COMM, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_all_reduce_dmxswp is a specialized all_reduce that performs + * the swap-broadcast of rows. + * + * Arguments + * ========= + * + * BUFFER (local input/global output) double * + * On entry, BUFFER points to the buffer to be combined. On + * exit, this array contains the combined data and is identical + * on all processes in the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be 4+2*JB, where JB is the length of the rows being + * swapped. + * + * ROOT (local input) int + * On entry, ROOT specifies the rank of the process owning the + * row to be swapped. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least COUNT.
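+ * (Layout note: the first four entries of BUFFER form a header; + * BUFFER[0] holds the candidate pivot value, compared by absolute + * value with ties broken in favor of the smaller process row, and + * BUFFER[3] holds the process row owning it. The JB entries that + * follow contain the candidate pivot row, and the last JB entries a + * copy of the current row of the matrix.)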
+ + * --------------------------------------------------------------------- + */ + + roctxRangePush("HPL_all_reduce_dmxswp"); + +#ifdef HPL_USE_COLLECTIVES + + const int myrow = static_cast(BUFFER[3]); + const int jb = (COUNT - 4) / 2; + + /* Use a normal all_reduce */ + (void)MPI_Allreduce(MPI_IN_PLACE, BUFFER, 1, PDFACT_ROW, HPL_DMXSWP, COMM); + + /*Location of max row*/ + const int maxrow = static_cast(BUFFER[3]); + + if(myrow == ROOT) { /*Root send top row to maxrow*/ + if(maxrow != ROOT) { + double* Wwork = BUFFER + 4 + jb; + HPL_send(Wwork, jb, maxrow, MSGID_BEGIN_PFACT, COMM); + } + } else if(myrow == maxrow) { /*Recv top row from ROOT*/ + double* Wwork = BUFFER + 4 + jb; + HPL_recv(Wwork, jb, ROOT, MSGID_BEGIN_PFACT, COMM); + } + +#else + + double gmax, tmp1; + double * A0, *Wmx; + unsigned int hdim, ip2, ip2_, ipow, k, mask; + int Np2, cnt_, cnt0, i, icurrow, mydist, mydis_, myrow, n0, nprow, partner, + rcnt, root, scnt, size_; + + MPI_Comm_rank(COMM, &myrow); + MPI_Comm_size(COMM, &nprow); + + /* + * ip2 : largest power of two <= nprow; + * hdim : ip2 procs hypercube dim; + */ + hdim = 0; + ip2 = 1; + k = nprow; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + hdim++; + } + + n0 = (COUNT - 4) / 2; + icurrow = ROOT; + Np2 = (int)((size_ = nprow - ip2) != 0); + mydist = MModSub(myrow, icurrow, nprow); + + /* + * Set up pointers in workspace: WORK and Wwork point to the beginning + * of the buffers of size 4 + 2*N0 to be combined. Wmx points to the row + * owning the local (before combine) and global (after combine) absolute + * value max. A0 points to the copy of the current row of the matrix. + */ + + cnt0 = (cnt_ = n0 + 4) + n0; + A0 = (Wmx = BUFFER + 4) + n0; + + /* + * Combine the results (bi-directional exchange): the process coordina- + * tes are relative to icurrow, this allows to reduce the communication + * volume when nprow is not a power of 2. + * + * When nprow is not a power of 2: proc[i-ip2] receives local data from + * proc[i] for all i in [ip2..nprow). In addition, proc[0] (icurrow) + * sends to proc[ip2] the current row of A for later broadcast in procs + * [ip2..nprow). + */ + if((Np2 != 0) && ((partner = (int)((unsigned int)(mydist) ^ ip2)) < nprow)) { + if((mydist & ip2) != 0) { + if(mydist == (int)(ip2)) + (void)HPL_sdrv(BUFFER, + cnt_, + MSGID_BEGIN_PFACT, + A0, + n0, + MSGID_BEGIN_PFACT, + MModAdd(partner, icurrow, nprow), + COMM); + else + (void)HPL_send(BUFFER, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + } else { + if(mydist == 0) + (void)HPL_sdrv(A0, + n0, + MSGID_BEGIN_PFACT, + WORK, + cnt_, + MSGID_BEGIN_PFACT, + MModAdd(partner, icurrow, nprow), + COMM); + else + (void)HPL_recv(WORK, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + + tmp1 = Mabs(WORK[0]); + gmax = Mabs(BUFFER[0]); + if((tmp1 > gmax) || ((tmp1 == gmax) && (WORK[3] < BUFFER[3]))) { + HPL_dcopy(cnt_, WORK, 1, BUFFER, 1); + } + } + } + + if(mydist < (int)(ip2)) { + /* + * power of 2 part of the processes collection: processes [0..ip2) are + * combining (binary exchange); proc[0] has two rows to send, but one to + * receive. At every step k in [0..hdim) of the algorithm, a process + * pair exchanging 2 rows is such that myrow >> k+1 is 0. Among those + * processes the ones that are sending one more row than what they are + * receiving are such that myrow >> k is equal to 0. 
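+ * + * For example, with nprow = 6: ip2 = 4 and hdim = 2. The processes at + * distance 4 and 5 from icurrow first fold their candidate rows into + * the processes at distance 0 and 1 (the process at distance 4 also + * receiving the copy of the current row), the remaining four + * processes combine in two binary-exchange steps, the process at + * distance 4 re-broadcasts the current row to the one at distance 5, + * and the processes at distance 0 and 1 finally return the winning + * row to those at distance 4 and 5.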
+ */ + k = 0; + ipow = 1; + + while(k < hdim) { + if(((unsigned int)(mydist) >> (k + 1)) == 0) { + if(((unsigned int)(mydist) >> k) == 0) { + scnt = cnt0; + rcnt = cnt_; + } else { + scnt = cnt_; + rcnt = cnt0; + } + } else { + scnt = rcnt = cnt_; + } + + partner = (int)((unsigned int)(mydist) ^ ipow); + (void)HPL_sdrv(BUFFER, + scnt, + MSGID_BEGIN_PFACT, + WORK, + rcnt, + MSGID_BEGIN_PFACT, + MModAdd(partner, icurrow, nprow), + COMM); + + tmp1 = Mabs(WORK[0]); + gmax = Mabs(BUFFER[0]); + if((tmp1 > gmax) || ((tmp1 == gmax) && (WORK[3] < BUFFER[3]))) { + HPL_dcopy((rcnt == cnt0 ? cnt0 : cnt_), WORK, 1, BUFFER, 1); + } else if(rcnt == cnt0) { + HPL_dcopy(n0, WORK + cnt_, 1, A0, 1); + } + + ipow <<= 1; + k++; + } + } else if(size_ > 1) { + /* + * proc[ip2] broadcast current row of A to procs [ip2+1..nprow). + */ + k = (unsigned int)(size_)-1; + ip2_ = mask = 1; + while(k > 1) { + k >>= 1; + ip2_ <<= 1; + mask <<= 1; + mask++; + } + + root = MModAdd(icurrow, (int)(ip2), nprow); + mydis_ = MModSub(myrow, root, nprow); + + do { + mask ^= ip2_; + if((mydis_ & mask) == 0) { + partner = (int)(mydis_ ^ ip2_); + if((mydis_ & ip2_) != 0) { + (void)HPL_recv( + A0, n0, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM); + } else if(partner < size_) { + (void)HPL_send( + A0, n0, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM); + } + } + ip2_ >>= 1; + } while(ip2_ > 0); + } + /* + * If nprow is not a power of 2, for all i in [ip2..nprow), proc[i-ip2] + * sends the pivot row to proc[i] along with the first four entries of + * the BUFFER array. + */ + if((Np2 != 0) && ((partner = (int)((unsigned int)(mydist) ^ ip2)) < nprow)) { + if((mydist & ip2) != 0) { + (void)HPL_recv(BUFFER, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + } else { + (void)HPL_send(BUFFER, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + } + } + +#endif + roctxRangePop(); +} diff --git a/src/comm/HPL_allgatherv.cpp b/src/comm/HPL_allgatherv.cpp new file mode 100644 index 0000000..17f0bad --- /dev/null +++ b/src/comm/HPL_allgatherv.cpp @@ -0,0 +1,128 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_allgatherv(double* BUF, + const int SCOUNT, + const int* RCOUNT, + const int* DISPL, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_allgatherv is a simple wrapper around an in-place MPI_Allgatherv. + * Its main purpose is to allow for some experimentation / tuning + * of this simple routine. Successful completion is indicated by + * the returned error code HPL_SUCCESS. + * + * Arguments + * ========= + * + * BUF (local input/output) double * + * On entry, BUF specifies the starting address of the buffer + * to be gathered. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in BUF contributed by this process to the gather.
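+ * (SCOUNT should equal RCOUNT[rank]: the gather is performed in + * place, so the contribution of this process is assumed to already + * reside at BUF + DISPL[rank].)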
+ * + * RCOUNT (local input) int * + * On entry, RCOUNT is an array of length SIZE specifying + * the number of double precision entries in BUF to receive from + * each process. + * + * DISPL (local input) int * + * On entry, DISPL is an array of length SIZE specifying the + * displacement (relative to BUF) from which to place the incoming + * data from each process. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + roctxRangePush("HPL_Allgatherv"); + +#ifdef HPL_USE_COLLECTIVES + + int ierr = MPI_Allgatherv( + MPI_IN_PLACE, SCOUNT, MPI_DOUBLE, BUF, RCOUNT, DISPL, MPI_DOUBLE, COMM); + +#else + + int rank, size, ierr = MPI_SUCCESS; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + /* + * Ring exchange + */ + const int npm1 = size - 1; + const int prev = MModSub1(rank, size); + const int next = MModAdd1(rank, size); + + const int tag = 0; + + for(int k = 0; k < npm1; k++) { + MPI_Request request; + MPI_Status status; + const int l = (int)((unsigned int)(k) >> 1); + + int il, lengthS, lengthR, partner, ibufS, ibufR; + if(((rank + k) & 1) != 0) { + il = MModAdd(rank, l, size); + ibufS = DISPL[il]; + lengthS = RCOUNT[il]; + il = MModSub(rank, l + 1, size); + ibufR = DISPL[il]; + lengthR = RCOUNT[il]; + partner = prev; + } else { + il = MModSub(rank, l, size); + ibufS = DISPL[il]; + lengthS = RCOUNT[il]; + il = MModAdd(rank, l + 1, size); + ibufR = DISPL[il]; + lengthR = RCOUNT[il]; + partner = next; + } + + if(lengthR > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Irecv( + BUF + ibufR, lengthR, MPI_DOUBLE, partner, tag, COMM, &request); + } + + if(lengthS > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(BUF + ibufS, lengthS, MPI_DOUBLE, partner, tag, COMM); + } + + if(lengthR > 0) { + if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); + } + } + +#endif + + roctxRangePop(); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_barrier.cpp b/src/comm/HPL_barrier.cpp new file mode 100644 index 0000000..747571d --- /dev/null +++ b/src/comm/HPL_barrier.cpp @@ -0,0 +1,40 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_barrier(MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_barrier blocks the caller until all process members have called it. + * The call returns at any process only after all group members have + * entered the call. + * + * Arguments + * ========= + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr = MPI_Barrier(COMM); + + return ((ierr == MPI_SUCCESS ?
HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_bcast.cpp b/src/comm/HPL_bcast.cpp new file mode 100644 index 0000000..7cfce11 --- /dev/null +++ b/src/comm/HPL_bcast.cpp @@ -0,0 +1,82 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast(double* SBUF, + int SCOUNT, + int ROOT, + MPI_Comm COMM, + HPL_T_TOP top) { + /* + * Purpose + * ======= + * + * HPL_bcast is a simple wrapper around MPI_Bcast. Its main purpose is + * to allow for some experimentation / tuning of this simple routine. + * Successful completion is indicated by the returned error code + * HPL_SUCCESS. In the case of messages of length less than or equal to + * zero, this function returns immediately. + * + * Arguments + * ========= + * + * SBUF (local input) double * + * On entry, SBUF specifies the starting address of buffer to be + * broadcast. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in SBUF. SCOUNT must be at least zero. + * + * ROOT (local input) int + * On entry, ROOT specifies the rank of the origin process in + * the communication space defined by COMM. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + if(SCOUNT <= 0) return (HPL_SUCCESS); + + int ierr; + + roctxRangePush("HPL_Bcast"); + +#ifdef HPL_USE_COLLECTIVES + + ierr = MPI_Bcast(SBUF, SCOUNT, MPI_DOUBLE, ROOT, COMM); + +#else + + switch(top) { + case HPL_1RING_M: ierr = HPL_bcast_1rinM(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_1RING: ierr = HPL_bcast_1ring(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_2RING_M: ierr = HPL_bcast_2rinM(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_2RING: ierr = HPL_bcast_2ring(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_BLONG_M: ierr = HPL_bcast_blonM(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_BLONG: ierr = HPL_bcast_blong(SBUF, SCOUNT, ROOT, COMM); break; + default: ierr = HPL_FAILURE; + } + +#endif + + roctxRangePop(); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_bcast_1rinM.cpp b/src/comm/HPL_bcast_1rinM.cpp new file mode 100644 index 0000000..7927d5a --- /dev/null +++ b/src/comm/HPL_bcast_1rinM.cpp @@ -0,0 +1,109 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*Root immediately sends to ROOT+1*/ + if(rank == ROOT) { + MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); + } else if(rank == MModAdd1(ROOT, size)) { + MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); + return MPI_SUCCESS; + } + + if(size == 2) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[2]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + + const int tag = rank; + const int next = + (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); + const int prev = + (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); + + double* RBUF = SBUF; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend = (rank == size - 1) ? 0 : SCOUNT; + int Nrecv = (rank == 0) ? 0 : SCOUNT; + + /*Recv from left*/ + int Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } + + /*Send to right if there is data present to send*/ + int Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } + + while(Nsend > 0 || Nrecv > 0) { + int index = -1; + MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv == Nsend) { + Ns = Nr; + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } + + /*Count the recv'd amounts*/ + Nrecv -= Nr; + RBUF += Nr; + + /*Post next recv if needed*/ + Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { + MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Sent to right */ + Nsend -= Ns; + SBUF += Ns; + + /*Send to right if there is data present to send*/ + Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_1ring.cpp b/src/comm/HPL_bcast_1ring.cpp new file mode 100644 index 0000000..4a1854f --- /dev/null +++ b/src/comm/HPL_bcast_1ring.cpp @@ -0,0 +1,99 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + // int chunk_size = 64 * 512; // 256KB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[2]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + + const int tag = rank; + const int next = MModAdd1(rank, size); + const int prev = MModSub1(rank, size); + + /*Mid point of message*/ + double* RBUF = SBUF; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend = (rank == size - 1) ? 0 : SCOUNT; + int Nrecv = (rank == 0) ? 0 : SCOUNT; + + /*Recv from left*/ + int Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } + + /*Send to right if there is data present to send*/ + int Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } + + while(Nsend > 0 || Nrecv > 0) { + int index = -1; + MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv == Nsend) { + Ns = Nr; + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } + + /*Count the recv'd amounts*/ + Nrecv -= Nr; + RBUF += Nr; + + /*Post next recv if needed*/ + Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { + MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Sent to right */ + Nsend -= Ns; + SBUF += Ns; + + /*Send to right if there is data present to send*/ + Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_2rinM.cpp b/src/comm/HPL_bcast_2rinM.cpp new file mode 100644 index 0000000..cea56cf --- /dev/null +++ b/src/comm/HPL_bcast_2rinM.cpp @@ -0,0 +1,165 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*Root immediately sends to ROOT+1*/ + if(rank == ROOT) { + MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); + } else if(rank == MModAdd1(ROOT, size)) { + MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); + return MPI_SUCCESS; + } + + if(size == 2) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[4]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + request[2] = MPI_REQUEST_NULL; + request[3] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + const int NchunksHalf = (Nchunks + 1) / 2; + + const int tag = rank; + const int next = + (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); + const int prev = + (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); + + /*Mid point of message*/ + double* SBUF0 = SBUF; + double* SBUF1 = SBUF + NchunksHalf * chunk_size; + + double* RBUF0 = SBUF0; + double* RBUF1 = SBUF1; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; + int Nsend1 = (rank == 2) ? 0 : SCOUNT - NchunksHalf * chunk_size; + + int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; + int Nrecv1 = (rank == 0) ? 0 : SCOUNT - NchunksHalf * chunk_size; + + /*Recv from left*/ + int Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } + + /*Recv from right*/ + int Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } + + /*Send to right if there is data present to send*/ + int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Send to left if there is data present to send*/ + int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { + int index = -1; + MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv0 == Nsend0) { + Ns0 = Nr0; + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Count the recv'd amounts*/ + Nrecv0 -= Nr0; + RBUF0 += Nr0; + + /*Post next recv if needed*/ + Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Recv'd from right*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv1 == Nsend1) { + Ns1 = Nr1; + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + /*Count the recv'd amounts*/ + Nrecv1 -= Nr1; + RBUF1 += Nr1; + + /*Post next recv if needed*/ + Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + + } 
else if(index == 2) { /*Sent to right */ + Nsend0 -= Ns0; + SBUF0 += Ns0; + + /*Send to right if there is data present to send*/ + Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } else { + request[2] = MPI_REQUEST_NULL; + } + } else { /*index==3, Sent to left */ + Nsend1 -= Ns1; + SBUF1 += Ns1; + + Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } else { + request[3] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_2ring.cpp b/src/comm/HPL_bcast_2ring.cpp new file mode 100644 index 0000000..d6c36dd --- /dev/null +++ b/src/comm/HPL_bcast_2ring.cpp @@ -0,0 +1,153 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[4]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + request[2] = MPI_REQUEST_NULL; + request[3] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + const int NchunksHalf = (Nchunks + 1) / 2; + + const int tag = rank; + const int next = MModAdd1(rank, size); + const int prev = MModSub1(rank, size); + + /*Mid point of message*/ + double* SBUF0 = SBUF; + double* SBUF1 = SBUF + NchunksHalf * chunk_size; + + double* RBUF0 = SBUF0; + double* RBUF1 = SBUF1; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; + int Nsend1 = (rank == 1) ? 0 : SCOUNT - NchunksHalf * chunk_size; + + int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; + int Nrecv1 = (rank == 0) ? 
0 : SCOUNT - NchunksHalf * chunk_size; + + /*Recv from left*/ + int Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } + + /*Recv from right*/ + int Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } + + /*Send to right if there is data present to send*/ + int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Send to left if there is data present to send*/ + int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { + int index = -1; + MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv0 == Nsend0) { + Ns0 = Nr0; + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Count the recv'd amounts*/ + Nrecv0 -= Nr0; + RBUF0 += Nr0; + + /*Post next recv if needed*/ + Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Recv'd from right*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv1 == Nsend1) { + Ns1 = Nr1; + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + /*Count the recv'd amounts*/ + Nrecv1 -= Nr1; + RBUF1 += Nr1; + + /*Post next recv if needed*/ + Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + + } else if(index == 2) { /*Sent to right */ + Nsend0 -= Ns0; + SBUF0 += Ns0; + + /*Send to right if there is data present to send*/ + Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } else { + request[2] = MPI_REQUEST_NULL; + } + } else { /*index==3, Sent to left */ + Nsend1 -= Ns1; + SBUF1 += Ns1; + + Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } else { + request[3] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_blonM.cpp b/src/comm/HPL_bcast_blonM.cpp new file mode 100644 index 0000000..fd85f13 --- /dev/null +++ b/src/comm/HPL_bcast_blonM.cpp @@ -0,0 +1,185 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /* + * Cast phase: ROOT process sends to its right neighbor, then spread + * the panel on the other npcol - 2 processes. 
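+ * (The spread here is a hypercube scatter of panel pieces over those + * npcol - 2 processes, followed by a ring roll, below, that + * reassembles the full panel on every process.)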
If I am not the ROOT + * process, probe for message received. If the message is there, then + * receive it. If I am just after the ROOT process, return. Otherwise, + * keep spreading on those npcol - 2 processes. Otherwise, inform the + * caller that the panel has still not been received. + */ + int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2 = 1, k, l, lbuf, + lbufR, lbufS, mask = 1, mydist, mydist2, next, npm1, npm2, partner, + prev; + + const int tag = ROOT; + next = MModAdd1(rank, size); + prev = MModSub1(rank, size); + + if(rank == ROOT) { + if(ierr == MPI_SUCCESS) + ierr = + MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(rank, size), tag, COMM); + } else if(prev == ROOT) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Recv( + SBUF, SCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); + } + /* + * if I am just after the ROOT, exit now. The message receive completed + * successfully, this guy is done. If there are only 2 processes in each + * row of processes, we are done as well. + */ + if((prev == ROOT) || (size == 2)) return ierr; + /* + * Otherwise, proceed with broadcast - Spread the panel across process + * columns + */ + npm2 = (npm1 = size - 1) - 1; + + k = npm2; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + if(rank == ROOT) + mydist2 = (mydist = 0); + else + mydist2 = (mydist = MModSub(rank, ROOT, size) - 1); + + indx = ip2; + count = SCOUNT / npm1; + count = Mmax(count, 1); + + do { + mask ^= ip2; + + if((mydist & mask) == 0) { + lbuf = SCOUNT - (ibuf = indx * count); + if(indx + ip2 < npm1) { + l = ip2 * count; + lbuf = Mmin(lbuf, l); + } + + partner = mydist ^ ip2; + + if((mydist & ip2) != 0) { + partner = MModAdd(ROOT, partner, size); + if(partner != ROOT) partner = MModAdd1(partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Recv(SBUF + ibuf, + lbuf, + MPI_DOUBLE, + partner, + tag, + COMM, + MPI_STATUS_IGNORE); + } + } else if(partner < npm1) { + partner = MModAdd(ROOT, partner, size); + if(partner != ROOT) partner = MModAdd1(partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); + } + } + } + + if(mydist2 < ip2) { + ip2 >>= 1; + indx -= ip2; + } else { + mydist2 -= ip2; + ip2 >>= 1; + indx += ip2; + } + + } while(ip2 > 0); + /* + * Roll the pieces + */ + if(MModSub1(prev, size) == ROOT) prev = ROOT; + if(rank == ROOT) next = MModAdd1(next, size); + + for(k = 0; k < npm2; k++) { + l = (k >> 1); + /* + * Who is sending to who and how much + */ + if(((mydist + k) & 1) != 0) { + ibufS = (indx = MModAdd(mydist, l, npm1)) * count; + lbufS = (indx == npm2 ? SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModSub(mydist, l + 1, npm1)) * count; + lbufR = (indx == npm2 ? SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = prev; + } else { + ibufS = (indx = MModSub(mydist, l, npm1)) * count; + lbufS = (indx == npm2 ? SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModAdd(mydist, l + 1, npm1)) * count; + lbufR = (indx == npm2 ? 
SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = next; + } + /* + * Exchange the messages + */ + MPI_Request request; + MPI_Status status; + + if(lbufR > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Irecv( + SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); + } + + if(lbufS > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); + } + + if(lbufR > 0) + if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); + } + + return ierr; +} diff --git a/src/comm/HPL_bcast_blong.cpp b/src/comm/HPL_bcast_blong.cpp new file mode 100644 index 0000000..9f3f81e --- /dev/null +++ b/src/comm/HPL_bcast_blong.cpp @@ -0,0 +1,161 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /* + * Cast phase: If I am the ROOT process, start spreading the panel. If + * I am not the ROOT process, test for message receive completion. If + * the message is there, then receive it, and keep spreading in a + * blocking fashion this time. Otherwise, inform the caller that the + * panel has still not been received. + */ + int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2, k, l, lbuf, + lbufR, lbufS, mask, mydist, mydist2, npm1, partner, next, prev; + + const int tag = 0; + + // ip2 : largest power of two <= size-1; + // mask : ip2 procs hypercube mask; + mask = ip2 = 1; + k = size - 1; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + + npm1 = size - 1; + mydist2 = (mydist = MModSub(rank, ROOT, size)); + indx = ip2; + count = SCOUNT / size; + count = Mmax(count, 1); + /* + * Spread the panel across process columns + */ + do { + mask ^= ip2; + + if((mydist & mask) == 0) { + lbuf = SCOUNT - (ibuf = indx * count); + if(indx + ip2 < size) { + l = ip2 * count; + lbuf = Mmin(lbuf, l); + } + + partner = mydist ^ ip2; + + if((mydist & ip2) != 0) { + partner = MModAdd(ROOT, partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Recv(SBUF + ibuf, + lbuf, + MPI_DOUBLE, + partner, + tag, + COMM, + MPI_STATUS_IGNORE); + } + } else if(partner < size) { + partner = MModAdd(ROOT, partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); + } + } + } + + if(mydist2 < ip2) { + ip2 >>= 1; + indx -= ip2; + } else { + mydist2 -= ip2; + ip2 >>= 1; + indx += ip2; + } + + } while(ip2 > 0); + /* + * Roll the pieces + */ + prev = MModSub1(rank, size); + next = MModAdd1(rank, size); + + for(k = 0; k < npm1; k++) { + l = (k >> 1); + /* + * Who is sending to who and how much + */ + if(((mydist + k) & 1) != 0) { + ibufS = (indx = MModAdd(mydist, l, size)) * count; + lbufS = (indx == npm1 ? 
SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModSub(mydist, l + 1, size)) * count; + lbufR = (indx == npm1 ? SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = prev; + } else { + ibufS = (indx = MModSub(mydist, l, size)) * count; + lbufS = (indx == npm1 ? SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModAdd(mydist, l + 1, size)) * count; + lbufR = (indx == npm1 ? SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = next; + } + /* + * Exchange the messages + */ + MPI_Request request; + MPI_Status status; + + if(lbufR > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Irecv( + SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); + } + + if(lbufS > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); + } + + if(lbufR > 0) + if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); + } + + return ierr; +} diff --git a/src/comm/HPL_broadcast.cpp b/src/comm/HPL_broadcast.cpp new file mode 100644 index 0000000..e9362f5 --- /dev/null +++ b/src/comm/HPL_broadcast.cpp @@ -0,0 +1,58 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_broadcast(void* BUFFER, + const int COUNT, + const HPL_T_TYPE DTYPE, + const int ROOT, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_broadcast broadcasts a message from the process with rank ROOT to + * all processes in the group. + * + * Arguments + * ========= + * + * BUFFER (local input/output) void * + * On entry, BUFFER points to the buffer to be broadcast. On + * exit, this array contains the broadcast data and is identical + * on all processes in the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be at least zero. + * + * DTYPE (global input) const HPL_T_TYPE + * On entry, DTYPE specifies the type of the buffers operands. + * + * ROOT (global input) const int + * On entry, ROOT is the coordinate of the source process. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr = MPI_Bcast(BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), ROOT, COMM); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_recv.cpp b/src/comm/HPL_recv.cpp new file mode 100644 index 0000000..665f9bb --- /dev/null +++ b/src/comm/HPL_recv.cpp @@ -0,0 +1,63 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_recv(double* RBUF, int RCOUNT, int SRC, int RTAG, MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_recv is a simple wrapper around MPI_Recv. Its main purpose is + * to allow for some experimentation / tuning of this simple routine. + * Successful completion is indicated by the returned error code + * HPL_SUCCESS. In the case of messages of length less than or equal to + * zero, this function returns immediately. + * + * Arguments + * ========= + * + * RBUF (local output) double * + * On entry, RBUF specifies the starting address of buffer to be + * received. + * + * RCOUNT (local input) int + * On entry, RCOUNT specifies the number of double precision + * entries in RBUF. RCOUNT must be at least zero. + * + * SRC (local input) int + * On entry, SRC specifies the rank of the sending process in + * the communication space defined by COMM. + * + * RTAG (local input) int + * On entry, RTAG specifies the message tag to be used for this + * communication operation. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + if(RCOUNT <= 0) return (HPL_SUCCESS); + + MPI_Status status; + + int ierr = + MPI_Recv((void*)(RBUF), RCOUNT, MPI_DOUBLE, SRC, RTAG, COMM, &status); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_reduce.cpp b/src/comm/HPL_reduce.cpp new file mode 100644 index 0000000..ab378f3 --- /dev/null +++ b/src/comm/HPL_reduce.cpp @@ -0,0 +1,74 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_reduce(void* BUFFER, + const int COUNT, + const HPL_T_TYPE DTYPE, + const HPL_T_OP OP, + const int ROOT, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_reduce performs a global reduce operation across all processes of + * a group. Note that the input buffer is used as a workarray, so its + * original data is corrupted in all processes but the accumulating + * process. + * + * Arguments + * ========= + * + * BUFFER (local input/output) void * + * On entry, BUFFER points to the buffer to be reduced. On + * exit, and in process of rank ROOT this array contains the + * reduced data. This buffer is also used as workspace during + * the operation in the other processes of the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be at least zero. + * + * DTYPE (global input) const HPL_T_TYPE + * On entry, DTYPE specifies the type of the buffers operands.
+ * + * OP (global input) const HPL_T_OP + * On entry, OP is a pointer to the local combine function. + * + * ROOT (global input) const int + * On entry, ROOT is the coordinate of the accumulating process. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr; + + int rank; + MPI_Comm_rank(COMM, &rank); + + if(rank == ROOT) + ierr = MPI_Reduce( + MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM); + else + ierr = + MPI_Reduce(BUFFER, NULL, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_scatterv.cpp b/src/comm/HPL_scatterv.cpp new file mode 100644 index 0000000..433be71 --- /dev/null +++ b/src/comm/HPL_scatterv.cpp @@ -0,0 +1,125 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_scatterv(double* BUF, + const int* SCOUNT, + const int* DISPL, + const int RCOUNT, + int ROOT, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_scatterv is a simple wrapper around an in-place MPI_Scatterv. + * Its main purpose is to allow for some experimentation / tuning + * of this simple routine. Successful completion is indicated by + * the returned error code HPL_SUCCESS. + * + * Arguments + * ========= + * + * BUF (local input/output) double * + * On entry, on the root process BUF specifies the starting + * address of buffer to be scattered. On non-root processes, + * BUF specifies the starting point of the received buffer. + * + * SCOUNT (local input) int * + * On entry, SCOUNT is an array of length SIZE specifying + * the number of double precision entries in BUF to send to + * each process. + * + * DISPL (local input) int * + * On entry, DISPL is an array of length SIZE specifying the + * displacement (relative to BUF) from which to take the outgoing + * data to each process from the root process, and the displacement + * (relative to BUF) from which to receive the incoming data on + * each non-root process. + * + * RCOUNT (local input) int + * On entry, RCOUNT specifies the number of double precision + * entries in BUF to be received from the ROOT process. + * + * ROOT (local input) int + * On entry, ROOT specifies the rank of the origin process in + * the communication space defined by COMM. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space.
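+ * + * (Note that the scatter is performed in place on the root process: + * the root's own block at BUF + DISPL[ROOT] stays where it is, and + * no message is sent from the root to itself.)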
+ * + * --------------------------------------------------------------------- + */ + + int rank, ierr = MPI_SUCCESS; + MPI_Comm_rank(COMM, &rank); + + roctxRangePush("HPL_Scatterv"); + +#ifdef HPL_USE_COLLECTIVES + + if(rank == ROOT) { + ierr = MPI_Scatterv(BUF, + SCOUNT, + DISPL, + MPI_DOUBLE, + MPI_IN_PLACE, + RCOUNT, + MPI_DOUBLE, + ROOT, + COMM); + } else { + ierr = MPI_Scatterv( + NULL, SCOUNT, DISPL, MPI_DOUBLE, BUF, RCOUNT, MPI_DOUBLE, ROOT, COMM); + } + +#else + + int size; + MPI_Comm_size(COMM, &size); + + const int tag = ROOT; + if(rank == ROOT) { + MPI_Request requests[size]; + + /*Just send size-1 messages*/ + for(int i = 0; i < size; ++i) { + + requests[i] = MPI_REQUEST_NULL; + + if(i == ROOT) { continue; } + const int ibuf = DISPL[i]; + const int lbuf = SCOUNT[i]; + + if(lbuf > 0) { + (void)MPI_Isend( + BUF + ibuf, lbuf, MPI_DOUBLE, i, tag, COMM, requests + i); + } + } + + MPI_Waitall(size, requests, MPI_STATUSES_IGNORE); + } else { + if(RCOUNT > 0) + ierr = + MPI_Recv(BUF, RCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); + } + +#endif + roctxRangePop(); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_sdrv.cpp b/src/comm/HPL_sdrv.cpp new file mode 100644 index 0000000..2fa24ec --- /dev/null +++ b/src/comm/HPL_sdrv.cpp @@ -0,0 +1,91 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_sdrv(double* SBUF, + int SCOUNT, + int STAG, + double* RBUF, + int RCOUNT, + int RTAG, + int PARTNER, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_sdrv is a simple wrapper around MPI_Sendrecv. Its main purpose is + * to allow for some experimentation and tuning of this simple function. + * Messages of length less than or equal to zero are not sent nor + * received. Successful completion is indicated by the returned error + * code HPL_SUCCESS. + * + * Arguments + * ========= + * + * SBUF (local input) double * + * On entry, SBUF specifies the starting address of buffer to be + * sent. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in SBUF. SCOUNT must be at least zero. + * + * STAG (local input) int + * On entry, STAG specifies the message tag to be used for the + * sending communication operation. + * + * RBUF (local output) double * + * On entry, RBUF specifies the starting address of buffer to be + * received. + * + * RCOUNT (local input) int + * On entry, RCOUNT specifies the number of double precision + * entries in RBUF. RCOUNT must be at least zero. + * + * RTAG (local input) int + * On entry, RTAG specifies the message tag to be used for the + * receiving communication operation. + * + * PARTNER (local input) int + * On entry, PARTNER specifies the rank of the collaborative + * process in the communication space defined by COMM. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. 
+ * + * --------------------------------------------------------------------- + */ + + MPI_Status status; + int ierr; + + ierr = MPI_Sendrecv(SBUF, + SCOUNT, + MPI_DOUBLE, + PARTNER, + STAG, + RBUF, + RCOUNT, + MPI_DOUBLE, + PARTNER, + RTAG, + COMM, + &status); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_send.cpp b/src/comm/HPL_send.cpp new file mode 100644 index 0000000..5dae5c7 --- /dev/null +++ b/src/comm/HPL_send.cpp @@ -0,0 +1,60 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_send(double* SBUF, int SCOUNT, int DEST, int STAG, MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_send is a simple wrapper around MPI_Send. Its main purpose is + * to allow for some experimentation / tuning of this simple routine. + * Successful completion is indicated by the returned error code + * HPL_SUCCESS. In the case of messages of length less than or equal to + * zero, this function returns immediately. + * + * Arguments + * ========= + * + * SBUF (local input) double * + * On entry, SBUF specifies the starting address of buffer to be + * sent. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in SBUF. SCOUNT must be at least zero. + * + * DEST (local input) int + * On entry, DEST specifies the rank of the receiving process in + * the communication space defined by COMM. + * + * STAG (local input) int + * On entry, STAG specifies the message tag to be used for this + * communication operation. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + if(SCOUNT <= 0) return (HPL_SUCCESS); + + int ierr = MPI_Send((void*)(SBUF), SCOUNT, MPI_DOUBLE, DEST, STAG, COMM); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/grid/HPL_grid_exit.cpp b/src/grid/HPL_grid_exit.cpp new file mode 100644 index 0000000..be94d9c --- /dev/null +++ b/src/grid/HPL_grid_exit.cpp @@ -0,0 +1,58 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_grid_exit(HPL_T_grid* GRID) { + /* + * Purpose + * ======= + * + * HPL_grid_exit marks the process grid object for deallocation. The + * returned error code MPI_SUCCESS indicates successful completion. + * Other error codes are (MPI) implementation dependent.
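+ * The row, column and all-process communicators are freed, and the + * grid coordinates are reset to invalid values.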
+ * + * Arguments + * ========= + * + * GRID (local input/output) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid to be released. + * + * --------------------------------------------------------------------- + */ + + int hplerr = MPI_SUCCESS, mpierr; + + if(GRID->all_comm != MPI_COMM_NULL) { + mpierr = MPI_Comm_free(&(GRID->row_comm)); + if(mpierr != MPI_SUCCESS) hplerr = mpierr; + mpierr = MPI_Comm_free(&(GRID->col_comm)); + if(mpierr != MPI_SUCCESS) hplerr = mpierr; + mpierr = MPI_Comm_free(&(GRID->all_comm)); + if(mpierr != MPI_SUCCESS) hplerr = mpierr; + } + + GRID->order = HPL_COLUMN_MAJOR; + + GRID->iam = GRID->myrow = GRID->mycol = -1; + GRID->nprow = GRID->npcol = GRID->nprocs = -1; + + GRID->row_ip2 = GRID->row_hdim = GRID->row_ip2m1 = GRID->row_mask = -1; + GRID->col_ip2 = GRID->col_hdim = GRID->col_ip2m1 = GRID->col_mask = -1; + + return (hplerr); +} diff --git a/src/grid/HPL_grid_info.cpp b/src/grid/HPL_grid_info.cpp new file mode 100644 index 0000000..51db2b6 --- /dev/null +++ b/src/grid/HPL_grid_info.cpp @@ -0,0 +1,66 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_grid_info(const HPL_T_grid* GRID, + int* NPROW, + int* NPCOL, + int* MYROW, + int* MYCOL) { + /* + * Purpose + * ======= + * + * HPL_grid_info returns the grid shape and the coordinates in the grid + * of the calling process. Successful completion is indicated by the + * returned error code MPI_SUCCESS. Other error codes depend on the MPI + * implementation. + * + * Arguments + * ========= + * + * GRID (local input) const HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * NPROW (global output) int * + * On exit, NPROW specifies the number of process rows in the + * grid. NPROW is at least one. + * + * NPCOL (global output) int * + * On exit, NPCOL specifies the number of process columns in + * the grid. NPCOL is at least one. + * + * MYROW (global output) int * + * On exit, MYROW specifies my row process coordinate in the + * grid. MYROW is greater than or equal to zero and less than + * NPROW. + * + * MYCOL (global output) int * + * On exit, MYCOL specifies my column process coordinate in the + * grid. MYCOL is greater than or equal to zero and less than + * NPCOL. + * + * --------------------------------------------------------------------- + */ + + *NPROW = GRID->nprow; + *NPCOL = GRID->npcol; + *MYROW = GRID->myrow; + *MYCOL = GRID->mycol; + return (MPI_SUCCESS); +} diff --git a/src/grid/HPL_grid_init.cpp b/src/grid/HPL_grid_init.cpp new file mode 100644 index 0000000..ca885bf --- /dev/null +++ b/src/grid/HPL_grid_init.cpp @@ -0,0 +1,190 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_grid_init(MPI_Comm COMM, + const HPL_T_ORDER ORDER, + const int NPROW, + const int NPCOL, + const int p, + const int q, + HPL_T_grid* GRID) { + /* + * Purpose + * ======= + * + * HPL_grid_init creates a NPROW x NPCOL process grid using column- or + * row-major ordering from an initial collection of processes identified + * by an MPI communicator. Successful completion is indicated by the + * returned error code MPI_SUCCESS. Other error codes depend on the MPI + * implementation. The coordinates of processes that are not part of the + * grid are set to values outside of [0..NPROW) x [0..NPCOL). + * + * Arguments + * ========= + * + * COMM (global/local input) MPI_Comm + * On entry, COMM is the MPI communicator identifying the + * initial collection of processes out of which the grid is + * formed. + * + * ORDER (global input) const HPL_T_ORDER + * On entry, ORDER specifies how the processes should be ordered + * in the grid as follows: + * ORDER = HPL_ROW_MAJOR row-major ordering; + * ORDER = HPL_COLUMN_MAJOR column-major ordering; + * + * NPROW (global input) const int + * On entry, NPROW specifies the number of process rows in the + * grid to be created. NPROW must be at least one. + * + * NPCOL (global input) const int + * On entry, NPCOL specifies the number of process columns in + * the grid to be created. NPCOL must be at least one. + * + * GRID (local input/output) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information to be initialized. 
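+ *
+ * p (global input) const int
+ * On entry, p specifies the number of rows of the node-local
+ * process grid. (This argument is not described in the original
+ * header; the description here is inferred from the code below.)
+ *
+ * q (global input) const int
+ * On entry, q specifies the number of columns of the node-local
+ * process grid, so that p * q is the number of ranks per node
+ * (also inferred from the code below).
+ *
+ * Illustrative example: with NPROW = 4, NPCOL = 2, p = 2, q = 1 and
+ * column-major ordering, the 8 ranks are grouped in pairs per node;
+ * ranks 0..3 form process column 0 and ranks 4..7 form process
+ * column 1, each node contributing a 2 x 1 block of the grid.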
+ * + * --------------------------------------------------------------------- + */ + + int hdim, hplerr = MPI_SUCCESS, ierr, ip2, k, mask, mycol, myrow, nprocs, + rank, size; + int local_myrow, local_mycol; + + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + /* + * Abort if illegal process grid + */ + nprocs = NPROW * NPCOL; + if((nprocs > size) || (NPROW < 1) || (NPCOL < 1)) { + HPL_pabort(__LINE__, "HPL_grid_init", "Illegal Grid"); + } + /* + * Row- or column-major ordering of the processes + */ + int local_size = p * q; + int local_rank = rank % local_size; + int node = rank / local_size; // node number + + if(ORDER == HPL_ROW_MAJOR) { + GRID->order = HPL_ROW_MAJOR; + local_mycol = local_rank % q; + local_myrow = local_rank / q; + + int noderow = node / (NPCOL / q); + int nodecol = node % (NPCOL / q); + + myrow = noderow * p + local_myrow; + mycol = nodecol * q + local_mycol; + + myrow = rank / NPCOL; + mycol = rank - myrow * NPCOL; + } else { + GRID->order = HPL_COLUMN_MAJOR; + local_mycol = local_rank / p; + local_myrow = local_rank % p; + + int noderow = node % (NPROW / p); + int nodecol = node / (NPROW / p); + + myrow = noderow * p + local_myrow; + mycol = nodecol * q + local_mycol; + } + + GRID->iam = rank; + GRID->local_myrow = local_myrow; + GRID->local_mycol = local_mycol; + GRID->myrow = myrow; + GRID->mycol = mycol; + GRID->local_nprow = p; + GRID->local_npcol = q; + GRID->nprow = NPROW; + GRID->npcol = NPCOL; + GRID->nprocs = nprocs; + /* + * row_ip2 : largest power of two <= nprow; + * row_hdim : row_ip2 procs hypercube dim; + * row_ip2m1 : largest power of two <= nprow-1; + * row_mask : row_ip2m1 procs hypercube mask; + */ + hdim = 0; + ip2 = 1; + k = NPROW; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + hdim++; + } + GRID->row_ip2 = ip2; + GRID->row_hdim = hdim; + + mask = ip2 = 1; + k = NPROW - 1; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + GRID->row_ip2m1 = ip2; + GRID->row_mask = mask; + /* + * col_ip2 : largest power of two <= npcol; + * col_hdim : col_ip2 procs hypercube dim; + * col_ip2m1 : largest power of two <= npcol-1; + * col_mask : col_ip2m1 procs hypercube mask; + */ + hdim = 0; + ip2 = 1; + k = NPCOL; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + hdim++; + } + GRID->col_ip2 = ip2; + GRID->col_hdim = hdim; + + mask = ip2 = 1; + k = NPCOL - 1; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + GRID->col_ip2m1 = ip2; + GRID->col_mask = mask; + /* + * All communicator, leave if I am not part of this grid. Creation of the + * row- and column communicators. + */ + ierr = MPI_Comm_split( + COMM, (rank < nprocs ? 0 : MPI_UNDEFINED), rank, &(GRID->all_comm)); + if(GRID->all_comm == MPI_COMM_NULL) return (ierr); + + ierr = MPI_Comm_split(GRID->all_comm, myrow, mycol, &(GRID->row_comm)); + if(ierr != MPI_SUCCESS) hplerr = ierr; + + ierr = MPI_Comm_split(GRID->all_comm, mycol, myrow, &(GRID->col_comm)); + if(ierr != MPI_SUCCESS) hplerr = ierr; + + return (hplerr); +} diff --git a/src/matgen/HPL_pdmatgen.cpp b/src/matgen/HPL_pdmatgen.cpp new file mode 100644 index 0000000..0f4c50f --- /dev/null +++ b/src/matgen/HPL_pdmatgen.cpp @@ -0,0 +1,262 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <unistd.h>  // sysconf
+#include <algorithm> // std::min
+#include <cassert>   // assert
+#include <omp.h>     // omp_get_thread_num, omp_get_num_threads
+
+const int max_nthreads = 128;
+
+static int Malloc(HPL_T_grid* GRID,
+                  void** ptr,
+                  const size_t bytes,
+                  int info[3]) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  unsigned long pg_size = sysconf(_SC_PAGESIZE);
+  int err = posix_memalign(ptr, pg_size, bytes);
+
+  /*Check allocation is valid*/
+  info[0] = (err != 0);
+  info[1] = myrow;
+  info[2] = mycol;
+  (void)HPL_all_reduce((void*)(info), 3, HPL_INT, HPL_MAX, GRID->all_comm);
+  if(info[0] != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int hostMalloc(HPL_T_grid* GRID,
+                      void** ptr,
+                      const size_t bytes,
+                      int info[3]) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipHostMalloc(ptr, bytes);
+
+  /*Check allocation is valid*/
+  info[0] = (err != hipSuccess);
+  info[1] = myrow;
+  info[2] = mycol;
+  (void)HPL_all_reduce((void*)(info), 3, HPL_INT, HPL_MAX, GRID->all_comm);
+  if(info[0] != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int deviceMalloc(HPL_T_grid* GRID,
+                        void** ptr,
+                        const size_t bytes,
+                        int info[3]) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipMalloc(ptr, bytes);
+
+  /*Check allocation is valid*/
+  info[0] = (err != hipSuccess);
+  info[1] = myrow;
+  info[2] = mycol;
+  (void)HPL_all_reduce((void*)(info), 3, HPL_INT, HPL_MAX, GRID->all_comm);
+  if(info[0] != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+int HPL_pdmatgen(HPL_T_test* TEST,
+                 HPL_T_grid* GRID,
+                 HPL_T_palg* ALGO,
+                 HPL_T_pmat* mat,
+                 const int N,
+                 const int NB) {
+
+  int ii, ip2, im4096;
+  int mycol, myrow, npcol, nprow, nq, info[3];
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  mat->n    = N;
+  mat->nb   = NB;
+  mat->info = 0;
+  mat->mp   = HPL_numroc(N, NB, NB, myrow, 0, nprow);
+  nq        = HPL_numroc(N, NB, NB, mycol, 0, npcol);
+  /*
+   * Allocate matrix, right-hand-side, and vector solution x. [ A | b ] is
+   * N by N+1. One column is added in every process column for the solve.
+   * The result however is stored in a 1 x N vector replicated in every
+   * process row. In every process, A is lda * (nq+1), x is 1 * nq and the
+   * workspace is mp.
+   */
+  mat->ld = Mmax(1, mat->mp);
+  mat->ld = ((mat->ld + 95) / 128) * 128 + 32; /*pad*/
+
+  mat->nq = nq + 1;
+
+  mat->dA = nullptr;
+  mat->dX = nullptr;
+
+  mat->dW = nullptr;
+  mat->W  = nullptr;
+
+  /* Create a rocBLAS handle */
+  rocblas_create_handle(&handle);
+  rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
+  rocblas_initialize();
+  rocblas_set_stream(handle, computeStream);
+
+  /*
+   * Allocate dynamic memory
+   */
+
+  // allocate on device
+  size_t numbytes = ((size_t)(mat->ld) * (size_t)(mat->nq)) * sizeof(double);
+
+#ifdef HPL_VERBOSE_PRINT
+  if((myrow == 0) && (mycol == 0)) {
+    printf("Local matrix size = %g GBs\n",
+           ((double)numbytes) / (1024 * 1024 * 1024));
+  }
+#endif
+
+  if(deviceMalloc(GRID, (void**)&(mat->dA), numbytes, info) != HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Device memory allocation failed for A and b. Skip.");
+    return HPL_FAILURE;
+  }
+
+  // separate space for X vector
+  if(deviceMalloc(GRID, (void**)&(mat->dX), mat->nq * sizeof(double), info) !=
+     HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Device memory allocation failed for x. Skip.");
+    return HPL_FAILURE;
+  }
+
+  int Anp;
+  Mnumroc(Anp, mat->n, mat->nb, mat->nb, myrow, 0, nprow);
+
+  /*Need space for a column of panels for pdfact on CPU*/
+  size_t A_hostsize = mat->ld * mat->nb * sizeof(double);
+
+  if(hostMalloc(GRID, (void**)&(mat->A), A_hostsize, info) != HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Panel memory allocation failed. Skip.");
+    return HPL_FAILURE;
+  }
+
+#pragma omp parallel
+  {
+    /*First touch*/
+    const int thread_rank = omp_get_thread_num();
+    const int thread_size = omp_get_num_threads();
+    assert(thread_size <= max_nthreads);
+
+    for(int i = 0; i < mat->ld; i += NB) {
+      if((i / NB) % thread_size == thread_rank) {
+        const int mm = std::min(NB, mat->ld - i);
+        for(int k = 0; k < NB; ++k) {
+          for(int j = 0; j < mm; ++j) {
+            mat->A[j + i + static_cast<size_t>(mat->ld) * k] = 0.0;
+          }
+        }
+      }
+    }
+  }
+
+  size_t dworkspace_size = 0;
+  size_t workspace_size  = 0;
+
+  /*pdtrsv needs two vectors for B and W (and X on host) */
+  dworkspace_size = Mmax(2 * Anp * sizeof(double), dworkspace_size);
+  workspace_size  = Mmax((2 * Anp + nq) * sizeof(double), workspace_size);
+
+  /*Scratch space for rows in pdlaswp (with extra space for padding) */
+  dworkspace_size =
+      Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), dworkspace_size);
+  workspace_size =
+      Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), workspace_size);
+
+  if(deviceMalloc(GRID, (void**)&(mat->dW), dworkspace_size, info) !=
+     HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Device memory allocation failed for U workspace. Skip.");
+    return HPL_FAILURE;
+  }
+  if(hostMalloc(GRID, (void**)&(mat->W), workspace_size, info) != HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Host memory allocation failed for U workspace. Skip.");
+    return HPL_FAILURE;
+  }
+
+  return HPL_SUCCESS;
+}
+
+void HPL_pdmatfree(HPL_T_pmat* mat) {
+
+  if(mat->dA) {
+    hipFree(mat->dA);
+    mat->dA = nullptr;
+  }
+  if(mat->dX) {
+    hipFree(mat->dX);
+    mat->dX = nullptr;
+  }
+  if(mat->dW) {
+    hipFree(mat->dW);
+    mat->dW = nullptr;
+  }
+
+  if(mat->A) {
+    hipHostFree(mat->A);
+    mat->A = nullptr;
+  }
+  if(mat->W) {
+    hipHostFree(mat->W);
+    mat->W = nullptr;
+  }
+
+  rocblas_destroy_handle(handle);
+}
diff --git a/src/matgen/HPL_pdrandmat_device.cpp b/src/matgen/HPL_pdrandmat_device.cpp
new file mode 100644
index 0000000..66b16c7
--- /dev/null
+++ b/src/matgen/HPL_pdrandmat_device.cpp
@@ -0,0 +1,201 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +#define BLOCK_SIZE 512 + +__global__ void hpl_randmat(const int mp, + const int nq, + const int NB, + const int LDA, + const uint64_t cblkjumpA, + const uint64_t cblkjumpC, + const uint64_t rblkjumpA, + const uint64_t rblkjumpC, + const uint64_t cjumpA, + const uint64_t cjumpC, + const uint64_t rjumpA, + const uint64_t rjumpC, + const uint64_t startrand, + double* __restrict__ A) { + + const int jblk = blockIdx.y; + const int iblk = blockIdx.x; + + /* Get panel size */ + const int jb = (jblk == gridDim.y - 1) ? nq - ((nq - 1) / NB) * NB : NB; + const int ib = (iblk == gridDim.x - 1) ? mp - ((mp - 1) / NB) * NB : NB; + + double* Ab = A + iblk * NB + static_cast(jblk * NB) * LDA; + + /* Start at first uint64_t */ + uint64_t irand = startrand; + + /* Jump rand M*NB*npcol for each jblk */ + for(int j = 0; j < jblk; ++j) { irand = cblkjumpA * irand + cblkjumpC; } + + /* Jump rand NB*nprow for each iblk */ + for(int i = 0; i < iblk; ++i) { irand = rblkjumpA * irand + rblkjumpC; } + + /* Shift per-column irand */ + const int n = threadIdx.x; + for(int j = 0; j < threadIdx.x; ++j) { irand = cjumpA * irand + cjumpC; } + + for(int n = threadIdx.x; n < jb; n += blockDim.x) { + /*Grab rand at top of block*/ + uint64_t r = irand; + + /* Each thread traverses a column */ + for(int m = 0; m < ib; ++m) { + /*Generate a random double from the current r */ + const double p1 = ((r & (65535LU << 0)) >> 0); + const double p2 = ((r & (65535LU << 16)) >> 16); + const double p3 = ((r & (65535LU << 32)) >> 32); + const double p4 = ((r & (65535LU << 48)) >> 48); + + Ab[m + n * LDA] = + (HPL_HALF - (((p1) + (p2)*HPL_POW16) / HPL_DIVFAC * HPL_HALF + (p3) + + (p4)*HPL_POW16) / + HPL_DIVFAC * HPL_HALF); + + /*Increment rand*/ + r = rjumpA * r + rjumpC; + } + + /* Block-shift per-column irand */ + for(int j = 0; j < blockDim.x; ++j) { irand = cjumpA * irand + cjumpC; } + } +} + +void HPL_pdrandmat(const HPL_T_grid* GRID, + const int M, + const int N, + const int NB, + double* A, + const int LDA, + const int ISEED) { + /* + * Purpose + * ======= + * + * HPL_pdrandmat generates (or regenerates) a parallel random matrix A. + * + * The pseudo-random generator uses the linear congruential algorithm: + * X(n+1) = (a * X(n) + c) mod m as described in the Art of Computer + * Programming, Knuth 1973, Vol. 2. + * + * Arguments + * ========= + * + * GRID (local input) const HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * M (global input) const int + * On entry, M specifies the number of rows of the matrix A. + * M must be at least zero. + * + * N (global input) const int + * On entry, N specifies the number of columns of the matrix A. + * N must be at least zero. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * A (local output) double * + * On entry, A points to an array of dimension (LDA,LocQ(N)). + * On exit, this array contains the coefficients of the randomly + * generated matrix. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,LocP(M)). + * + * ISEED (global input) const int + * On entry, ISEED specifies the seed number to generate the + * matrix A. ISEED must be at least zero. 
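+ *
+ * Note: a jump of k entries in the sequence composes to
+ * X(n+k) = A_k * X(n) + C_k (mod 2^64), with A_k = a^k and
+ * C_k = (a^(k-1) + ... + a + 1) * c. The coefficients used by the
+ * kernel are computed below via HPL_xjumpm: one column step is a
+ * jump of M entries, one local row block a jump of nprow*NB entries,
+ * and one local column block a jump of M*NB*npcol entries.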
+ *
+ * ---------------------------------------------------------------------
+ */
+  int mp, mycol, myrow, npcol, nprow, nq;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  uint64_t mult64  = HPL_MULT;
+  uint64_t iadd64  = HPL_IADD;
+  uint64_t jseed64 = static_cast<uint64_t>(ISEED);
+
+  /*
+   * Generate an M by N matrix starting in process (0,0)
+   */
+  Mnumroc(mp, M, NB, NB, myrow, 0, nprow);
+  Mnumroc(nq, N, NB, NB, mycol, 0, npcol);
+
+  if((mp <= 0) || (nq <= 0)) return;
+
+  /*
+   * Compute multiplier/adder for various jumps in random sequence
+   */
+  const int jump1 = 1;
+  const int jump2 = nprow * NB;
+  const int jump3 = M;
+  const int jump4 = npcol * NB;
+  const int jump5 = NB;
+  const int jump6 = mycol;
+  const int jump7 = myrow * NB;
+
+  uint64_t startrand;
+  uint64_t rjumpA, rblkjumpA, cjumpA, cblkjumpA, ia564;
+  uint64_t rjumpC, rblkjumpC, cjumpC, cblkjumpC, ic564;
+  uint64_t itmp164, itmp264, itmp364;
+
+  /* Compute different jump coefficients */
+  HPL_xjumpm(jump1, mult64, iadd64, jseed64, startrand, rjumpA, rjumpC);
+  HPL_xjumpm(jump2, mult64, iadd64, startrand, itmp164, rblkjumpA, rblkjumpC);
+  HPL_xjumpm(jump3, mult64, iadd64, startrand, itmp164, cjumpA, cjumpC);
+  HPL_xjumpm(jump4, cjumpA, cjumpC, startrand, itmp164, cblkjumpA, cblkjumpC);
+
+  /* Shift the starting random value for this rank */
+  HPL_xjumpm(jump5, cjumpA, cjumpC, startrand, itmp164, ia564, ic564);
+  HPL_xjumpm(jump6, ia564, ic564, startrand, itmp364, itmp164, itmp264);
+  HPL_xjumpm(jump7, mult64, iadd64, itmp364, startrand, itmp164, itmp264);
+
+  /*
+   * Local number of blocks
+   */
+  const int mblks = (mp + NB - 1) / NB;
+  const int nblks = (nq + NB - 1) / NB;
+
+  /* Initialize on GPU */
+  dim3 grid = dim3(mblks, nblks);
+  /* (launch configuration reconstructed: one block per NB x NB tile of
+   * BLOCK_SIZE threads; computeStream assumed, matching the stream set
+   * on the rocBLAS handle in HPL_pdmatgen) */
+  hpl_randmat<<<grid, dim3(BLOCK_SIZE), 0, computeStream>>>(mp,
+                                                            nq,
+                                                            NB,
+                                                            LDA,
+                                                            cblkjumpA,
+                                                            cblkjumpC,
+                                                            rblkjumpA,
+                                                            rblkjumpC,
+                                                            cjumpA,
+                                                            cjumpC,
+                                                            rjumpA,
+                                                            rjumpC,
+                                                            startrand,
+                                                            A);
+
+  hipDeviceSynchronize();
+}
diff --git a/src/matgen/HPL_xjumpm.cpp b/src/matgen/HPL_xjumpm.cpp
new file mode 100644
index 0000000..f1cd377
--- /dev/null
+++ b/src/matgen/HPL_xjumpm.cpp
@@ -0,0 +1,92 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_xjumpm(const int JUMPM,
+                const uint64_t MULT,
+                const uint64_t IADD,
+                const uint64_t IRANN,
+                uint64_t& IRANM,
+                uint64_t& IAM,
+                uint64_t& ICM) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_xjumpm computes the constants A and C to jump JUMPM numbers in
+   * the random sequence: X(n+JUMPM) = A*X(n)+C. The constants encoded in
+   * MULT and IADD specify how to jump from one entry in the sequence to
+   * the next.
+   *
+   * Arguments
+   * =========
+   *
+   * JUMPM (local input) const int
+   * On entry, JUMPM specifies the number of entries in the
+   * sequence to jump over. When JUMPM is less than or equal to
+   * zero, A and C are not computed, and IRANM is set to IRANN,
+   * corresponding to a jump of size zero.
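+ * (Note: for JUMPM > 0 the loop below accumulates
+ * IAM = MULT^JUMPM and ICM = (MULT^(JUMPM-1) + ... + MULT + 1) * IADD,
+ * both modulo 2^64 via the natural uint64_t wrap-around, so that
+ * X(n+JUMPM) = IAM * X(n) + ICM.)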
+ *
+ * MULT (local input) uint64_t
+ * On entry, MULT is the 64-bit multiplier a used to jump from
+ * X(n) to X(n+1) = a*X(n) + c in the random sequence.
+ *
+ * IADD (local input) uint64_t
+ * On entry, IADD is the 64-bit addend c used to jump from
+ * X(n) to X(n+1) = a*X(n) + c in the random sequence.
+ *
+ * IRANN (local input) uint64_t
+ * On entry, IRANN is the 64-bit encoding of X(n).
+ *
+ * IRANM (local output) uint64_t
+ * On exit, IRANM contains the 64-bit encoding of X(n+JUMPM).
+ *
+ * IAM (local output) uint64_t
+ * On exit, when JUMPM is greater than zero, IAM contains the
+ * 64-bit constant A to jump from X(n) to X(n+JUMPM) in the
+ * random sequence. When JUMPM is less than or equal to zero,
+ * IAM is not referenced.
+ *
+ * ICM (local output) uint64_t
+ * On exit, when JUMPM is greater than zero, ICM contains the
+ * 64-bit constant C to jump from X(n) to X(n+JUMPM) in the
+ * random sequence. When JUMPM is less than or equal to zero,
+ * ICM is not referenced.
+ *
+ * ---------------------------------------------------------------------
+ */
+  if(JUMPM > 0) {
+    IAM = MULT;
+    ICM = IADD;
+    for(int k = 1; k <= JUMPM - 1; k++) {
+      IAM *= MULT;
+      ICM = ICM * MULT + IADD;
+    }
+    IRANM = IRANN * IAM + ICM;
+  } else {
+    IRANM = IRANN;
+  }
+}
diff --git a/src/panel/HPL_pdpanel_SendToDevice.cpp b/src/panel/HPL_pdpanel_SendToDevice.cpp
new file mode 100644
index 0000000..30f7e12
--- /dev/null
+++ b/src/panel/HPL_pdpanel_SendToDevice.cpp
@@ -0,0 +1,216 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_unroll_ipiv(const int mp, + const int jb, + int* ipiv, + int* ipiv_ex, + int* upiv) { + + for(int i = 0; i < mp; i++) { upiv[i] = i; } // initialize ids + for(int i = 0; i < jb; i++) { // swap ids + int id = upiv[i]; + upiv[i] = upiv[ipiv[i]]; + upiv[ipiv[i]] = id; + } + + for(int i = 0; i < jb; i++) { ipiv_ex[i] = -1; } + + int cnt = 0; + for(int i = jb; i < mp; i++) { // find swapped ids outside of panel + if(upiv[i] < jb) { ipiv_ex[upiv[i]] = i; } + } +} + +void HPL_pdpanel_SendToDevice(HPL_T_panel* PANEL) { + double *A, *dA; + int jb, i, ml2; + + jb = PANEL->jb; + + if(jb <= 0) return; + + // only the root column copies to device + if(PANEL->grid->mycol == PANEL->pcol) { + + if(PANEL->grid->nprow == 1) { + + // unroll pivoting and send to device now + int* ipiv = PANEL->ipiv; + int* ipiv_ex = PANEL->ipiv + jb; + int* upiv = PANEL->IWORK + jb; // scratch space + + for(i = 0; i < jb; i++) { ipiv[i] -= PANEL->ii; } // shift + HPL_unroll_ipiv(PANEL->mp, jb, ipiv, ipiv_ex, upiv); + + int* dipiv = PANEL->dipiv; + int* dipiv_ex = PANEL->dipiv + jb; + + hipMemcpy2DAsync(dipiv, + jb * sizeof(int), + upiv, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + hipMemcpy2DAsync(dipiv_ex, + jb * sizeof(int), + ipiv_ex, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + + } else { + + int k = (int)((unsigned int)(jb) << 1); + int* iflag = PANEL->IWORK; + int* ipl = iflag + 1; + int* ipID = ipl + 1; + int* ipA = ipID + ((unsigned int)(k) << 1); + int* iplen = ipA + 1; + int* ipmap = iplen + PANEL->grid->nprow + 1; + int* ipmapm1 = ipmap + PANEL->grid->nprow; + int* upiv = ipmapm1 + PANEL->grid->nprow; + int* iwork = upiv + PANEL->mp; + + int* lindxU = PANEL->lindxU; + int* lindxA = PANEL->lindxA; + int* lindxAU = PANEL->lindxAU; + int* permU = PANEL->permU; + int* permU_ex = permU + jb; + int* ipiv = PANEL->ipiv; + + int* dlindxU = PANEL->dlindxU; + int* dlindxA = PANEL->dlindxA; + int* dlindxAU = PANEL->dlindxAU; + int* dpermU = PANEL->dpermU; + int* dpermU_ex = dpermU + jb; + int* dipiv = PANEL->dipiv; + + if(*iflag == -1) /* no index arrays have been computed so far */ + { + HPL_pipid(PANEL, ipl, ipID); + HPL_plindx(PANEL, + *ipl, + ipID, + ipA, + lindxU, + lindxAU, + lindxA, + iplen, + permU, + iwork); + *iflag = 1; + } + + int N = Mmax(*ipA, jb); + if(N > 0) { + hipMemcpy2DAsync(dlindxA, + k * sizeof(int), + lindxA, + k * sizeof(int), + N * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + hipMemcpy2DAsync(dlindxAU, + k * sizeof(int), + lindxAU, + k * sizeof(int), + N * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + } + + hipMemcpyAsync( + dlindxU, lindxU, jb * sizeof(int), hipMemcpyHostToDevice, dataStream); + + hipMemcpy2DAsync(dpermU, + jb * sizeof(int), + permU, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + + // send the ipivs along with L2 in the Bcast + hipMemcpy2DAsync(dipiv, + jb * sizeof(int), + ipiv, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + } + } + + // copy A and/or L2 + if(PANEL->grid->mycol == PANEL->pcol) { + // copy L1 + hipMemcpy2DAsync(PANEL->dL1, + jb * sizeof(double), + PANEL->L1, + jb * sizeof(double), + jb * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + + if(PANEL->grid->npcol > 1) { // L2 is its own array + 
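+      // When npcol > 1, L2 lives in its own workspace: the root row
+      // keeps the top jb x jb block of the panel in dA and copies only
+      // the mp-jb rows below it into dL2, while every other process
+      // row in this column copies all mp of its local panel rows into
+      // dL2 (see the two branches below).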
if(PANEL->grid->myrow == PANEL->prow) { + hipMemcpy2DAsync(Mptr(PANEL->dA, 0, -jb, PANEL->dlda), + PANEL->dlda * sizeof(double), + Mptr(PANEL->A, 0, 0, PANEL->lda), + PANEL->lda * sizeof(double), + jb * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + + if((PANEL->mp - jb) > 0) + hipMemcpy2DAsync(PANEL->dL2, + PANEL->dldl2 * sizeof(double), + Mptr(PANEL->A, jb, 0, PANEL->lda), + PANEL->lda * sizeof(double), + (PANEL->mp - jb) * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + } else { + if((PANEL->mp) > 0) + hipMemcpy2DAsync(PANEL->dL2, + PANEL->dldl2 * sizeof(double), + Mptr(PANEL->A, 0, 0, PANEL->lda), + PANEL->lda * sizeof(double), + PANEL->mp * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + } + } else { + if(PANEL->mp > 0) + hipMemcpy2DAsync(Mptr(PANEL->dA, 0, -jb, PANEL->dlda), + PANEL->dlda * sizeof(double), + Mptr(PANEL->A, 0, 0, PANEL->lda), + PANEL->lda * sizeof(double), + PANEL->mp * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + } + } +} diff --git a/src/panel/HPL_pdpanel_SendToHost.cpp b/src/panel/HPL_pdpanel_SendToHost.cpp new file mode 100644 index 0000000..e8a496f --- /dev/null +++ b/src/panel/HPL_pdpanel_SendToHost.cpp @@ -0,0 +1,28 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_pdpanel_SendToHost(HPL_T_panel* PANEL) { + int jb; + + jb = PANEL->jb; + + if((PANEL->grid->mycol != PANEL->pcol) || (jb <= 0)) return; + + if(PANEL->mp > 0) + hipMemcpy2DAsync(PANEL->A, + PANEL->lda * sizeof(double), + PANEL->dA, + PANEL->dlda * sizeof(double), + PANEL->mp * sizeof(double), + jb, + hipMemcpyDeviceToHost, + dataStream); +} diff --git a/src/panel/HPL_pdpanel_bcast.cpp b/src/panel/HPL_pdpanel_bcast.cpp new file mode 100644 index 0000000..32b0bf3 --- /dev/null +++ b/src/panel/HPL_pdpanel_bcast.cpp @@ -0,0 +1,56 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_pdpanel_bcast(HPL_T_panel* PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdpanel_bcast broadcasts the current panel. Successful completion + * is indicated by a return code of HPL_SUCCESS. + * + * Arguments + * ========= + * + * PANEL (input/output) HPL_T_panel * + * On entry, PANEL points to the current panel data structure + * being broadcast. 
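+ *
+ * Note: the broadcast is a single HPL_bcast of PANEL->dL2 of length
+ * PANEL->len. That buffer is laid out by HPL_pdpanel_init as
+ * [ L2 | L1 | pivot integer arrays ], i.e. len = ml2*jb + jb*jb + lpiv,
+ * so one message carries everything the other process columns need
+ * for the update.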
+ * + * --------------------------------------------------------------------- + */ + + if(PANEL == NULL) { return HPL_SUCCESS; } + if(PANEL->grid->npcol <= 1) { return HPL_SUCCESS; } + + MPI_Comm comm = PANEL->grid->row_comm; + int root = PANEL->pcol; + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_LBCAST); +#endif + /* + * Single Bcast call + */ + int err = HPL_bcast(PANEL->dL2, PANEL->len, root, comm, PANEL->algo->btopo); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_LBCAST); +#endif + + return err; +} diff --git a/src/panel/HPL_pdpanel_disp.cpp b/src/panel/HPL_pdpanel_disp.cpp new file mode 100644 index 0000000..cd589e0 --- /dev/null +++ b/src/panel/HPL_pdpanel_disp.cpp @@ -0,0 +1,48 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_pdpanel_disp(HPL_T_panel** PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdpanel_disp deallocates the panel structure and resources and + * stores the error code returned by the panel factorization. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * * + * On entry, PANEL points to the address of the panel data + * structure to be deallocated. + * + * --------------------------------------------------------------------- + */ + + int mpierr; + + /* + * Deallocate the panel resources and panel structure + */ + (*PANEL)->free_work_now = 1; + mpierr = HPL_pdpanel_free(*PANEL); + if(*PANEL) free(*PANEL); + *PANEL = NULL; + + return (mpierr); +} diff --git a/src/panel/HPL_pdpanel_free.cpp b/src/panel/HPL_pdpanel_free.cpp new file mode 100644 index 0000000..dcdc9b8 --- /dev/null +++ b/src/panel/HPL_pdpanel_free.cpp @@ -0,0 +1,56 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +int HPL_pdpanel_free(HPL_T_panel* PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdpanel_free deallocates the panel resources and stores the error + * code returned by the panel factorization. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the panel data structure from + * which the resources should be deallocated. 
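+ *
+ * Note: unless PANEL->free_work_now is set to 1 (as HPL_pdpanel_disp
+ * does before calling this routine), the L, U, integer and pdfact
+ * workspaces are kept allocated so that they can be reused by the
+ * next panel; the max_*_size bookkeeping only grows them when a
+ * later panel needs more room.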
+ *
+ * ---------------------------------------------------------------------
+ */
+
+  if(PANEL->pmat->info == 0) PANEL->pmat->info = *(PANEL->DINFO);
+
+  if(PANEL->free_work_now == 1) {
+
+    if(PANEL->dLWORK) hipFree(PANEL->dLWORK);
+    if(PANEL->dUWORK) hipFree(PANEL->dUWORK);
+    if(PANEL->LWORK) hipHostFree(PANEL->LWORK);
+    if(PANEL->UWORK) hipHostFree(PANEL->UWORK);
+
+    PANEL->max_lwork_size = 0;
+    PANEL->max_uwork_size = 0;
+
+    if(PANEL->IWORK) free(PANEL->IWORK);
+    if(PANEL->fWORK) free(PANEL->fWORK);
+
+    PANEL->max_iwork_size = 0;
+    PANEL->max_fwork_size = 0;
+  }
+
+  return (HPL_SUCCESS);
+}
diff --git a/src/panel/HPL_pdpanel_init.cpp b/src/panel/HPL_pdpanel_init.cpp
new file mode 100644
index 0000000..584f7f4
--- /dev/null
+++ b/src/panel/HPL_pdpanel_init.cpp
@@ -0,0 +1,475 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <unistd.h> // sysconf
+
+static int Malloc(HPL_T_grid* GRID, void** ptr, const size_t bytes) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  unsigned long pg_size = sysconf(_SC_PAGESIZE);
+  int err = posix_memalign(ptr, pg_size, bytes);
+
+  /*Check workspace allocation is valid*/
+  if(err != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int hostMalloc(HPL_T_grid* GRID, void** ptr, const size_t bytes) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipHostMalloc(ptr, bytes);
+
+  /*Check workspace allocation is valid*/
+  if(err != hipSuccess) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int deviceMalloc(HPL_T_grid* GRID, void** ptr, const size_t bytes) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipMalloc(ptr, bytes);
+
+  /*Check workspace allocation is valid*/
+  if(err != hipSuccess) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+void HPL_pdpanel_init(HPL_T_grid* GRID,
+                      HPL_T_palg* ALGO,
+                      const int M,
+                      const int N,
+                      const int JB,
+                      HPL_T_pmat* A,
+                      const int IA,
+                      const int JA,
+                      const int TAG,
+                      HPL_T_panel* PANEL) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdpanel_init initializes a panel data structure.
+   *
+   * Arguments
+   * =========
+   *
+   * GRID (local input) HPL_T_grid *
+   * On entry, GRID points to the data structure containing the
+   * process grid information.
+   *
+   * ALGO (global input) HPL_T_palg *
+   * On entry, ALGO points to the data structure containing the
+   * algorithmic parameters.
+   *
+   * M (local input) const int
+   * On entry, M specifies the global number of rows of the panel.
+   * M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the global number of columns of the
+   * panel and trailing submatrix. N must be at least zero.
+   *
+   * JB (global input) const int
+   * On entry, JB specifies the number of columns of the panel.
+   * JB must be at least zero.
+ * + * A (local input/output) HPL_T_pmat * + * On entry, A points to the data structure containing the local + * array information. + * + * IA (global input) const int + * On entry, IA is the global row index identifying the panel + * and trailing submatrix. IA must be at least zero. + * + * JA (global input) const int + * On entry, JA is the global column index identifying the panel + * and trailing submatrix. JA must be at least zero. + * + * TAG (global input) const int + * On entry, TAG is the row broadcast message id. + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * --------------------------------------------------------------------- + */ + + size_t dalign; + int icurcol, icurrow, ii, itmp1, jj, lwork, uwork, ml2, mp, mycol, myrow, nb, + npcol, nprow, nq, nu, ldu; + + PANEL->grid = GRID; /* ptr to the process grid */ + PANEL->algo = ALGO; /* ptr to the algo parameters */ + PANEL->pmat = A; /* ptr to the local array info */ + + myrow = GRID->myrow; + mycol = GRID->mycol; + nprow = GRID->nprow; + npcol = GRID->npcol; + nb = A->nb; + + HPL_infog2l(IA, + JA, + nb, + nb, + nb, + nb, + 0, + 0, + myrow, + mycol, + nprow, + npcol, + &ii, + &jj, + &icurrow, + &icurcol); + mp = HPL_numrocI(M, IA, nb, nb, myrow, 0, nprow); + nq = HPL_numrocI(N, JA, nb, nb, mycol, 0, npcol); + + const int inxtcol = MModAdd1(icurcol, npcol); + const int inxtrow = MModAdd1(icurrow, nprow); + + /* ptr to trailing part of A */ + PANEL->A = A->A; + PANEL->dA = Mptr((double*)(A->dA), ii, jj, A->ld); + + /* + * Workspace pointers are initialized to NULL. + */ + PANEL->L2 = nullptr; + PANEL->dL2 = nullptr; + PANEL->L1 = nullptr; + PANEL->dL1 = nullptr; + PANEL->DINFO = nullptr; + PANEL->U = nullptr; + PANEL->dU = nullptr; + PANEL->W = nullptr; + PANEL->dW = nullptr; + PANEL->U1 = nullptr; + PANEL->dU1 = nullptr; + PANEL->W1 = nullptr; + PANEL->dW1 = nullptr; + PANEL->U2 = nullptr; + PANEL->dU2 = nullptr; + PANEL->W2 = nullptr; + PANEL->dW2 = nullptr; + // PANEL->WORK = NULL; + // PANEL->IWORK = NULL; + /* + * Local lengths, indexes process coordinates + */ + PANEL->nb = nb; /* distribution blocking factor */ + PANEL->jb = JB; /* panel width */ + PANEL->m = M; /* global # of rows of trailing part of A */ + PANEL->n = N; /* global # of cols of trailing part of A */ + PANEL->ia = IA; /* global row index of trailing part of A */ + PANEL->ja = JA; /* global col index of trailing part of A */ + PANEL->mp = mp; /* local # of rows of trailing part of A */ + PANEL->nq = nq; /* local # of cols of trailing part of A */ + PANEL->ii = ii; /* local row index of trailing part of A */ + PANEL->jj = jj; /* local col index of trailing part of A */ + PANEL->lda = A->ld; /* local leading dim of array A */ + PANEL->dlda = A->ld; /* local leading dim of array A */ + PANEL->prow = icurrow; /* proc row owning 1st row of trailing A */ + PANEL->pcol = icurcol; /* proc col owning 1st col of trailing A */ + PANEL->msgid = TAG; /* message id to be used for panel bcast */ + /* + * Initialize ldl2 and len to temporary dummy values and Update tag for + * next panel + */ + PANEL->ldl2 = 0; /* local leading dim of array L2 */ + PANEL->dldl2 = 0; /* local leading dim of array L2 */ + PANEL->len = 0; /* length of the buffer to broadcast */ + PANEL->nu0 = 0; + PANEL->nu1 = 0; + PANEL->nu2 = 0; + PANEL->ldu0 = 0; + PANEL->ldu1 = 0; + PANEL->ldu2 = 0; + + /* + * Figure out the exact amount of workspace needed by the factorization + * and the update - Allocate that 
space - Finish the panel data structure initialization.
+ *
+ * L1:    JB x JB in all processes
+ * DINFO: 1 in all processes
+ *
+ * We also make an array of necessary integers for swaps in the update.
+ *
+ * If nprow is 1, we just allocate an array of 2*JB integers for the swap.
+ * When nprow > 1, we allocate the space for the index arrays immediate-
+ * ly. The exact size of this array depends on the swapping routine that
+ * will be used, so we allocate the maximum:
+ *
+ * lindxU   is of size         JB +
+ * lindxA   is of size at most JB +
+ * lindxAU  is of size at most JB +
+ * permU    is of size at most JB +
+ *
+ * ipiv     is of size at most JB
+ *
+ * that is 5*JB.
+ *
+ * We make sure that those three arrays are contiguous in memory for the
+ * later panel broadcast (using type punning to put the integer array at
+ * the end). We also choose to put this amount of space right after
+ * L2 (when it exists) so that one can receive a contiguous buffer.
+ */
+
+  /*Split fraction*/
+  const double fraction = ALGO->frac;
+
+  dalign      = ALGO->align * sizeof(double);
+  size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double));
+
+  if(npcol > 1) {
+    ml2 = (myrow == icurrow ? mp - JB : mp);
+    ml2 = Mmax(0, ml2);
+    ml2 = ((ml2 + 95) / 128) * 128 + 32; /*pad*/
+  } else {
+    ml2 = 0; // L2 is aliased inside A
+  }
+
+  /* Size of LBcast message */
+  PANEL->len = ml2 * JB + JB * JB + lpiv; // L2, L1, integer arrays
+
+  /* space for L */
+  lwork = PANEL->len + 1;
+
+  nu  = Mmax(0, (mycol == icurcol ? nq - JB : nq));
+  ldu = nu + JB + 256; /*extra space for potential padding*/
+
+  /* space for U */
+  uwork = JB * ldu;
+
+  if(PANEL->max_lwork_size < (size_t)(lwork) * sizeof(double)) {
+    if(PANEL->LWORK) {
+      hipFree(PANEL->dLWORK);
+      free(PANEL->LWORK);
+    }
+    // size_t numbytes = (((size_t)((size_t)(lwork) * sizeof( double )) +
+    // (size_t)4095)/(size_t)4096)*(size_t)4096;
+    size_t numbytes = (size_t)(lwork) * sizeof(double);
+
+    if(deviceMalloc(GRID, (void**)&(PANEL->dLWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Device memory allocation failed for L workspace.");
+    }
+    if(hostMalloc(GRID, (void**)&(PANEL->LWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for L workspace.");
+    }
+
+    PANEL->max_lwork_size = (size_t)(lwork) * sizeof(double);
+  }
+  if(PANEL->max_uwork_size < (size_t)(uwork) * sizeof(double)) {
+    if(PANEL->UWORK) {
+      hipFree(PANEL->dUWORK);
+      free(PANEL->UWORK);
+    }
+    // size_t numbytes = (((size_t)((size_t)(uwork) * sizeof( double )) +
+    // (size_t)4095)/(size_t)4096)*(size_t)4096;
+    size_t numbytes = (size_t)(uwork) * sizeof(double);
+
+    if(deviceMalloc(GRID, (void**)&(PANEL->dUWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Device memory allocation failed for U workspace.");
+    }
+    if(hostMalloc(GRID, (void**)&(PANEL->UWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for U workspace.");
+    }
+
+    PANEL->max_uwork_size = (size_t)(uwork) * sizeof(double);
+  }
+
+  /*
+   * Initialize the pointers of the panel structure
+   */
+  if(npcol == 1) {
+    PANEL->L2  = PANEL->A + (myrow == icurrow ? JB : 0);
+    PANEL->dL2 = PANEL->dA + (myrow == icurrow ?
JB : 0); + PANEL->ldl2 = A->ld; + PANEL->dldl2 = A->ld; /*L2 is aliased inside A*/ + + PANEL->L1 = (double*)PANEL->LWORK; + PANEL->dL1 = (double*)PANEL->dLWORK; + } else { + PANEL->L2 = (double*)PANEL->LWORK; + PANEL->dL2 = (double*)PANEL->dLWORK; + PANEL->ldl2 = Mmax(0, ml2); + PANEL->dldl2 = Mmax(0, ml2); + + PANEL->L1 = PANEL->L2 + ml2 * JB; + PANEL->dL1 = PANEL->dL2 + ml2 * JB; + } + + PANEL->U = (double*)PANEL->UWORK; + PANEL->dU = (double*)PANEL->dUWORK; + PANEL->W = A->W; + PANEL->dW = A->dW; + + if(nprow == 1) { + PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; + PANEL->ldu0 = PANEL->nu0; + + PANEL->nu1 = 0; + PANEL->ldu1 = 0; + + PANEL->nu2 = nu - PANEL->nu0; + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ + + PANEL->U2 = PANEL->U + JB * JB; + PANEL->dU2 = PANEL->dU + JB * JB; + PANEL->U1 = PANEL->U2 + PANEL->ldu2 * JB; + PANEL->dU1 = PANEL->dU2 + PANEL->ldu2 * JB; + + PANEL->permU = (int*)(PANEL->L1 + JB * JB); + PANEL->dpermU = (int*)(PANEL->dL1 + JB * JB); + PANEL->ipiv = PANEL->permU + JB; + PANEL->dipiv = PANEL->dpermU + JB; + + PANEL->DINFO = (double*)(PANEL->ipiv + 2 * JB); + PANEL->dDINFO = (double*)(PANEL->dipiv + 2 * JB); + } else { + const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb)); + PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; + PANEL->ldu0 = PANEL->nu0; + + PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit); + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ + + PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2; + PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/ + + PANEL->U2 = PANEL->U + JB * JB; + PANEL->dU2 = PANEL->dU + JB * JB; + PANEL->U1 = PANEL->U2 + PANEL->ldu2 * JB; + PANEL->dU1 = PANEL->dU2 + PANEL->ldu2 * JB; + + PANEL->W2 = PANEL->W + JB * JB; + PANEL->dW2 = PANEL->dW + JB * JB; + PANEL->W1 = PANEL->W2 + PANEL->ldu2 * JB; + PANEL->dW1 = PANEL->dW2 + PANEL->ldu2 * JB; + + PANEL->lindxA = (int*)(PANEL->L1 + JB * JB); + PANEL->dlindxA = (int*)(PANEL->dL1 + JB * JB); + PANEL->lindxAU = PANEL->lindxA + JB; + PANEL->dlindxAU = PANEL->dlindxA + JB; + PANEL->lindxU = PANEL->lindxAU + JB; + PANEL->dlindxU = PANEL->dlindxAU + JB; + PANEL->permU = PANEL->lindxU + JB; + PANEL->dpermU = PANEL->dlindxU + JB; + + // Put ipiv array at the end + PANEL->ipiv = PANEL->permU + JB; + PANEL->dipiv = PANEL->dpermU + JB; + + PANEL->DINFO = ((double*)PANEL->lindxA) + lpiv; + PANEL->dDINFO = ((double*)PANEL->dlindxA) + lpiv; + } + + *(PANEL->DINFO) = 0.0; + + /* + * If nprow is 1, we just allocate an array of JB integers to store the + * pivot IDs during factoring, and a scratch array of mp integers. + * When nprow > 1, we allocate the space for the index arrays immediate- + * ly. The exact size of this array depends on the swapping routine that + * will be used, so we allocate the maximum: + * + * IWORK[0] is of size at most 1 + + * IPL is of size at most 1 + + * IPID is of size at most 4 * JB + + * IPIV is of size at most JB + + * SCRATCH is of size at most MP + * + * ipA is of size at most 1 + + * iplen is of size at most NPROW + 1 + + * ipcounts is of size at most NPROW + + * ioffsets is of size at most NPROW + + * iwork is of size at most MAX( 2*JB, NPROW+1 ). + * + * that is mp + 4 + 5*JB + 3*NPROW + MAX( 2*JB, NPROW+1 ). 
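+ *
+ * As an illustrative example (hypothetical numbers): with mp = 46080,
+ * JB = 512 and NPROW = 4 this amounts to
+ * 46080 + 4 + 5*512 + 3*4 + max(1024, 5) = 49680 integers.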
+ *
+ * We use the first entry of this work array to indicate whether the
+ * local index arrays have already been computed, and if yes, by
+ * which function:
+ * IWORK[0] = -1: no index arrays have been computed so far;
+ * IWORK[0] = 1: HPL_pdlaswp already computed those arrays;
+ * This allows us to save some redundant computations.
+ */
+  if(nprow == 1) {
+    lwork = mp + JB;
+  } else {
+    itmp1 = (JB << 1);
+    lwork = nprow + 1;
+    itmp1 = Mmax(itmp1, lwork);
+    lwork = mp + 4 + (5 * JB) + (3 * nprow) + itmp1;
+  }
+
+  if(PANEL->max_iwork_size < (size_t)(lwork) * sizeof(int)) {
+    if(PANEL->IWORK) { free(PANEL->IWORK); }
+    size_t numbytes = (size_t)(lwork) * sizeof(int);
+
+    if(Malloc(GRID, (void**)&(PANEL->IWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for integer workspace.");
+    }
+    PANEL->max_iwork_size = (size_t)(lwork) * sizeof(int);
+  }
+
+  if(lwork) *(PANEL->IWORK) = -1;
+
+  /*Finally, we need 2*(4 + 2*JB) entries of scratch for pdfact */
+  lwork = (size_t)(((4 + ((unsigned int)(JB) << 1)) << 1));
+  if(PANEL->max_fwork_size < (size_t)(lwork) * sizeof(double)) {
+    if(PANEL->fWORK) { free(PANEL->fWORK); }
+    size_t numbytes = (size_t)(lwork) * sizeof(double);
+
+    if(Malloc(GRID, (void**)&(PANEL->fWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for pdfact scratch workspace.");
+    }
+    PANEL->max_fwork_size = (size_t)(lwork) * sizeof(double);
+  }
+}
diff --git a/src/panel/HPL_pdpanel_new.cpp b/src/panel/HPL_pdpanel_new.cpp
new file mode 100644
index 0000000..f790182
--- /dev/null
+++ b/src/panel/HPL_pdpanel_new.cpp
@@ -0,0 +1,105 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_pdpanel_new(HPL_T_grid* GRID,
+                     HPL_T_palg* ALGO,
+                     const int M,
+                     const int N,
+                     const int JB,
+                     HPL_T_pmat* A,
+                     const int IA,
+                     const int JA,
+                     const int TAG,
+                     HPL_T_panel** PANEL) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdpanel_new creates and initializes a panel data structure.
+   *
+   * Arguments
+   * =========
+   *
+   * GRID (local input) HPL_T_grid *
+   * On entry, GRID points to the data structure containing the
+   * process grid information.
+   *
+   * ALGO (global input) HPL_T_palg *
+   * On entry, ALGO points to the data structure containing the
+   * algorithmic parameters.
+   *
+   * M (local input) const int
+   * On entry, M specifies the global number of rows of the panel.
+   * M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the global number of columns of the
+   * panel and trailing submatrix. N must be at least zero.
+   *
+   * JB (global input) const int
+   * On entry, JB specifies the number of columns of the panel.
+   * JB must be at least zero.
+   *
+   * A (local input/output) HPL_T_pmat *
+   * On entry, A points to the data structure containing the local
+   * array information.
+ * + * IA (global input) const int + * On entry, IA is the global row index identifying the panel + * and trailing submatrix. IA must be at least zero. + * + * JA (global input) const int + * On entry, JA is the global column index identifying the panel + * and trailing submatrix. JA must be at least zero. + * + * TAG (global input) const int + * On entry, TAG is the row broadcast message id. + * + * PANEL (local input/output) HPL_T_panel * * + * On entry, PANEL points to the address of the panel data + * structure to create and initialize. + * + * --------------------------------------------------------------------- + */ + + HPL_T_panel* p = NULL; + + /* + * Allocate the panel structure - Check for enough memory + */ + if(!(p = (HPL_T_panel*)malloc(sizeof(HPL_T_panel)))) { + HPL_pabort(__LINE__, "HPL_pdpanel_new", "Memory allocation failed"); + } + + p->max_pinned_work_size = 0; + p->max_lwork_size = 0; + p->max_uwork_size = 0; + p->max_iwork_size = 0; + p->max_fwork_size = 0; + p->free_work_now = 0; + p->A = NULL; + p->LWORK = NULL; + p->dLWORK = NULL; + p->UWORK = NULL; + p->dUWORK = NULL; + p->fWORK = NULL; + p->IWORK = NULL; + HPL_pdpanel_init(GRID, ALGO, M, N, JB, A, IA, JA, TAG, p); + *PANEL = p; +} diff --git a/src/panel/HPL_pdpanel_wait.cpp b/src/panel/HPL_pdpanel_wait.cpp new file mode 100644 index 0000000..5bc6bcf --- /dev/null +++ b/src/panel/HPL_pdpanel_wait.cpp @@ -0,0 +1,22 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_pdpanel_Wait(HPL_T_panel* PANEL) { + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_COPY); +#endif + // Wait for panel + hipStreamSynchronize(dataStream); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_COPY); +#endif +} diff --git a/src/pauxil/HPL_dlaswp00N_device.cpp b/src/pauxil/HPL_dlaswp00N_device.cpp new file mode 100644 index 0000000..8819a82 --- /dev/null +++ b/src/pauxil/HPL_dlaswp00N_device.cpp @@ -0,0 +1,111 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define BLOCK_SIZE 512
+
+__global__ void dlaswp00N(const int N,
+                          const int M,
+                          double* __restrict__ A,
+                          const int LDA,
+                          const int* __restrict__ IPIV) {
+
+  __shared__ double s_An_init[2048];
+  __shared__ double s_An_ipiv[2048];
+
+  const int m = threadIdx.x;
+  const int n = blockIdx.x;
+
+  // read in block column
+  for(int i = m; i < M; i += blockDim.x)
+    s_An_init[i] = A[i + n * ((size_t)LDA)];
+
+  __syncthreads();
+
+  // local block
+  for(int i = m; i < M; i += blockDim.x) {
+    const int ip = IPIV[i];
+
+    if(ip < M) { // local swap
+      s_An_ipiv[i] = s_An_init[ip];
+    } else { // non local swap
+      s_An_ipiv[i] = A[ip + n * ((size_t)LDA)];
+    }
+  }
+  __syncthreads();
+
+  // write out local block
+  for(int i = m; i < M; i += blockDim.x)
+    A[i + n * ((size_t)LDA)] = s_An_ipiv[i];
+
+  // remaining swaps in column
+  for(int i = m; i < M; i += blockDim.x) {
+    const int ip_ex = IPIV[i + M];
+
+    if(ip_ex > -1) { A[ip_ex + n * ((size_t)LDA)] = s_An_init[i]; }
+  }
+}
+
+void HPL_dlaswp00N(const int M,
+                   const int N,
+                   double* A,
+                   const int LDA,
+                   const int* IPIV) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_dlaswp00N performs a series of local row interchanges on a matrix
+   * A. One row interchange is initiated for rows 0 through M-1 of A.
+   *
+   * Arguments
+   * =========
+   *
+   * M (local input) const int
+   * On entry, M specifies the number of rows of the array A to be
+   * interchanged. M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the number of columns of the array A.
+   * N must be at least zero.
+   *
+   * A (local input/output) double *
+   * On entry, A points to an array of dimension (LDA,N) to which
+   * the row interchanges will be applied. On exit, the permuted
+   * matrix.
+   *
+   * LDA (local input) const int
+   * On entry, LDA specifies the leading dimension of the array A.
+   * LDA must be at least MAX(1,M).
+   *
+   * IPIV (local input) const int *
+   * On entry, IPIV is an array of size 2*M that contains the
+   * pivoting information: for k in [0..M), IPIV[k]=IROFF + l
+   * implies that local rows k and l are to be interchanged, and
+   * the next M entries hold the out-of-block destination rows
+   * (-1 when unused), as assembled in HPL_pdpanel_SendToDevice.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  if((M <= 0) || (N <= 0)) return;
+
+  hipStream_t stream;
+  rocblas_get_stream(handle, &stream);
+
+  int grid_size = N;
+  dlaswp00N<<<grid_size, dim3(BLOCK_SIZE), 0, stream>>>(N, M, A, LDA, IPIV);
+}
diff --git a/src/pauxil/HPL_dlaswp01T_device.cpp b/src/pauxil/HPL_dlaswp01T_device.cpp
new file mode 100644
index 0000000..d858d47
--- /dev/null
+++ b/src/pauxil/HPL_dlaswp01T_device.cpp
@@ -0,0 +1,135 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+
+/* Build U matrix from rows of A */
+__global__ void dlaswp01T(const int M,
+                          const int N,
+                          double* __restrict__ A,
+                          const int LDA,
+                          double* __restrict__ U,
+                          const int LDU,
+                          const int* __restrict__ LINDXU) {
+
+  __shared__ double s_U[TILE_DIM][TILE_DIM + 1];
+
+  const int m = threadIdx.x + TILE_DIM * blockIdx.x;
+  const int n = threadIdx.y + TILE_DIM * blockIdx.y;
+
+  if(m < M) {
+    const int ipa = LINDXU[m];
+
+    // save in LDS for the moment
+    // possible cache-hits if ipas are close
+    s_U[threadIdx.x][threadIdx.y + 0] =
+        (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 8] =
+        (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 16] =
+        (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 24] =
+        (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
+  }
+
+  __syncthreads();
+
+  const int um = threadIdx.y + TILE_DIM * blockIdx.x;
+  const int un = threadIdx.x + TILE_DIM * blockIdx.y;
+
+  if(un < N) {
+    // write out chunks of U
+    if((um + 0) < M)
+      U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x];
+    if((um + 8) < M)
+      U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x];
+    if((um + 16) < M)
+      U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x];
+    if((um + 24) < M)
+      U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x];
+  }
+}
+
+void HPL_dlaswp01T(const int M,
+                   const int N,
+                   double* A,
+                   const int LDA,
+                   double* U,
+                   const int LDU,
+                   const int* LINDXU) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_dlaswp01T copies scattered rows of A into an array U. The
+   * row offsets in A of the source rows are specified by LINDXU.
+   * Rows of A are stored as columns in U.
+   *
+   * Arguments
+   * =========
+   *
+   * M (local input) const int
+   * On entry, M specifies the number of rows of A that should be
+   * copied into U. M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the length of the rows of A that should
+   * be copied into U. N must be at least zero.
+   *
+   * A (local input/output) double *
+   * On entry, A points to an array of dimension (LDA,N). The rows
+   * of this array specified by LINDXU are copied into U.
+   *
+   * LDA (local input) const int
+   * On entry, LDA specifies the leading dimension of the array A.
+   * LDA must be at least MAX(1,M).
+   *
+   * U (local input/output) double *
+   * On entry, U points to an array of dimension (LDU,M). On exit,
+   * the rows of A specified by LINDXU are stored as columns within
+   * this array U.
+   *
+   * LDU (local input) const int
+   * On entry, LDU specifies the leading dimension of the array U.
+   * LDU must be at least MAX(1,N).
+   *
+   * LINDXU (local input) const int *
+   * On entry, LINDXU is an array of dimension M that contains the
+   * local row indexes of A that should be copied into U.
+   *
+   * ---------------------------------------------------------------------
+   */
+  /*
+   * .. Local Variables ..
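+ *
+ * (Implementation note: the kernel stages 32x32 tiles of the gathered
+ * rows in shared memory so that both the reads from A and the
+ * transposed writes to U are coalesced.)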
+ */ + + if((M <= 0) || (N <= 0)) return; + + dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM); + dim3 block_size(TILE_DIM, BLOCK_ROWS); + dlaswp01T<<<grid_size, block_size>>>( + M, N, A, LDA, U, LDU, LINDXU); + + /* + * End of HPL_dlaswp01T + */ +} diff --git a/src/pauxil/HPL_dlaswp02T_device.cpp b/src/pauxil/HPL_dlaswp02T_device.cpp new file mode 100644 index 0000000..c950596 --- /dev/null +++ b/src/pauxil/HPL_dlaswp02T_device.cpp @@ -0,0 +1,106 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> +#include <cassert> + +#define assertm(exp, msg) assert(((void)msg, exp)) + +/* Perform any local row swaps of A */ +__global__ void dlaswp02T(const int M, + const int N, + double* __restrict__ A, + const int LDA, + const int* __restrict__ LINDXAU, + const int* __restrict__ LINDXA) { + + const int n = blockIdx.x; + const int m = threadIdx.x; + + const int ipau = LINDXAU[m]; // src row + const int ipa = LINDXA[m]; // dst row + + const double An = A[ipau + n * ((size_t)LDA)]; + + __syncthreads(); + + A[ipa + n * ((size_t)LDA)] = An; +} + +void HPL_dlaswp02T(const int M, + const int N, + double* A, + const int LDA, + const int* LINDXAU, + const int* LINDXA) { + /* + * Purpose + * ======= + * + * HPL_dlaswp02T copies scattered rows of A into itself. The row + * offsets in A of the source rows are specified by LINDXAU, and the + * row offsets in A of their destinations are specified by LINDXA. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A that should be + * moved within A. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the length of rows of A that should be + * moved within A. N must be at least zero. + * + * A (local input/output) double * + * On entry, A points to an array of dimension (LDA,N). The rows + * of this array specified by LINDXAU are moved within A. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M). + * + * LINDXAU (local input) const int * + * On entry, LINDXAU is an array of dimension M that contains + * the local row indexes of A that should be moved within A + * (the source rows). + * + * LINDXA (local input) const int * + * On entry, LINDXA is an array of dimension M that contains + * the local row indexes of A where the rows of A should be + * copied to (the destination rows). + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables ..
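+ * + * Implementation note: the kernel above uses one block per column and + * one thread per row; every thread first loads its source entry + * A(LINDXAU[m], n), and the __syncthreads() guarantees that all reads + * in the block complete before any destination A(LINDXA[m], n) is + * written, so overlapping source and destination rows cannot race + * within a column. This is also why M is limited to a single block by + * the assert below.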
+ */ + + if((M <= 0) || (N <= 0)) return; + + assertm(M <= 1024, "NB too large in HPL_dlaswp02T"); + + dim3 grid_size(N); + dim3 block_size(M); + dlaswp02T<<<grid_size, block_size>>>(M, N, A, LDA, LINDXAU, LINDXA); + + /* + * End of HPL_dlaswp02T + */ +} diff --git a/src/pauxil/HPL_dlaswp03T_device.cpp b/src/pauxil/HPL_dlaswp03T_device.cpp new file mode 100644 index 0000000..4264538 --- /dev/null +++ b/src/pauxil/HPL_dlaswp03T_device.cpp @@ -0,0 +1,133 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 + +/* Build W matrix from rows of A */ +__global__ void dlaswp03T(const int M, + const int N, + double* __restrict__ A, + const int LDA, + double* __restrict__ W, + const int LDW, + const int* __restrict__ LINDXU) { + + __shared__ double s_W[TILE_DIM][TILE_DIM + 1]; + + const int m = threadIdx.x + TILE_DIM * blockIdx.x; + const int n = threadIdx.y + TILE_DIM * blockIdx.y; + + if(m < M) { + const int ipa = LINDXU[m]; + + // save in LDS for the moment + // possible cache-hits if ipas are close + s_W[threadIdx.x][threadIdx.y + 0] = + (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 8] = + (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 16] = + (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 24] = + (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0; + } + + __syncthreads(); + + const int wm = threadIdx.y + TILE_DIM * blockIdx.x; + const int wn = threadIdx.x + TILE_DIM * blockIdx.y; + + if(wn < N) { + // write out chunks of W + if((wm + 0) < M) + W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x]; + if((wm + 8) < M) + W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x]; + if((wm + 16) < M) + W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x]; + if((wm + 24) < M) + W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x]; + } +} + +void HPL_dlaswp03T(const int M, + const int N, + double* A, + const int LDA, + double* W, + const int LDW, + const int* LINDXU) { + /* + * Purpose + * ======= + * + * HPL_dlaswp03T packs scattered rows of an array A into workspace W. + * The row offsets in A are specified by LINDXU. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A that should be + * copied into W. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the length of the rows of A that should + * be copied into W. N must be at least zero. + * + * A (local input) double * + * On entry, A points to an array of dimension (LDA,N). The rows + * of this array specified by LINDXU are copied into W. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M).
+ * + * W (local output) double * + * On entry, W points to an array of dimension (LDW,*). On exit, + * W contains the rows of A specified by LINDXU, stored as + * columns. + * + * LDW (local input) const int + * On entry, LDW specifies the leading dimension of the array W. + * LDW must be at least MAX(1,N). + * + * LINDXU (local input) const int * + * On entry, LINDXU is an array of dimension M that contains the + * local row indexes of A that should be copied into W. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + + if((M <= 0) || (N <= 0)) return; + + dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM); + dim3 block_size(TILE_DIM, BLOCK_ROWS); + dlaswp03T<<<grid_size, block_size>>>( + M, N, A, LDA, W, LDW, LINDXU); + + /* + * End of HPL_dlaswp03T + */ +} diff --git a/src/pauxil/HPL_dlaswp04T_device.cpp b/src/pauxil/HPL_dlaswp04T_device.cpp new file mode 100644 index 0000000..ca94a00 --- /dev/null +++ b/src/pauxil/HPL_dlaswp04T_device.cpp @@ -0,0 +1,128 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 + +static __global__ void dlaswp04T(const int M, + const int N, + double* __restrict__ A, + const int LDA, + double* __restrict__ W, + const int LDW, + const int* __restrict__ LINDXU) { + + __shared__ double s_W[TILE_DIM][TILE_DIM + 1]; + + const int am = threadIdx.x + TILE_DIM * blockIdx.x; + const int an = threadIdx.y + TILE_DIM * blockIdx.y; + + const int wm = threadIdx.y + TILE_DIM * blockIdx.x; + const int wn = threadIdx.x + TILE_DIM * blockIdx.y; + + if(wn < N) { + s_W[threadIdx.y + 0][threadIdx.x] = + (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 8][threadIdx.x] = + (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 16][threadIdx.x] = + (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 24][threadIdx.x] = + (wm + 24 < M) ? W[wn + (wm + 24) * ((size_t)LDW)] : 0.0; + } + + __syncthreads(); + + if(am < M) { + const int aip = LINDXU[am]; + if((an + 0) < N) + A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0]; + if((an + 8) < N) + A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8]; + if((an + 16) < N) + A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16]; + if((an + 24) < N) + A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24]; + } +} + +void HPL_dlaswp04T(const int M, + const int N, + double* A, + const int LDA, + double* W, + const int LDW, + const int* LINDXU) { + /* + * Purpose + * ======= + * + * HPL_dlaswp04T writes columns of W into rows of A at positions + * indicated by LINDXU. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A that should be + * replaced with columns of W. M must be at least zero.
+ * + * N (local input) const int + * On entry, N specifies the length of the rows of A that should + * be replaced with columns of W. N must be at least zero. + * + * A (local output) double * + * On entry, A points to an array of dimension (LDA,N). On exit, + * the rows of this array specified by LINDXU are replaced by + * columns of W. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M). + * + * W (local input) double * + * On entry, W points to an array of dimension (LDW,*). This + * array contains the columns of W that are to be written to + * rows of A. + * + * LDW (local input) const int + * On entry, LDW specifies the leading dimension of the array W. + * LDW must be at least MAX(1,N). + * + * LINDXU (local input) const int * + * On entry, LINDXU is an array of dimension M that contains the + * local row indexes of A that should be replaced with columns + * of W. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + + if((M <= 0) || (N <= 0)) return; + + dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM); + dim3 block_size(TILE_DIM, BLOCK_ROWS); + dlaswp04T<<<grid_size, block_size>>>( + M, N, A, LDA, W, LDW, LINDXU); + + /* + * End of HPL_dlaswp04T + */ +} diff --git a/src/pauxil/HPL_dlaswp10N_device.cpp b/src/pauxil/HPL_dlaswp10N_device.cpp new file mode 100644 index 0000000..78a8910 --- /dev/null +++ b/src/pauxil/HPL_dlaswp10N_device.cpp @@ -0,0 +1,91 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define BLOCK_SIZE 512 + +__global__ void dlaswp10N(const int M, + const int N, + double* __restrict__ A, + const int LDA, + const int* __restrict__ IPIV) { + + const int m = threadIdx.x + BLOCK_SIZE * blockIdx.x; + + if(m < M) { + for(int i = 0; i < N; i++) { + const int ip = IPIV[i]; + + if(ip != i) { + // swap + const double Ai = A[m + i * ((size_t)LDA)]; + const double Aip = A[m + ip * ((size_t)LDA)]; + A[m + i * ((size_t)LDA)] = Aip; + A[m + ip * ((size_t)LDA)] = Ai; + } + } + } +} + +void HPL_dlaswp10N(const int M, + const int N, + double* A, + const int LDA, + const int* IPIV) { + /* + * Purpose + * ======= + * + * HPL_dlaswp10N performs a sequence of local column interchanges on a + * matrix A. One column interchange is initiated for columns 0 through + * N-1 of A. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of the array A. M + * must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of the array A. N + * must be at least zero. + * + * A (local input/output) double * + * On entry, A points to an array of dimension (LDA,N). This + * array contains the columns onto which the interchanges should + * be applied. On exit, A contains the permuted matrix. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M).
+ * + * IPIV (local input) const int * + * On entry, IPIV is an array of size N that contains the + * pivoting information: for k in [0..N), IPIV[k]=l implies + * that local columns k and l are to be interchanged. + * + * --------------------------------------------------------------------- + */ + + if((M <= 0) || (N <= 0)) return; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + dim3 grid_size((M + BLOCK_SIZE - 1) / BLOCK_SIZE); + dlaswp10N<<<grid_size, BLOCK_SIZE, 0, stream>>>(M, N, A, LDA, IPIV); +} diff --git a/src/pauxil/HPL_indxg2l.cpp b/src/pauxil/HPL_indxg2l.cpp new file mode 100644 index 0000000..4ae5811 --- /dev/null +++ b/src/pauxil/HPL_indxg2l.cpp @@ -0,0 +1,96 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_indxg2l(const int IG, + const int INB, + const int NB, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxg2l computes the local index of a matrix entry pointed to by + * the global index IG. The returned local index is the same in all + * processes. + * + * Arguments + * ========= + * + * IG (input) const int + * On entry, IG specifies the global index of the matrix entry. + * IG must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix. NB must be larger than one. + * + * SRCPROC (input) const int + * On entry, if SRCPROC = -1, the data is not distributed but + * replicated, in which case this routine returns IG in all + * processes. Otherwise, the value of SRCPROC is ignored. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int i, j; + + if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) + /* + * IG belongs to the first block, or the data is not distributed, or + * there is just one process in this dimension of the grid. + */ + return (IG); + /* + * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, + * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC + * with 0 <= MYROC < NPROCS. The local index to be returned depends on + * whether IG resides in the process owning the first partial block of + * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, + * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. + * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is + * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, + * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that + * j=l and thus (j+1)*NPROCS > i+1.
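+ * + * As a worked example (numbers chosen for illustration): with INB=4, + * NB=2, NPROCS=3 and IG=9, we get i = (9-4)/2 = 2 and j = 2/3 = 0, so + * i+1 = 3 = (j+1)*NPROCS and IG lives in the process owning the first + * partial block (MYROC=0); the return statement below then yields + * IL = NB*(j-i) + IG = -4 + 9 = 5, i.e. global row 9 comes after the + * first block of 4 local rows plus one full local block of 2.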
+ */ + j = (i = (IG - INB) / NB) / NPROCS; + /* + * When IG resides in the process owning the first partial block of size + * INB (MYROC = 0), then the result IL can be written as: + * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB. + * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, + * i.e. l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore + * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. + * + * Otherwise when MYROC >= 1, the result IL can be written as: + * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. + * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, + * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e. + * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. + */ + return (NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG)); +} diff --git a/src/pauxil/HPL_indxg2lp.cpp b/src/pauxil/HPL_indxg2lp.cpp new file mode 100644 index 0000000..e148f83 --- /dev/null +++ b/src/pauxil/HPL_indxg2lp.cpp @@ -0,0 +1,116 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_indxg2lp(int* IL, + int* PROC, + const int IG, + const int INB, + const int NB, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxg2lp computes the local index of a matrix entry pointed to by + * the global index IG as well as the process coordinate which possesses + * this entry. The returned local index is the same in all processes. + * + * Arguments + * ========= + * + * IL (output) int * + * On exit, IL specifies the local index corresponding to IG. IL + * is at least zero. + * + * PROC (output) int * + * On exit, PROC is the coordinate of the process owning the + * entry specified by the global index IG. PROC is at least zero + * and less than NPROCS. + * + * IG (input) const int + * On entry, IG specifies the global index of the matrix entry. + * IG must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * SRCPROC (input) const int + * On entry, if SRCPROC = -1, the data is not distributed but + * replicated, in which case this routine returns IG in all + * processes. Otherwise, the value of SRCPROC is ignored. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int i, j; + + if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) { + /* + * IG belongs to the first block, or the data is not distributed, or + * there is just one process in this dimension of the grid.
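+ * (In this branch there is nothing to compute: every process reports + * IL = IG, and PROC is simply SRCPROC, which is -1 in the replicated + * case.)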
+ */ + *IL = IG; + *PROC = SRCPROC; + } else { + /* + * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, + * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC + * with 0 <= MYROC < NPROCS. The local index to be returned depends on + * whether IG resides in the process owning the first partial block of + * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, + * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. + * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is + * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, + * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that + * j=l and thus (j+1)*NPROCS > i+1. + */ + j = (i = (IG - INB) / NB) / NPROCS; + /* + * IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC and take + * the NPROCS modulo (definition of the block-cyclic data distribution). + */ + *PROC = SRCPROC + 1 + i; + *PROC = MPosMod(*PROC, NPROCS); + /* + * When IG resides in the process owning the first partial block of size + * INB (MYROC = 0), then the result IL can be written as: + * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB. + * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, + * i.e. l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore + * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. + * + * Otherwise when MYROC >= 1, the result IL can be written as: + * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. + * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, + * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e. + * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. + */ + *IL = NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG); + } +} diff --git a/src/pauxil/HPL_indxg2p.cpp b/src/pauxil/HPL_indxg2p.cpp new file mode 100644 index 0000000..89f4cfd --- /dev/null +++ b/src/pauxil/HPL_indxg2p.cpp @@ -0,0 +1,74 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_indxg2p(const int IG, + const int INB, + const int NB, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxg2p computes the process coordinate which possesses the entry + * of a matrix specified by a global index IG. + * + * Arguments + * ========= + * + * IG (input) const int + * On entry, IG specifies the global index of the matrix entry. + * IG must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS.
+ * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int proc; + + if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) + /* + * IG belongs to the first block, or the data is not distributed, or + * there is just one process in this dimension of the grid. + */ + return (SRCPROC); + /* + * Otherwise, IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC + * and take the NPROCS modulo (definition of the block-cyclic data dis- + * tribution). + */ + proc = SRCPROC + 1 + (IG - INB) / NB; + return (MPosMod(proc, NPROCS)); +} diff --git a/src/pauxil/HPL_indxl2g.cpp b/src/pauxil/HPL_indxl2g.cpp new file mode 100644 index 0000000..3c646c0 --- /dev/null +++ b/src/pauxil/HPL_indxl2g.cpp @@ -0,0 +1,105 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_indxl2g(const int IL, + const int INB, + const int NB, + const int PROC, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxl2g computes the global index of a matrix entry pointed to + * by the local index IL of the process indicated by PROC. + * + * Arguments + * ========= + * + * IL (input) const int + * On entry, IL specifies the local index of the matrix entry. + * IL must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * PROC (input) const int + * On entry, PROC specifies the coordinate of the process whose + * local array row or column is to be determined. PROC must be + * at least zero and strictly less than NPROCS. + * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + if((SRCPROC == -1) || (NPROCS == 1)) { + /* + * The data is not distributed, or there is just one process in this di- + * mension of the grid. + */ + return (IL); + } else if(PROC == SRCPROC) { + /* + * If I am SRCPROC, my first block is of size INB + */ + if(IL < INB) + /* + * If IL belongs to the first block, the local and global indexes are + * equal. + */ + return (IL); + /* + * The number of entire blocks before the one IL belongs to is + * ( IL - INB ) / NB + 1. 
In the other NPROCS-1 processes, there are + * thus NB*( ( IL-INB )/NB + 1 ) entries, that are globally before the + * global entry corresponding to IL. + */ + return ((NPROCS - 1) * NB * ((IL - INB) / NB + 1) + IL); + } else if(PROC < SRCPROC) { + /* + * Otherwise, the process of coordinate MOD(SRCPROC+1, NPROCS) owns the + * second block. Let IPROC = PROC-SRCPROC-1+NPROCS be the number of pro- + * cesses between this process and PROC not included when going from + * left to right on the process line with possible wrap around. These + * IPROC processes have one more NB block than the other processes, who + * own IL / NB blocks of size NB. + */ + return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1 + NPROCS) + IL + + INB); + } else { + /* + * Same reasoning as above with IPROC = PROC - SRCPROC - 1. + */ + return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1) + IL + INB); + } +} diff --git a/src/pauxil/HPL_infog2l.cpp b/src/pauxil/HPL_infog2l.cpp new file mode 100644 index 0000000..c64e6b1 --- /dev/null +++ b/src/pauxil/HPL_infog2l.cpp @@ -0,0 +1,280 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_infog2l(int I, + int J, + const int IMB, + const int MB, + const int INB, + const int NB, + const int RSRC, + const int CSRC, + const int MYROW, + const int MYCOL, + const int NPROW, + const int NPCOL, + int* II, + int* JJ, + int* PROW, + int* PCOL) { + /* + * Purpose + * ======= + * + * HPL_infog2l computes the starting local index II, JJ corresponding to + * the submatrix starting globally at the entry pointed by I, J. This + * routine returns the coordinates in the grid of the process owning the + * matrix entry of global indexes I, J, namely PROW and PCOL. + * + * Arguments + * ========= + * + * I (global input) int + * On entry, I specifies the global row index of the matrix + * entry. I must be at least zero. + * + * J (global input) int + * On entry, J specifies the global column index of the matrix + * entry. J must be at least zero. + * + * IMB (global input) const int + * On entry, IMB specifies the size of the first row block of + * the global matrix. IMB must be at least one. + * + * MB (global input) const int + * On entry, MB specifies the blocking factor used to partition + * and distribute the rows of the matrix A. MB must be larger + * than one. + * + * INB (global input) const int + * On entry, INB specifies the size of the first column block of + * the global matrix. INB must be at least one. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the columns of the matrix A. NB must be larger + * than one. + * + * RSRC (global input) const int + * On entry, RSRC specifies the row coordinate of the process + * that possesses the row I. RSRC must be at least zero and + * strictly less than NPROW. + * + * CSRC (global input) const int + * On entry, CSRC specifies the column coordinate of the process + * that possesses the column J. 
CSRC must be at least zero and + * strictly less than NPCOL. + * + * MYROW (local input) const int + * On entry, MYROW specifies my row process coordinate in the + * grid. MYROW is greater than or equal to zero and less than + * NPROW. + * + * MYCOL (local input) const int + * On entry, MYCOL specifies my column process coordinate in the + * grid. MYCOL is greater than or equal to zero and less than + * NPCOL. + * + * NPROW (global input) const int + * On entry, NPROW specifies the number of process rows in the + * grid. NPROW is at least one. + * + * NPCOL (global input) const int + * On entry, NPCOL specifies the number of process columns in + * the grid. NPCOL is at least one. + * + * II (local output) int * + * On exit, II specifies the local starting row index of the + * submatrix. On exit, II is at least 0. + * + * JJ (local output) int * + * On exit, JJ specifies the local starting column index of the + * submatrix. On exit, JJ is at least 0. + * + * PROW (global output) int * + * On exit, PROW is the row coordinate of the process owning the + * entry specified by the global index I. PROW is at least zero + * and less than NPROW. + * + * PCOL (global output) int * + * On exit, PCOL is the column coordinate of the process owning + * the entry specified by the global index J. PCOL is at least + * zero and less than NPCOL. + * + * --------------------------------------------------------------------- + */ + + int ilocblk, imb, inb, mb, mydist, nb, nblocks, csrc, rsrc; + + imb = IMB; + *PROW = RSRC; + + if((*PROW == -1) || (NPROW == 1)) { + /* + * The data is not distributed, or there is just one process row in the + * grid. + */ + *II = I; + } else if(I < imb) { + /* + * I refers to an entry in the first block of rows + */ + *II = (MYROW == *PROW ? I : 0); + } else { + mb = MB; + rsrc = *PROW; + /* + * The discussion goes as follows: compute my distance from the source + * process so that within this process coordinate system, the source + * process is the process such that mydist = 0, or equivalently + * MYROW == rsrc. + * + * Find out the global coordinate of the block I belongs to (nblocks), + * as well as the minimum local number of blocks that every process has. + * + * when mydist < nblocks-ilocblk*NPROCS, I own ilocblk + 1 full blocks, + * when mydist > nblocks-ilocblk*NPROCS, I own ilocblk full blocks, + * when mydist = nblocks-ilocblk*NPROCS, I own ilocblk full blocks + * but not I, or I own ilocblk + 1 blocks and the entry I refers to. + */ + if(MYROW == rsrc) { + /* + * I refers to an entry that is not in the first block, find out which + * process has it. + */ + nblocks = (I - imb) / mb + 1; + *PROW += nblocks; + *PROW -= (*PROW / NPROW) * NPROW; + /* + * Since mydist = 0 and nblocks - ilocblk * NPROW >= 0, there are only + * three possible cases: + * + * 1) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I do not own + * I, in which case II = IMB + ( ilocblk - 1 ) * MB. Note that this + * case cannot happen when ilocblk is zero, since nblocks is at + * least one. + * + * 2) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I own I, in + * which case I and II can respectively be written as IMB + + * (nblocks-1)*NB + IL and IMB + (ilocblk-1) * MB + IL. That is + * II = I + (ilocblk-nblocks)*MB. Note that this case cannot happen + * when ilocblk is zero, since nblocks is at least one. + * + * 3) mydist = 0 < nblocks - ilocblk * NPROW, the source process owns + * ilocblk+1 full blocks, and therefore II = IMB + ilocblk * MB. 
+ * Note that when ilocblk is zero, II is just IMB. + */ + if(nblocks < NPROW) { + *II = imb; + } else { + ilocblk = nblocks / NPROW; + if(ilocblk * NPROW >= nblocks) { + *II = ((MYROW == *PROW) ? I + (ilocblk - nblocks) * mb + : imb + (ilocblk - 1) * mb); + } else { + *II = imb + ilocblk * mb; + } + } + } else { + /* + * I refers to an entry that is not in the first block, find out which + * process has it. + */ + nblocks = (I -= imb) / mb + 1; + *PROW += nblocks; + *PROW -= (*PROW / NPROW) * NPROW; + /* + * Compute my distance from the source process so that within this pro- + * cess coordinate system, the source process is the process such that + * mydist=0. + */ + if((mydist = MYROW - rsrc) < 0) mydist += NPROW; + /* + * When mydist < nblocks - ilocblk * NPROW, I own ilocblk+1 full blocks + * of size MB since I am not the source process, i.e. II=(ilocblk+1)*MB. + * When mydist>=nblocks-ilocblk*NPROW and I do not own I, I own ilocblk + * full blocks of size MB, i.e. II = ilocblk*MB, otherwise I own ilocblk + * blocks and I, in which case I can be written as IMB + (nblocks-1)*MB + * + IL and II = ilocblk*MB + IL = I - IMB + (ilocblk - nblocks + 1)*MB. + */ + if(nblocks < NPROW) { + mydist -= nblocks; + *II = ((mydist < 0) ? mb + : ((MYROW == *PROW) ? I + (1 - nblocks) * mb : 0)); + } else { + ilocblk = nblocks / NPROW; + mydist -= nblocks - ilocblk * NPROW; + *II = + ((mydist < 0) ? (ilocblk + 1) * mb + : ((MYROW == *PROW) ? (ilocblk - nblocks + 1) * mb + I + : ilocblk * mb)); + } + } + } + /* + * Idem for the columns + */ + inb = INB; + *PCOL = CSRC; + + if((*PCOL == -1) || (NPCOL == 1)) { + *JJ = J; + } else if(J < inb) { + *JJ = (MYCOL == *PCOL ? J : 0); + } else { + nb = NB; + csrc = *PCOL; + + if(MYCOL == csrc) { + nblocks = (J - inb) / nb + 1; + *PCOL += nblocks; + *PCOL -= (*PCOL / NPCOL) * NPCOL; + + if(nblocks < NPCOL) { + *JJ = inb; + } else { + ilocblk = nblocks / NPCOL; + if(ilocblk * NPCOL >= nblocks) { + *JJ = ((MYCOL == *PCOL) ? J + (ilocblk - nblocks) * nb + : inb + (ilocblk - 1) * nb); + } else { + *JJ = inb + ilocblk * nb; + } + } + } else { + nblocks = (J -= inb) / nb + 1; + *PCOL += nblocks; + *PCOL -= (*PCOL / NPCOL) * NPCOL; + + if((mydist = MYCOL - csrc) < 0) mydist += NPCOL; + + if(nblocks < NPCOL) { + mydist -= nblocks; + *JJ = ((mydist < 0) ? nb + : ((MYCOL == *PCOL) ? J + (1 - nblocks) * nb : 0)); + } else { + ilocblk = nblocks / NPCOL; + mydist -= nblocks - ilocblk * NPCOL; + *JJ = + ((mydist < 0) ? (ilocblk + 1) * nb + : ((MYCOL == *PCOL) ? (ilocblk - nblocks + 1) * nb + J + : ilocblk * nb)); + } + } + } +} diff --git a/src/pauxil/HPL_numroc.cpp b/src/pauxil/HPL_numroc.cpp new file mode 100644 index 0000000..95b96ce --- /dev/null +++ b/src/pauxil/HPL_numroc.cpp @@ -0,0 +1,67 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_numroc(const int N, + const int INB, + const int NB, + const int PROC, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_numroc returns the local number of matrix rows/columns process + * PROC will get if we give out N rows/columns starting from global + * index 0. + * + * Arguments + * ========= + * + * N (input) const int + * On entry, N specifies the number of rows/columns being dealt + * out. N must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * PROC (input) const int + * On entry, PROC specifies the coordinate of the process whose + * local portion is determined. PROC must be at least zero and + * strictly less than NPROCS. + * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + return (HPL_numrocI(N, 0, INB, NB, PROC, SRCPROC, NPROCS)); +} diff --git a/src/pauxil/HPL_numrocI.cpp b/src/pauxil/HPL_numrocI.cpp new file mode 100644 index 0000000..7e22f5d --- /dev/null +++ b/src/pauxil/HPL_numrocI.cpp @@ -0,0 +1,185 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_numrocI(const int N, + const int I, + const int INB, + const int NB, + const int PROC, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_numrocI returns the local number of matrix rows/columns process + * PROC will get if we give out N rows/columns starting from global + * index I. + * + * Arguments + * ========= + * + * N (input) const int + * On entry, N specifies the number of rows/columns being dealt + * out. N must be at least zero. + * + * I (input) const int + * On entry, I specifies the global index of the matrix entry. + * I must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * PROC (input) const int + * On entry, PROC specifies the coordinate of the process whose + * local portion is determined. PROC must be at least zero and + * strictly less than NPROCS.
+ * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int ilocblk, inb, mydist, nblocks, srcproc; + + if((SRCPROC == -1) || (NPROCS == 1)) + /* + * The data is not distributed, or there is just one process in this di- + * mension of the grid. + */ + return (N); + /* + * Compute coordinate of process owning I and corresponding INB + */ + srcproc = SRCPROC; + + if((inb = INB - I) <= 0) { + /* + * I is not in the first block, find out which process has it and update + * the size of first block + */ + srcproc += (nblocks = (-inb) / NB + 1); + srcproc -= (srcproc / NPROCS) * NPROCS; + inb += nblocks * NB; + } + /* + * Now everything is just like N, I=0, INB, NB, srcproc, NPROCS. The + * discussion goes as follows: compute my distance from the source pro- + * cess so that within this process coordinate system, the source pro- + * cess is the process such that mydist = 0, or PROC == srcproc. + * + * Find out how many full blocks are globally (nblocks) and locally + * (ilocblk) in those N entries. Then remark that + * + * when mydist < nblocks - ilocblk*NPROCS, I own ilocblk+1 full blocks, + * when mydist > nblocks - ilocblk*NPROCS, I own ilocblk full blocks, + * when mydist = nblocks - ilocblk*NPROCS, either the last block is not + * full and I own it, or the last block is full and I am the first pro- + * cess owning only ilocblk full blocks. + */ + if(PROC == srcproc) { + /* + * I am the source process, i.e. I own I (mydist=0). When N <= INB, the + * answer is simply N. + */ + if(N <= inb) return (N); + /* + * Find out how many full blocks are globally (nblocks) and locally + * (ilocblk) in those N entries. + */ + nblocks = (N - inb) / NB + 1; + /* + * Since mydist = 0 and nblocks - ilocblk * NPROCS >= 0, there are only + * two possible cases: + * + * 1) When mydist = nblocks - ilocblk * NPROCS = 0, that is NPROCS di- + * vides the global number of full blocks, then the source process + * srcproc owns one more block than the other processes; and N can + * be rewritten as N = INB + (nblocks-1) * NB + LNB with LNB >= 0 + * size of the last block. Similarly, the local value Np correspon- + * ding to N can be written as Np = INB + (ilocblk-1) * NB + LNB = + * N + ( ilocblk-1 - (nblocks-1) )*NB. Note that this case cannot + * happen when ilocblk is zero, since nblocks is at least one. + * + * 2) mydist = 0 < nblocks - ilocblk * NPROCS, the source process only + * owns full blocks, and therefore Np = INB + ilocblk * NB. Note + * that when ilocblk is zero, Np is just INB. + */ + if(nblocks < NPROCS) return (inb); + + ilocblk = nblocks / NPROCS; + return ((nblocks - ilocblk * NPROCS) ? inb + ilocblk * NB + : N + (ilocblk - nblocks) * NB); + } else { + /* + * I am not the source process. When N <= INB, the answer is simply 0. + */ + if(N <= inb) return (0); + /* + * Find out how many full blocks are globally (nblocks) and locally + * (ilocblk) in those N entries + */ + nblocks = (N - inb) / NB + 1; + /* + * Compute my distance from the source process so that within this pro- + * cess coordinate system, the source process is the process such that + * mydist=0.
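+ * + * As a concrete check of the formulas below (numbers chosen for + * illustration): distribute N=10 rows with INB=4 and NB=2 over + * NPROCS=3 processes starting at SRCPROC=0, so that nblocks=4 and + * ilocblk=1. The source process keeps inb + ilocblk*NB = 6 rows, the + * process at mydist=1 hits the boundary case and gets + * N - inb + NB*(ilocblk - nblocks + 1) = 2 rows, and the process at + * mydist=2 gets ilocblk*NB = 2 rows; 6 + 2 + 2 = 10 as required.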
+ */ + if((mydist = PROC - srcproc) < 0) mydist += NPROCS; + /* + * When mydist < nblocks - ilocblk*NPROCS, I own ilocblk + 1 full blocks + * of size NB since I am not the source process, + * + * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks + * of size NB since I am not the source process, + * + * when mydist = nblocks - ilocblk*NPROCS, + * either the last block is not full and I own it, in which case + * N = INB + (nblocks - 1)*NB + LNB with LNB the size of the last + * block such that NB > LNB > 0; the local value Np corresponding to + * N is given by Np = ilocblk*NB+LNB = N-INB+(ilocblk-nblocks+1)*NB; + * or the last block is full and I am the first process owning only + * ilocblk full blocks of size NB, that is N = INB+(nblocks-1)*NB and + * Np = ilocblk * NB = N - INB + (ilocblk-nblocks+1) * NB. + */ + if(nblocks < NPROCS) + return ((mydist < nblocks) + ? NB + : ((mydist > nblocks) ? 0 : N - inb + NB * (1 - nblocks))); + + ilocblk = nblocks / NPROCS; + mydist -= nblocks - ilocblk * NPROCS; + return ((mydist < 0) + ? (ilocblk + 1) * NB + : ((mydist > 0) ? ilocblk * NB + : N - inb + NB * (ilocblk - nblocks + 1))); +} diff --git a/src/pauxil/HPL_pabort.cpp b/src/pauxil/HPL_pabort.cpp new file mode 100644 index 0000000..0a89a85 --- /dev/null +++ b/src/pauxil/HPL_pabort.cpp @@ -0,0 +1,85 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pabort(int LINE, const char* SRNAME, const char* FORM, ...) { + /* + * Purpose + * ======= + * + * HPL_pabort displays an error message on stderr and halts execution. + * + * + * Arguments + * ========= + * + * LINE (local input) int + * On entry, LINE specifies the line number in the file where + * the error has occurred. When LINE is not a positive line + * number, it is ignored. + * + * SRNAME (local input) const char * + * On entry, SRNAME should be the name of the routine calling + * this error handler. + * + * FORM (local input) const char * + * On entry, FORM specifies the format, i.e., how the subsequent + * arguments are converted for output. + * + * (local input) ... + * On entry, ... is the list of arguments to be printed within + * the format string.
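+ * + * A typical call site, as used by HPL_pdlange later in this patch: + * + * HPL_pabort(__LINE__, "HPL_pdlange", "Memory allocation failed"); + * + * which prints the rank of the failing process together with the line + * and routine name on stderr, and then calls MPI_Abort on + * MPI_COMM_WORLD.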
+ * + * --------------------------------------------------------------------- + */ + + va_list argptr; + int rank; + char cline[128]; + + va_start(argptr, FORM); + (void)vsnprintf(cline, sizeof(cline), FORM, argptr); + va_end(argptr); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* + * Display an error message + */ + if(LINE <= 0) + HPL_fprintf(stderr, + "%s %s %d, %s %s:\n>>> %s <<< Abort ...\n\n", + "HPL ERROR", + "from process #", + rank, + "in function", + SRNAME, + cline); + else + HPL_fprintf(stderr, + "%s %s %d, %s %d %s %s:\n>>> %s <<< Abort ...\n\n", + "HPL ERROR", + "from process #", + rank, + "on line", + LINE, + "of function", + SRNAME, + cline); + + MPI_Abort(MPI_COMM_WORLD, -1); + exit(-1); +} diff --git a/src/pauxil/HPL_pdlamch.cpp b/src/pauxil/HPL_pdlamch.cpp new file mode 100644 index 0000000..e6fb8e8 --- /dev/null +++ b/src/pauxil/HPL_pdlamch.cpp @@ -0,0 +1,87 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +double HPL_pdlamch(MPI_Comm COMM, const HPL_T_MACH CMACH) { + /* + * Purpose + * ======= + * + * HPL_pdlamch determines machine-specific arithmetic constants such as + * the relative machine precision (eps), the safe minimum (sfmin) such + * that 1/sfmin does not overflow, the base of the machine (base), the + * precision (prec), the number of (base) digits in the mantissa (t), + * whether rounding occurs in addition (rnd = 1.0 and 0.0 otherwise), + * the minimum exponent before (gradual) underflow (emin), the underflow + * threshold (rmin) = base**(emin-1), the largest exponent before + * overflow (emax), and the overflow threshold (rmax) = + * (base**emax)*(1-eps). + * + * Arguments + * ========= + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * CMACH (global input) const HPL_T_MACH + * Specifies the value to be returned by HPL_pdlamch + * = HPL_MACH_EPS, HPL_pdlamch := eps (default) + * = HPL_MACH_SFMIN, HPL_pdlamch := sfmin + * = HPL_MACH_BASE, HPL_pdlamch := base + * = HPL_MACH_PREC, HPL_pdlamch := eps*base + * = HPL_MACH_MLEN, HPL_pdlamch := t + * = HPL_MACH_RND, HPL_pdlamch := rnd + * = HPL_MACH_EMIN, HPL_pdlamch := emin + * = HPL_MACH_RMIN, HPL_pdlamch := rmin + * = HPL_MACH_EMAX, HPL_pdlamch := emax + * = HPL_MACH_RMAX, HPL_pdlamch := rmax + * + * where + * + * eps = relative machine precision, + * sfmin = safe minimum, + * base = base of the machine, + * prec = eps*base, + * t = number of digits in the mantissa, + * rnd = 1.0 if rounding occurs in addition, + * emin = minimum exponent before underflow, + * rmin = underflow threshold, + * emax = largest exponent before overflow, + * rmax = overflow threshold.
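+ * + * Note that the reductions below keep the most conservative value over + * the whole grid: eps, sfmin, emin and rmin are combined with HPL_MAX, + * while emax and rmax are combined with HPL_MIN, so the returned + * constant is valid on every participating process even if the + * processes disagree.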
+ * + * --------------------------------------------------------------------- + */ + + double param; + + param = HPL_dlamch(CMACH); + + switch(CMACH) { + case HPL_MACH_EPS: + case HPL_MACH_SFMIN: + case HPL_MACH_EMIN: + case HPL_MACH_RMIN: + (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MAX, COMM); + break; + case HPL_MACH_EMAX: + case HPL_MACH_RMAX: + (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MIN, COMM); + break; + default: break; + } + + return (param); +} diff --git a/src/pauxil/HPL_pdlange_device.cpp b/src/pauxil/HPL_pdlange_device.cpp new file mode 100644 index 0000000..bf0fb1f --- /dev/null +++ b/src/pauxil/HPL_pdlange_device.cpp @@ -0,0 +1,302 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define BLOCK_SIZE 512 +#define GRID_SIZE 512 + +__global__ void normA_1(const int N, + const int M, + const double* __restrict__ A, + const int LDA, + double* __restrict__ normAtmp) { + __shared__ double s_norm[BLOCK_SIZE]; + + const int t = threadIdx.x; + const int i = blockIdx.x; + size_t id = i * BLOCK_SIZE + t; + + s_norm[t] = 0.0; + for(; id < (size_t)N * M; id += gridDim.x * BLOCK_SIZE) { + const int m = id % M; + const int n = id / M; + const double Anm = fabs(A[m + ((size_t)n) * LDA]); + + s_norm[t] = (Anm > s_norm[t]) ? Anm : s_norm[t]; + } + __syncthreads(); + + for(int k = BLOCK_SIZE / 2; k > 0; k /= 2) { + if(t < k) { + s_norm[t] = (s_norm[t + k] > s_norm[t]) ? s_norm[t + k] : s_norm[t]; + } + __syncthreads(); + } + + if(t == 0) normAtmp[i] = s_norm[0]; +} + +__global__ void normA_2(const int N, double* __restrict__ normAtmp) { + __shared__ double s_norm[BLOCK_SIZE]; + + const int t = threadIdx.x; + + s_norm[t] = 0.0; + for(size_t id = t; id < N; id += BLOCK_SIZE) { + const double Anm = normAtmp[id]; + s_norm[t] = (Anm > s_norm[t]) ? Anm : s_norm[t]; + } + __syncthreads(); + + for(int k = BLOCK_SIZE / 2; k > 0; k /= 2) { + if(t < k) { + s_norm[t] = (s_norm[t + k] > s_norm[t]) ?
s_norm[t + k] : s_norm[t]; + } + __syncthreads(); + } + + if(t == 0) normAtmp[0] = s_norm[0]; +} + +__global__ void norm1(const int N, + const int M, + const double* __restrict__ A, + const int LDA, + double* __restrict__ work) { + + __shared__ double s_norm1[BLOCK_SIZE]; + + const int t = threadIdx.x; + const int n = blockIdx.x; + + s_norm1[t] = 0.0; + for(size_t id = t; id < M; id += BLOCK_SIZE) { + s_norm1[t] += fabs(A[id + n * ((size_t)LDA)]); + } + + __syncthreads(); + + for(int k = BLOCK_SIZE / 2; k > 0; k /= 2) { + if(t < k) { s_norm1[t] += s_norm1[t + k]; } + __syncthreads(); + } + + if(t == 0) work[n] = s_norm1[0]; +} + +__global__ void norminf(const int N, + const int M, + const double* __restrict__ A, + const int LDA, + double* __restrict__ work) { + const int t = threadIdx.x; + const int b = blockIdx.x; + const size_t id = b * BLOCK_SIZE + t; // row id + + if(id < M) { + double norm = 0.0; + for(size_t i = 0; i < N; i++) { norm += fabs(A[id + i * ((size_t)LDA)]); } + work[id] = norm; + } +} + +double HPL_pdlange(const HPL_T_grid* GRID, + const HPL_T_NORM NORM, + const int M, + const int N, + const int NB, + const double* A, + const int LDA) { + /* + * Purpose + * ======= + * + * HPL_pdlange returns the value of the one norm, or the infinity norm, + * or the element of largest absolute value of a distributed matrix A: + * + * + * max(abs(A(i,j))) when NORM = HPL_NORM_A, + * norm1(A), when NORM = HPL_NORM_1, + * normI(A), when NORM = HPL_NORM_I, + * + * where norm1 denotes the one norm of a matrix (maximum column sum) and + * normI denotes the infinity norm of a matrix (maximum row sum). Note + * that max(abs(A(i,j))) is not a matrix norm. + * + * Arguments + * ========= + * + * GRID (local input) const HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * NORM (global input) const HPL_T_NORM + * On entry, NORM specifies the value to be returned by this + * function as described above. + * + * M (global input) const int + * On entry, M specifies the number of rows of the matrix A. + * M must be at least zero. + * + * N (global input) const int + * On entry, N specifies the number of columns of the matrix A. + * N must be at least zero. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix. NB must be larger than one. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,LocQ(N)), + * that contains the local pieces of the distributed matrix A. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,LocP(M)). 
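+ * + * For HPL_NORM_A the local reduction runs in two stages: normA_1 folds + * strided chunks of the local matrix into one partial maximum per + * block (at most GRID_SIZE values), and normA_2 then reduces those + * partials with a single block, leaving the local result in dwork[0] + * before the MPI reductions. A representative call (illustrative only) + * is + * + * double norm1A = HPL_pdlange(GRID, HPL_NORM_1, N, N, NB, A, LDA); + * + * which returns the same one-norm on every process of the grid.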
+ * + * --------------------------------------------------------------------- + */ + + double s, v0 = HPL_rzero, *work = NULL, *dwork = NULL; + MPI_Comm Acomm, Ccomm, Rcomm; + int ii, jj, mp, mycol, myrow, npcol, nprow, nq; + + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + Rcomm = GRID->row_comm; + Ccomm = GRID->col_comm; + Acomm = GRID->all_comm; + + Mnumroc(mp, M, NB, NB, myrow, 0, nprow); + Mnumroc(nq, N, NB, NB, mycol, 0, npcol); + + if(Mmin(M, N) == 0) { + return (v0); + } else if(NORM == HPL_NORM_A) { + /* + * max( abs( A ) ) + */ + if((nq > 0) && (mp > 0)) { + if(nq == 1) { // column vector + int id; + rocblas_idamax(handle, mp, A, 1, &id); + hipMemcpy(&v0, A + id - 1, 1 * sizeof(double), hipMemcpyDeviceToHost); + } else if(mp == 1) { // row vector + int id; + rocblas_idamax(handle, nq, A, LDA, &id); + hipMemcpy(&v0, + A + ((size_t)(id - 1) * LDA), + 1 * sizeof(double), + hipMemcpyDeviceToHost); + } else { + // custom reduction kernels + hipMalloc(&dwork, GRID_SIZE * sizeof(double)); + + size_t grid_size = ((size_t)nq * mp + BLOCK_SIZE - 1) / BLOCK_SIZE; + grid_size = (grid_size < GRID_SIZE) ? grid_size : GRID_SIZE; + + normA_1<<<grid_size, BLOCK_SIZE>>>(nq, mp, A, LDA, dwork); + normA_2<<<1, BLOCK_SIZE>>>(grid_size, dwork); + + hipMemcpy(&v0, dwork, 1 * sizeof(double), hipMemcpyDeviceToHost); + hipFree(dwork); + } + } + (void)HPL_reduce((void*)(&v0), 1, HPL_DOUBLE, HPL_MAX, 0, Acomm); + } else if(NORM == HPL_NORM_1) { + /* + * Find norm_1( A ). + */ + if(nq > 0) { + work = (double*)malloc((size_t)(nq) * sizeof(double)); + if(work == NULL) { + HPL_pabort(__LINE__, "HPL_pdlange", "Memory allocation failed"); + } + + if(nq == 1) { // column vector + rocblas_dasum(handle, mp, A, 1, work); + } else { + hipMalloc(&dwork, nq * sizeof(double)); + norm1<<<nq, BLOCK_SIZE>>>(nq, mp, A, LDA, dwork); + hipMemcpy(work, dwork, nq * sizeof(double), hipMemcpyDeviceToHost); + } + /* + * Find sum of global matrix columns, store on row 0 of process grid + */ + (void)HPL_reduce((void*)(work), nq, HPL_DOUBLE, HPL_SUM, 0, Ccomm); + /* + * Find maximum sum of columns for 1-norm + */ + if(myrow == 0) { + v0 = work[HPL_idamax(nq, work, 1)]; + v0 = Mabs(v0); + } + if(work) free(work); + if(dwork) hipFree(dwork); + } + /* + * Find max in row 0, store result in process (0,0) + */ + if(myrow == 0) + (void)HPL_reduce((void*)(&v0), 1, HPL_DOUBLE, HPL_MAX, 0, Rcomm); + } else if(NORM == HPL_NORM_I) { + /* + * Find norm_inf( A ) + */ + if(mp > 0) { + work = (double*)malloc((size_t)(mp) * sizeof(double)); + if(work == NULL) { + HPL_pabort(__LINE__, "HPL_pdlange", "Memory allocation failed"); + } + + if(mp == 1) { // row vector + rocblas_dasum(handle, nq, A, LDA, work); + } else { + hipMalloc(&dwork, mp * sizeof(double)); + + size_t grid_size = (mp + BLOCK_SIZE - 1) / BLOCK_SIZE; + norminf<<<grid_size, BLOCK_SIZE>>>(nq, mp, A, LDA, dwork); + hipMemcpy(work, dwork, mp * sizeof(double), hipMemcpyDeviceToHost); + } + + /* + * Find sum of global matrix rows, store on column 0 of process grid + */ + (void)HPL_reduce((void*)(work), mp, HPL_DOUBLE, HPL_SUM, 0, Rcomm); + /* + * Find maximum sum of rows for inf-norm + */ + if(mycol == 0) { + v0 = work[HPL_idamax(mp, work, 1)]; + v0 = Mabs(v0); + } + if(work) free(work); + if(dwork) hipFree(dwork); + } + /* + * Find max in column 0, store result in process (0,0) + */ + if(mycol == 0) + (void)HPL_reduce((void*)(&v0), 1, HPL_DOUBLE, HPL_MAX, 0, Ccomm); + } + /* + * Broadcast answer to every process in the grid + */ + (void)HPL_broadcast((void*)(&v0), 1, HPL_DOUBLE, 0, Acomm); + + return (v0); +} diff --git a/src/pauxil/HPL_pwarn.cpp
b/src/pauxil/HPL_pwarn.cpp new file mode 100644 index 0000000..e11b4bb --- /dev/null +++ b/src/pauxil/HPL_pwarn.cpp @@ -0,0 +1,89 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pwarn(FILE* STREAM, + int LINE, + const char* SRNAME, + const char* FORM, + ...) { + /* + * Purpose + * ======= + * + * HPL_pwarn displays an error message. + * + * + * Arguments + * ========= + * + * STREAM (local input) FILE * + * On entry, STREAM specifies the output stream. + * + * LINE (local input) int + * On entry, LINE specifies the line number in the file where + * the error has occurred. When LINE is not a positive line + * number, it is ignored. + * + * SRNAME (local input) const char * + * On entry, SRNAME should be the name of the routine calling + * this error handler. + * + * FORM (local input) const char * + * On entry, FORM specifies the format, i.e., how the subsequent + * arguments are converted for output. + * + * (local input) ... + * On entry, ... is the list of arguments to be printed within + * the format string. + * + * --------------------------------------------------------------------- + */ + + va_list argptr; + int rank; + char cline[128]; + + va_start(argptr, FORM); + (void)vsnprintf(cline, sizeof(cline), FORM, argptr); + va_end(argptr); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* + * Display an error message + */ + if(LINE <= 0) + HPL_fprintf(STREAM, + "%s %s %d, %s %s:\n>>> %s <<<\n\n", + "HPL ERROR", + "from process #", + rank, + "in function", + SRNAME, + cline); + else + HPL_fprintf(STREAM, + "%s %s %d, %s %d %s %s:\n>>> %s <<<\n\n", + "HPL ERROR", + "from process #", + rank, + "on line", + LINE, + "of function", + SRNAME, + cline); +}
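A typical call site for this handler passes __LINE__ and the calling routine's name; the snippet below is a hypothetical example, not taken from the patch.

    /* Hypothetical call site (rank-local warning about a bad parameter): */
    HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Illegal value of N: %d", n);

diff --git a/src/pfact/HPL_dlocmax.cpp b/src/pfact/HPL_dlocmax.cpp new file mode 100644 index 0000000..de12400 --- /dev/null +++ b/src/pfact/HPL_dlocmax.cpp @@ -0,0 +1,110 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlocmax(HPL_T_panel* PANEL, + const int N, + const int II, + const int JJ, + double* WORK, + int thread_rank, + int thread_size, + int* max_index, + double* max_value) { + /* + * Purpose + * ======= + * + * HPL_dlocmax finds the maximum entry in the current column and packs + * the useful information in WORK[0:3]. On exit, WORK[0] contains the + * local maximum absolute value scalar, WORK[1] is the corresponding + * local row index, WORK[2] is the corresponding global row index, and + * WORK[3] is the coordinate of the process owning this max.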
When N is + * less than 1, WORK[0:2] is initialized to zero, and WORK[3] is set + * to the total number of process rows. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * N (local input) const int + * On entry, N specifies the local number of rows of the column + * of A on which we operate. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 4. On exit, + * WORK[0] contains the local maximum absolute value scalar, + * WORK[1] contains the corresponding local row index, WORK[2] + * contains the corresponding global row index, and WORK[3] is + * the coordinate of the process owning this max. + * + * --------------------------------------------------------------------- + */ + + double* A; + int kk, igindx, ilindx, myrow, nb, nprow; + + if(N > 0) { + A = Mptr(PANEL->A, II, JJ, PANEL->lda); + myrow = PANEL->grid->myrow; + nprow = PANEL->grid->nprow; + nb = PANEL->nb; + + HPL_idamax_omp( + N, A, 1, nb, II, thread_rank, thread_size, max_index, max_value); + + if(thread_rank == 0) { + ilindx = max_index[0]; + kk = PANEL->ii + II + (ilindx); + Mindxl2g(igindx, kk, nb, nb, myrow, 0, nprow); + /* + * WORK[0] := local maximum absolute value scalar, + * WORK[1] := corresponding local row index, + * WORK[2] := corresponding global row index, + * WORK[3] := coordinate of process owning this max. + */ + WORK[0] = max_value[0]; + WORK[1] = (double)(ilindx); + WORK[2] = (double)(igindx); + WORK[3] = (double)(myrow); + } + } else { + /* + * If I do not have any row of A, then set the coordinate of the process + * (WORK[3]) owning this "ghost" row, such that it will never be used, + * even if there are only zeros in the current column of A. + */ + if(thread_rank == 0) { + WORK[0] = WORK[1] = WORK[2] = HPL_rzero; + WORK[3] = (double)(PANEL->grid->nprow); + } + } + +// make sure WORK is visible to all threads +#pragma omp barrier +}
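The WORK[0:3] header written above is a plain-double encoding of (pivot value, local row, global row, owning process row). The sketch below shows the same packing in isolation, with the zero-offset block-cyclic local-to-global mapping of Mindxl2g written out explicitly; pack_local_pivot and its arguments are hypothetical names, not rocHPL code.

    #include <cmath>

    // Hypothetical sketch of HPL_dlocmax's packing convention: WORK[0..3] =
    // { signed value of the max-abs entry, local row offset, global row,
    //   owning process row }.
    static void pack_local_pivot(const double* x, int n, int il_off, int nb,
                                 int myrow, int nprow, double WORK[4]) {
      int imax = 0;
      for(int i = 1; i < n; i++)
        if(std::fabs(x[i]) > std::fabs(x[imax])) imax = i;
      const int il = il_off + imax; // local row index within the panel
      // zero-offset block-cyclic local-to-global mapping (as in Mindxl2g):
      const int ig = (il / nb) * nprow * nb + myrow * nb + (il % nb);
      WORK[0] = x[imax];
      WORK[1] = (double)imax;
      WORK[2] = (double)ig;
      WORK[3] = (double)myrow;
    }

diff --git a/src/pfact/HPL_dlocswpN.cpp b/src/pfact/HPL_dlocswpN.cpp new file mode 100644 index 0000000..51a0f8a --- /dev/null +++ b/src/pfact/HPL_dlocswpN.cpp @@ -0,0 +1,150 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlocswpN(HPL_T_panel* PANEL, + const int II, + const int JJ, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_dlocswpN performs the local swapping operations within a panel. + * The lower triangular N0-by-N0 upper block of the panel is stored in + * no-transpose form (i.e. just like the input matrix itself).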
+ * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2 * (4+2*N0). + * WORK[0] contains the local maximum absolute value scalar, + * WORK[1] contains the corresponding local row index, WORK[2] + * contains the corresponding global row index, and WORK[3] is + * the coordinate of the process owning this max. The N0 length max + * row is stored in WORK[4:4+N0-1]. Note that this is also the + * JJth row (or column) of L1. The remaining part of this array + * is used as workspace. + * + * --------------------------------------------------------------------- + */ + + double gmax; + double *A1, *A2, *L, *Wr0, *Wmx; + int ilindx, lda, myrow, n0; + + myrow = PANEL->grid->myrow; + n0 = PANEL->jb; + int NB = PANEL->nb; + lda = PANEL->lda; + + Wr0 = (Wmx = WORK + 4) + NB; + Wmx[JJ] = gmax = WORK[0]; + + /* + * Replicated swap and copy of the current (new) row of A into L1 + */ + L = Mptr(PANEL->L1, JJ, 0, n0); + /* + * If the pivot is non-zero ... + */ + if(gmax != HPL_rzero) { + /* + * and if I own the current row of A ... + */ + if(myrow == PANEL->prow) { + /* + * and if I also own the row to be swapped with the current row of A ... + */ + if(myrow == (int)(WORK[3])) { + /* + * and if the current row of A is not to be swapped with itself ... + */ + if((ilindx = (int)(WORK[1])) != 0) { + /* + * then copy the max row into L1 and locally swap the 2 rows of A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + A2 = Mptr(A1, ilindx, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, n0); + HPL_dcopy(n0, Wmx, 1, A1, lda); + HPL_dcopy(n0, Wr0, 1, A2, lda); + + } else { + /* + * otherwise the current row of A is swapped with itself, so just + * copy the current row of A into L1. + */ + *Mptr(PANEL->A, II, JJ, lda) = gmax; + + HPL_dcopy(n0, Wmx, 1, L, n0); + } + + } else { + /* + * otherwise, the row to be swapped with the current row of A is in Wmx, + * so copy Wmx into L1 and A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, n0); + HPL_dcopy(n0, Wmx, 1, A1, lda); + } + + } else { + /* + * otherwise I do not own the current row of A, so copy the max row Wmx + * into L1. + */ + HPL_dcopy(n0, Wmx, 1, L, n0); + + /* + * and if I own the max row, overwrite it with the current row Wr0. + */ + if(myrow == (int)(WORK[3])) { + A2 = Mptr(PANEL->A, II + (size_t)(WORK[1]), 0, lda); + + HPL_dcopy(n0, Wr0, 1, A2, lda); + } + } + } else { + /* + * Otherwise the max element in the current column is zero, simply copy + * the current row Wr0 into L1. The matrix is singular. + */ + HPL_dcopy(n0, Wr0, 1, L, n0); + + /* + * set INFO. + */ + if(*(PANEL->DINFO) == 0.0) *(PANEL->DINFO) = (double)(PANEL->ia + JJ + 1); + } +}
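The nested branches above reduce to a small decision table: every process row copies the winning row into L1, while A itself is written only where the current row and/or the pivot row actually live. A hedged, condensed restatement (hypothetical helper, pivot assumed non-zero):

    // Hypothetical decision table for the non-zero-pivot case in
    // HPL_dlocswpN/T: which copies a given process row performs.
    enum Action { WMX_TO_L1 = 1, WMX_TO_A_CURRENT = 2, WR0_TO_A_PIVOT = 4 };

    int swap_actions(bool own_current, bool own_pivot, bool self_swap) {
      int a = WMX_TO_L1;                      // replicated on every process row
      if(own_current) a |= WMX_TO_A_CURRENT;  // pivot row replaces current row
      if(own_pivot && !(own_current && self_swap))
        a |= WR0_TO_A_PIVOT;                  // old current row fills pivot slot
      return a;
    }

diff --git a/src/pfact/HPL_dlocswpT.cpp b/src/pfact/HPL_dlocswpT.cpp new file mode 100644 index 0000000..eca4e4f --- /dev/null +++ b/src/pfact/HPL_dlocswpT.cpp @@ -0,0 +1,150 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P.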
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlocswpT(HPL_T_panel* PANEL, + const int II, + const int JJ, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_dlocswpT performs the local swapping operations within a panel. + * The lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2 * (4+2*N0). + * WORK[0] contains the local maximum absolute value scalar, + * WORK[1] contains the corresponding local row index, WORK[2] + * contains the corresponding global row index, and WORK[3] is + * the coordinate of the process owning this max. The N0 length max + * row is stored in WORK[4:4+N0-1]. Note that this is also the + * JJth row (or column) of L1. The remaining part of this array + * is used as workspace. + * + * --------------------------------------------------------------------- + */ + + double gmax; + double *A1, *A2, *L, *Wr0, *Wmx; + int ilindx, lda, myrow, n0; + + myrow = PANEL->grid->myrow; + n0 = PANEL->jb; + int NB = PANEL->nb; + lda = PANEL->lda; + + Wr0 = (Wmx = WORK + 4) + NB; + Wmx[JJ] = gmax = WORK[0]; + + /* + * Replicated swap and copy of the current (new) row of A into L1 + */ + L = Mptr(PANEL->L1, 0, JJ, n0); + /* + * If the pivot is non-zero ... + */ + if(gmax != HPL_rzero) { + /* + * and if I own the current row of A ... + */ + if(myrow == PANEL->prow) { + /* + * and if I also own the row to be swapped with the current row of A ... + */ + if(myrow == (int)(WORK[3])) { + /* + * and if the current row of A is not to be swapped with itself ... + */ + if((ilindx = (int)(WORK[1])) != 0) { + /* + * then copy the max row into L1 and locally swap the 2 rows of A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + A2 = Mptr(A1, ilindx, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, 1); + HPL_dcopy(n0, Wmx, 1, A1, lda); + HPL_dcopy(n0, Wr0, 1, A2, lda); + + } else { + /* + * otherwise the current row of A is swapped with itself, so just + * copy the current row of A into L1. + */ + *Mptr(PANEL->A, II, JJ, lda) = gmax; + + HPL_dcopy(n0, Wmx, 1, L, 1); + } + + } else { + /* + * otherwise, the row to be swapped with the current row of A is in Wmx, + * so copy Wmx into L1 and A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, 1); + HPL_dcopy(n0, Wmx, 1, A1, lda); + } + + } else { + /* + * otherwise I do not own the current row of A, so copy the max row Wmx + * into L1. + */ + HPL_dcopy(n0, Wmx, 1, L, 1); + + /* + * and if I own the max row, overwrite it with the current row Wr0.
+ */ + if(myrow == (int)(WORK[3])) { + A2 = Mptr(PANEL->A, II + (size_t)(WORK[1]), 0, lda); + + HPL_dcopy(n0, Wr0, 1, A2, lda); + } + } + } else { + /* + * Otherwise the max element in the current column is zero, simply copy + * the current row Wr0 into L1. The matrix is singular. + */ + HPL_dcopy(n0, Wr0, 1, L, 1); + + /* + * set INFO. + */ + if(*(PANEL->DINFO) == 0.0) *(PANEL->DINFO) = (double)(PANEL->ia + JJ + 1); + } +}
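HPL_dlocswpN and its transpose-form twin HPL_dlocswpT differ essentially only in the increment used when storing the pivot row into the column-major n0-by-n0 L1 block, as the HPL_dcopy calls above show. A small sketch of just that difference (store_pivot_row is a hypothetical helper):

    #include <vector>

    // Where element j of the pivot row lands in the column-major L1 block:
    // no-transpose: L1[JJ + j*n0] (stride n0, a matrix row);
    // transpose:    L1[j + JJ*n0] (stride 1, a matrix column).
    void store_pivot_row(std::vector<double>& L1, const double* Wmx,
                         int n0, int JJ, bool transposed) {
      const int stride = transposed ? 1 : n0;
      double* L = transposed ? &L1[JJ * n0] : &L1[JJ];
      for(int j = 0; j < n0; j++) L[j * stride] = Wmx[j];
    }

diff --git a/src/pfact/HPL_pdfact.cpp b/src/pfact/HPL_pdfact.cpp new file mode 100644 index 0000000..2e8f21a --- /dev/null +++ b/src/pfact/HPL_pdfact.cpp @@ -0,0 +1,109 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <cassert> + +void HPL_pdfact(HPL_T_panel* PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdfact recursively factorizes a 1-dimensional panel of columns. + * The RPFACT function pointer specifies the recursive algorithm to be + * used, either Crout, Left- or Right looking. NBMIN allows to vary the + * recursive stopping criterion in terms of the number of columns in the + * panel, and NDIV allows to specify the number of subpanels each panel + * should be divided into. Usually a value of 2 will be chosen. Finally + * PFACT is a function pointer specifying the non-recursive algorithm + * to be used on at most NBMIN columns. One can also choose here between + * Crout, Left- or Right looking. Empirical tests seem to indicate that + * values of 4 or 8 for NBMIN give the best results. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information.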
+ * + * --------------------------------------------------------------------- + */ + + int jb, i; + + jb = PANEL->jb; + PANEL->n -= jb; + PANEL->ja += jb; + + if((PANEL->grid->mycol != PANEL->pcol) || (jb <= 0)) return; +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_RPFACT); +#endif + /* + * Factor the panel - Update the panel pointers + */ + double max_value[128]; + int max_index[128]; + + roctxRangePush("pdfact"); + +#pragma omp parallel shared(max_value, max_index) + { + const int thread_rank = omp_get_thread_num(); + const int thread_size = omp_get_num_threads(); + assert(thread_size <= 128); + + PANEL->algo->rffun(PANEL, + PANEL->mp, + jb, + 0, + PANEL->fWORK, + thread_rank, + thread_size, + max_value, + max_index); + } + + roctxRangePop(); + + // PANEL->A = Mptr( PANEL->A, 0, jb, PANEL->lda ); + PANEL->dA = Mptr(PANEL->dA, 0, jb, PANEL->dlda); + PANEL->nq -= jb; + PANEL->jj += jb; +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_RPFACT); +#endif +} diff --git a/src/pfact/HPL_pdmxswp.cpp b/src/pfact/HPL_pdmxswp.cpp new file mode 100644 index 0000000..dbcd366 --- /dev/null +++ b/src/pfact/HPL_pdmxswp.cpp @@ -0,0 +1,132 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdmxswp(HPL_T_panel* PANEL, + const int M, + const int II, + const int JJ, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_pdmxswp swaps and broadcasts the absolute value max row using + * bi-directional exchange. The buffer is partially set by HPL_dlocmax. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by + * + * log_2( P ) * ( lat + ( 2 * N0 + 4 ) / bdwth ) + * + * where lat and bdwth are the latency and bandwidth of the network for + * double precision real elements. Communication only occurs in one + * process column. Mono-directional links will cause the communication + * cost to double. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of the matrix + * column on which this function operates. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2 * (4+2*N0). + * It is assumed that HPL_dlocmax was called prior to this + * routine to initialize the first four entries of this array. 
+ * On exit, the N0 length max row is stored in WORK[4:4+N0-1]; + * Note that this is also the JJth row (or column) of L1. The + * remaining part is used as a temporary array. + * + * --------------------------------------------------------------------- + */ + + double * A0, *Wmx, *Wwork; + HPL_T_grid* grid; + MPI_Comm comm; + int cnt_, cnt0, i, icurrow, lda, myrow, n0; + +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_MXSWP); +#endif + grid = PANEL->grid; + comm = grid->col_comm; + myrow = grid->myrow; + n0 = PANEL->jb; + int NB = PANEL->nb; + icurrow = PANEL->prow; + /* + * Set up pointers in workspace: WORK and Wwork point to the beginning + * of the buffers of size 4 + 2*N0 to be combined. Wmx points to the row + * owning the local (before combine) and global (after combine) absolute + * value max. A0 points to the copy of the current row of the matrix. + */ + cnt0 = 4 + 2 * NB; + + A0 = (Wmx = WORK + 4) + NB; + Wwork = WORK + cnt0; + + /* + * Wmx[0:N0-1] := A[ilindx,0:N0-1] where ilindx is (int)(WORK[1]) (row + * with max in current column). If I am the current process row, pack in + * addition the current row of A in A0[0:N0-1]. If I do not own any row + * of A, then zero out Wmx[0:N0-1]. + */ + if(M > 0) { + lda = PANEL->lda; + + HPL_dcopy(n0, Mptr(PANEL->A, II + (int)(WORK[1]), 0, lda), lda, Wmx, 1); + if(myrow == icurrow) { + HPL_dcopy(n0, Mptr(PANEL->A, II, 0, lda), lda, A0, 1); + } else { + for(i = 0; i < n0; i++) A0[i] = HPL_rzero; + } + } else { + for(i = 0; i < n0; i++) A0[i] = HPL_rzero; + for(i = 0; i < n0; i++) Wmx[i] = HPL_rzero; + } + + /* Perform swap-broadcast */ + HPL_all_reduce_dmxswp(WORK, cnt0, icurrow, comm, Wwork); + + /* + * Save the global pivot index in pivot array + */ + (PANEL->ipiv)[JJ] = (int)WORK[2]; +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_MXSWP); +#endif +} diff --git a/src/pfact/HPL_pdpancrN.cpp b/src/pfact/HPL_pdpancrN.cpp new file mode 100644 index 0000000..003e266 --- /dev/null +++ b/src/pfact/HPL_pdpancrN.cpp @@ -0,0 +1,233 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpancrN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpancrN factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Crout variant of the usual + * one-dimensional algorithm. The lower triangular N0-by-N0 upper block + * of the panel is stored in no-transpose form (i.e. just like the input + * matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. 
On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk = 0, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + /* + * Compute row (column) jj of L1 + */ + if(kk > 0) { + L1ptr = Mptr(L1, jj, jj + 1, n0); + + if(thread_rank == 0) { + HPL_dgemv(HplColumnMajor, + HplTrans, + kk, + Nm1, + -HPL_rone, + Mptr(L1, ICOFF, jj + 1, n0), + n0, + Mptr(L1, jj, ICOFF, n0), + n0, + HPL_rone, + L1ptr, + n0); + + if(curr != 0) HPL_dcopy(Nm1, L1ptr, n0, Mptr(A, ii, jj + 1, lda), lda); + } + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update dia- + * diagonal and subdiagonal elements in column A(iip1:iip1+Mm1-1, jj+1) + * and find local absolute value max in that column (Only one pass + * through cache for each current column). This sequence of operations + * could benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk + 1, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + Mptr(L1, ICOFF, jj + 1, n0), + 1, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + kk++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdpancrT.cpp b/src/pfact/HPL_pdpancrT.cpp new file mode 100644 index 0000000..84de5f5 --- /dev/null +++ b/src/pfact/HPL_pdpancrT.cpp @@ -0,0 +1,232 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpancrT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpancrT factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Crout variant of the usual + * one-dimensional algorithm. The lower triangular N0-by-N0 upper block + * of the panel is stored in transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk = 0, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + /* + * Compute row (column) jj of L1 + */ + if(kk > 0) { + L1ptr = Mptr(L1, jj + 1, jj, n0); + + if(thread_rank == 0) { + HPL_dgemv(HplColumnMajor, + HplNoTrans, + Nm1, + kk, + -HPL_rone, + Mptr(L1, jj + 1, ICOFF, n0), + n0, + Mptr(L1, ICOFF, jj, n0), + 1, + HPL_rone, + L1ptr, + 1); + + if(curr != 0) HPL_dcopy(Nm1, L1ptr, 1, Mptr(A, ii, jj + 1, lda), lda); + } + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update dia- + * diagonal and subdiagonal elements in column A(iip1:iip1+Mm1-1, jj+1) + * and find local absolute value max in that column (Only one pass + * through cache for each current column). This sequence of operations + * could benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk + 1, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + Mptr(L1, jj + 1, ICOFF, n0), + n0, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + kk++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdpanllN.cpp b/src/pfact/HPL_pdpanllN.cpp new file mode 100644 index 0000000..5f27ea2 --- /dev/null +++ b/src/pfact/HPL_pdpanllN.cpp @@ -0,0 +1,224 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanllN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanllN factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Left-looking variant of the + * usual one-dimensional algorithm. The lower triangular N0-by-N0 upper + * block of the panel is stored in no-transpose form (i.e. just like the + * input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column and initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + + L1ptr = Mptr(L1, ICOFF, jj + 1, n0); + kk = jj + 1 - ICOFF; + if(thread_rank == 0) { + HPL_dtrsv(HplColumnMajor, + HplLower, + HplNoTrans, + HplUnit, + kk, + Mptr(L1, ICOFF, ICOFF, n0), + n0, + L1ptr, + 1); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update and + * find local absolute value max in next column (Only one pass through + * cache for each next column). This sequence of operations could + * benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + L1ptr, + 1, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + if(thread_rank == 0) { + HPL_dcopy(kk, L1ptr, 1, Mptr(A, ICOFF, jj + 1, lda), 1); + } + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +}
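For orientation, the left-looking ordering used by HPL_pdpanllN above and by HPL_pdpanllT below defers all updates to a column until that column becomes current, then scales its sub-diagonal. A minimal unblocked scalar analogue (unpivoted and illustrative only, not the panel code itself):

    // Minimal unblocked, unpivoted left-looking LU sketch: pending updates
    // are applied to column j only when it becomes current, then the
    // sub-diagonal is scaled, the ordering HPL_pdpanllN realizes panel-wide
    // with HPL_dtrsv / HPL_dgemv_omp / HPL_dscal_omp.
    void left_looking_lu(double* A, int n, int lda) {
      for(int j = 0; j < n; j++) {
        for(int k = 0; k < j; k++)      // deferred updates from columns 0..j-1
          for(int i = k + 1; i < n; i++)
            A[i + j * lda] -= A[i + k * lda] * A[k + j * lda];
        for(int i = j + 1; i < n; i++)  // scale by the diagonal pivot
          A[i + j * lda] /= A[j + j * lda];
      }
    }

diff --git a/src/pfact/HPL_pdpanllT.cpp b/src/pfact/HPL_pdpanllT.cpp new file mode 100644 index 0000000..c11d204 --- /dev/null +++ b/src/pfact/HPL_pdpanllT.cpp @@ -0,0 +1,223 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanllT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanllT factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Left-looking variant of the + * usual one-dimensional algorithm.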
The lower triangular N0-by-N0 upper + * block of the panel is stored in transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column and initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + + L1ptr = Mptr(L1, jj + 1, ICOFF, n0); + kk = jj + 1 - ICOFF; + if(thread_rank == 0) { + HPL_dtrsv(HplColumnMajor, + HplUpper, + HplTrans, + HplUnit, + kk, + Mptr(L1, ICOFF, ICOFF, n0), + n0, + L1ptr, + n0); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update and + * find local absolute value max in next column (Only one pass through + * cache for each next column). This sequence of operations could + * benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + L1ptr, + n0, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + if(thread_rank == 0) { + HPL_dcopy(kk, L1ptr, n0, Mptr(A, ICOFF, jj + 1, lda), 1); + } + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +}
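The right-looking variants below invert that ordering: the pivot column is scaled and the rank-1 update is applied to the trailing submatrix immediately, which is what HPL_pdpanrlN/T do with HPL_dscal_omp, HPL_daxpy_omp and HPL_dger_omp. A scalar analogue of one elimination step (unpivoted, illustrative only):

    // One unpivoted right-looking elimination step: scale the pivot column,
    // then rank-1 update the trailing submatrix (A22 -= l21 * u12^T).
    void right_looking_step(double* A, int n, int lda, int j) {
      for(int i = j + 1; i < n; i++) A[i + j * lda] /= A[j + j * lda];
      for(int k = j + 1; k < n; k++)
        for(int i = j + 1; i < n; i++)
          A[i + k * lda] -= A[i + j * lda] * A[j + k * lda];
    }

diff --git a/src/pfact/HPL_pdpanrlN.cpp b/src/pfact/HPL_pdpanrlN.cpp new file mode 100644 index 0000000..c341e1d --- /dev/null +++ b/src/pfact/HPL_pdpanrlN.cpp @@ -0,0 +1,228 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanrlN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanrlN factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Right-looking variant of the + * usual one-dimensional algorithm.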
The lower triangular N0-by-N0 upper + * block of the panel is stored in no-transpose form (i.e. just like the + * input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Acur, *Anxt; + int Mm1, Nm1, curr, ii, iip1, jj, lda, m = M; +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 >= 1) { + Acur = Mptr(A, iip1, jj, lda); + Anxt = Mptr(Acur, 0, 1, lda); + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update + * trailing sub-matrix and find local absolute value max in next column + * (Only one pass through cache for each current column).
This sequence of + * operations could benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Acur, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_daxpy_omp(Mm1, + -WORK[4 + jj + 1], + Acur, + 1, + Anxt, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + + if(Nm1 > 1) + HPL_dger_omp(HplColumnMajor, + Mm1, + Nm1 - 1, + -HPL_rone, + Acur, + 1, + WORK + 4 + jj + 2, + 1, + Mptr(Anxt, 0, 1, lda), + lda, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#pragma omp barrier + + /* + * Same thing as above but with worse data access on y (A += x * y^T) + * + * if( Nm1 > 1 ) ) + * HPL_dger( HplColumnMajor, Mm1, Nm1-1, -HPL_rone, Acur, 1, + * Mptr( L1, jj, jj+2, n0 ), n0, Mptr( Anxt, 0, 1, lda ), + * lda ); + */ + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdpanrlT.cpp b/src/pfact/HPL_pdpanrlT.cpp new file mode 100644 index 0000000..3052223 --- /dev/null +++ b/src/pfact/HPL_pdpanrlT.cpp @@ -0,0 +1,224 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanrlT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanrlT factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Right-looking variant of the + * usual one-dimensional algorithm. The lower triangular N0-by-N0 upper + * block of the panel is stored in transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. 
On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Acur, *Anxt, *L1; + int Mm1, Nm1, curr, ii, iip1, jj, lda, m = M, n0; +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 >= 1) { + Acur = Mptr(A, iip1, jj, lda); + Anxt = Mptr(Acur, 0, 1, lda); + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update + * trailing sub-matrix and find local absolute value max in next column + * (Only one pass through cache for each current column). This sequence of + * operations could benefit from a specialized blocked implementation.
+ */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Acur, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_daxpy_omp(Mm1, + -(*(Mptr(L1, jj + 1, jj, n0))), + Acur, + 1, + Anxt, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + + if(Nm1 > 1) { + + HPL_dger_omp(HplColumnMajor, + Mm1, + Nm1 - 1, + -HPL_rone, + Acur, + 1, + Mptr(L1, jj + 2, jj, n0), + 1, + Mptr(Anxt, 0, 1, lda), + lda, + PANEL->nb, + iip1, + thread_rank, + thread_size); + } + +#pragma omp barrier + + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdrpancrN.cpp b/src/pfact/HPL_pdrpancrN.cpp new file mode 100644 index 0000000..b5389a9 --- /dev/null +++ b/src/pfact/HPL_pdrpancrN.cpp @@ -0,0 +1,214 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpancrN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpancrN recursively factorizes a panel of columns + * using the recursive Crout variant of the usual one-dimensional + * algorithm. The lower triangular N0-by-N0 upper block of the panel is + * stored in no-transpose form (i.e. just like the input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost.
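+ * + * (Editor's note, added for orientation: each pass of the main loop + * below handles one block of jb columns. In this Crout ordering the + * block is first brought up to date with a GEMM against the columns + * factored so far, then factored recursively, and only then is the + * replicated L1 copy of the trailing columns updated by a GEMM + * followed by a triangular solve.)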
+ * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + +#pragma omp barrier + + /* + * Local update - Factor current panel - Replicated update and solve + */ + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplNoTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, 0, jj, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? 
ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpancrN(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(n > 0) { + + if(thread_rank == 0) { + HPL_dgemm(HplColumnMajor, + HplNoTrans, + HplNoTrans, + jb, + n, + jj, + -HPL_rone, + Mptr(L1ptr, jj, 0, n0), + n0, + Mptr(L1ptr, 0, jj + jb, n0), + n0, + HPL_rone, + Mptr(L1ptr, jj, jj + jb, n0), + n0); + + HPL_dtrsm(HplColumnMajor, + HplLeft, + HplLower, + HplNoTrans, + HplUnit, + jb, + n, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj, jj + jb, n0), + n0); + } + } + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlacpy( + ioff, jb, Mptr(L1, 0, ioff, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpancrT.cpp b/src/pfact/HPL_pdrpancrT.cpp new file mode 100644 index 0000000..7694cb5 --- /dev/null +++ b/src/pfact/HPL_pdrpancrT.cpp @@ -0,0 +1,213 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpancrT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpancrT recursively factorizes a panel of columns using the + * recursive Crout variant of the usual one-dimensional algorithm. + * The lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). 
+ * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + +#pragma omp barrier + + /* + * Local update - Factor current panel - Replicated update and solve + */ + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, jj, 0, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpancrT(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(n > 0) { + if(thread_rank == 0) { + HPL_dgemm(HplColumnMajor, + HplNoTrans, + HplNoTrans, + n, + jb, + jj, + -HPL_rone, + Mptr(L1ptr, jj + jb, 0, n0), + n0, + Mptr(L1ptr, 0, jj, n0), + n0, + HPL_rone, + Mptr(L1ptr, jj + jb, jj, n0), + n0); + + HPL_dtrsm(HplColumnMajor, + HplRight, + HplUpper, + HplNoTrans, + HplUnit, + n, + jb, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj + jb, jj, n0), + n0); + } + } + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlatcpy( + ioff, jb, Mptr(L1, ioff, 0, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanllN.cpp b/src/pfact/HPL_pdrpanllN.cpp new file mode 100644 index 0000000..4ed3a61 --- /dev/null +++ b/src/pfact/HPL_pdrpanllN.cpp @@ -0,0 +1,193 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanllN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanllN recursively factorizes a panel of columns using the + * recursive Left-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * no-transpose form (i.e. just like the input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. 
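+ * + * (Editor's example with hypothetical values: for N = 100, NBMIN = 16 + * and NDIV = 2, nblocks = ( 100 + 15 ) / 16 = 7 in integer division, + * hence jb = ( ( 7 + 1 ) / 2 ) * 16 = 64, which indeed satisfies + * 16 <= 64 < 100.)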
+ */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Replicated solve - Local update - Factor current panel + */ + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplLeft, + HplLower, + HplNoTrans, + HplUnit, + jj, + jb, + HPL_rone, + L1ptr, + n0, + Mptr(L1ptr, 0, jj, n0), + n0); + } + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplNoTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, 0, jj, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpanllN(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlacpy( + ioff, jb, Mptr(L1, 0, ioff, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanllT.cpp b/src/pfact/HPL_pdrpanllT.cpp new file mode 100644 index 0000000..df92e4a --- /dev/null +++ b/src/pfact/HPL_pdrpanllT.cpp @@ -0,0 +1,193 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanllT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanllT recursively factorizes a panel of columns using the + * recursive Left-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. 
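+ * (Editor's rough count with a hypothetical P = 8: a swap followed by + * a separate broadcast would cost on the order of + * N0 * ( 1 + log_2(8) ) message latencies per panel, whereas the + * combined exchange costs about N0 * log_2(8), i.e. roughly N0 fewer + * latencies.)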
On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. 
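+ * + * (Editor's note: replicating the jj-by-jb triangular solve on L1 in + * every process row trades a small amount of redundant Level 3 work + * for the extra broadcast of the solved block that a non-replicated + * scheme would require at each step.)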
+ */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Replicated solve - Local update - Factor current panel + */ + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplRight, + HplUpper, + HplNoTrans, + HplUnit, + jb, + jj, + HPL_rone, + L1ptr, + n0, + Mptr(L1ptr, jj, 0, n0), + n0); + } + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, jj, 0, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpanllT(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlatcpy( + ioff, jb, Mptr(L1, ioff, 0, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanrlN.cpp b/src/pfact/HPL_pdrpanrlN.cpp new file mode 100644 index 0000000..33524b7 --- /dev/null +++ b/src/pfact/HPL_pdrpanrlN.cpp @@ -0,0 +1,198 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanrlN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanrlN recursively factorizes a panel of columns using the + * recursive Right-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * no-transpose form (i.e. just like the input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). 
+ * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Factor current panel - Replicated solve - Local update + */ + HPL_pdrpanrlN(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplLeft, + HplLower, + HplNoTrans, + HplUnit, + jb, + n, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj, jj + jb, n0), + n0); + } + if(curr != 0) { + ii += jb; + m -= jb; + } + +#pragma omp barrier + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplNoTrans, + m, + n, + jb, + -HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + Mptr(L1ptr, jj, jj + jb, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj + jb, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlacpy( + ioff, jb, Mptr(L1, 0, ioff, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanrlT.cpp b/src/pfact/HPL_pdrpanrlT.cpp new file mode 100644 index 0000000..8a93391 --- /dev/null +++ b/src/pfact/HPL_pdrpanrlT.cpp @@ -0,0 +1,198 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanrlT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanrlT recursively factorizes a panel of columns using the + * recursive Right-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. 
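+ * + * (Editor's illustration of the resulting recursion, with hypothetical + * choices N0 = 512, NBMIN = 64 and NDIV = 2: the top level uses + * jb = 256, each 256-wide block recurses with jb = 128, then jb = 64, + * at which point N <= NBMIN and the selected base-case factorization + * routine PANEL->algo->pffun is invoked.)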
+ */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Factor current panel - Replicated solve - Local update + */ + HPL_pdrpanrlT(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplRight, + HplUpper, + HplNoTrans, + HplUnit, + n, + jb, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj + jb, jj, n0), + n0); + } + if(curr != 0) { + ii += jb; + m -= jb; + } + +#pragma omp barrier + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplTrans, + m, + n, + jb, + -HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + Mptr(L1ptr, jj + jb, jj, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj + jb, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlatcpy( + ioff, jb, Mptr(L1, ioff, 0, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pgesv/HPL_pdgesv.cpp b/src/pgesv/HPL_pdgesv.cpp new file mode 100644 index 0000000..ef0b0cd --- /dev/null +++ b/src/pgesv/HPL_pdgesv.cpp @@ -0,0 +1,409 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdgesv(HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_pmat* A) { + /* + * Purpose + * ======= + * + * HPL_pdgesv factors a N+1-by-N matrix using LU factorization with row + * partial pivoting. The main algorithm is the "right looking" variant + * with or without look-ahead. The lower triangular factor is left + * unpivoted and the pivots are not returned. The right hand side is the + * N+1 column of the coefficient matrix. + * + * Arguments + * ========= + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * ALGO (global input) HPL_T_palg * + * On entry, ALGO points to the data structure containing the + * algorithmic parameters. + * + * A (local input/output) HPL_T_pmat * + * On entry, A points to the data structure containing the local + * array information. 
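+ * + * (Editor's overview, added for readability: with the lookahead depth + * hardcoded to 1 below, each iteration of the main loop pipelines the + * work roughly as follows - finish the lookahead update of the next + * panel's columns, copy that panel to the host and factor it with + * HPL_pdfact while the GPU continues the HPL_UPD_1 and HPL_UPD_2 + * trailing updates, then broadcast the factored panel and begin its + * row swaps.)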
+ * + * --------------------------------------------------------------------- + */ + + if(A->n <= 0) return; + + A->info = 0; + + HPL_T_panel * p, **panel = NULL; + HPL_T_UPD_FUN HPL_pdupdate; + int N, icurcol = 0, j, jb, jj = 0, jstart, k, mycol, n, nb, nn, npcol, nq, + tag = MSGID_BEGIN_FACT, test; +#ifdef HPL_PROGRESS_REPORT + double start_time, time, step_time, gflops, step_gflops; +#endif + + // depth = ALGO->depth; + const int depth = 1; // NC: Hardcoded now + + mycol = GRID->mycol; + npcol = GRID->npcol; + HPL_pdupdate = ALGO->upfun; + N = A->n; + nb = A->nb; + + if(N <= 0) return; + +#ifdef HPL_PROGRESS_REPORT + start_time = HPL_ptimer_walltime(); +#endif + + /* + * Allocate a panel list of length depth + 1 (depth >= 1) + */ + panel = (HPL_T_panel**)malloc((size_t)(depth + 1) * sizeof(HPL_T_panel*)); + if(panel == NULL) { + HPL_pabort(__LINE__, "HPL_pdgesvK2", "Memory allocation failed"); + } + /* + * Create and initialize the first panel + */ + nq = HPL_numroc(N + 1, nb, nb, mycol, 0, npcol); + nn = N; + jstart = 0; + + jb = Mmin(nn, nb); + HPL_pdpanel_new( + GRID, ALGO, nn, nn + 1, jb, A, jstart, jstart, tag, &panel[0]); + nn -= jb; + jstart += jb; + if(mycol == icurcol) { + jj += jb; + nq -= jb; + } + icurcol = MModAdd1(icurcol, npcol); + tag = MNxtMgid(tag, MSGID_BEGIN_FACT, MSGID_END_FACT); + + /* + * Create second panel + */ + HPL_pdpanel_new( + GRID, ALGO, nn, nn + 1, Mmin(nn, nb), A, jstart, jstart, tag, &panel[1]); + tag = MNxtMgid(tag, MSGID_BEGIN_FACT, MSGID_END_FACT); + + /* + * Initialize the lookahead - Factor jstart columns: panel[0] + */ + jb = jstart; + jb = Mmin(jb, nb); + /* + * Factor and broadcast 0-th panel + */ + HPL_pdpanel_SendToHost(panel[0]); + HPL_pdpanel_Wait(panel[0]); + + HPL_pdfact(panel[0]); + + // send the panel back to device before bcast + HPL_pdpanel_SendToDevice(panel[0]); + HPL_pdpanel_Wait(panel[0]); + + HPL_pdpanel_bcast(panel[0]); + + // start Ubcast+row swapping for second part of A + HPL_pdlaswp_start(panel[0], HPL_UPD_2); + + if(mycol == icurcol) { + // start Ubcast+row swapping for look ahead + HPL_pdlaswp_start(panel[0], HPL_LOOK_AHEAD); + } + + // start Ubcast+row swapping for first part of A + HPL_pdlaswp_start(panel[0], HPL_UPD_1); + + // Ubcast+row swaps for second part of A + HPL_pdlaswp_exchange(panel[0], HPL_UPD_2); + + if(mycol == icurcol) { + // Ubcast+row swaps for look ahead + // nn = HPL_numrocI(jb, j, nb, nb, mycol, 0, npcol); + HPL_pdlaswp_exchange(panel[0], HPL_LOOK_AHEAD); + } + + double stepStart, stepEnd; + +#ifdef HPL_PROGRESS_REPORT +#ifdef HPL_DETAILED_TIMING + float smallDgemmTime, largeDgemm1Time, largeDgemm2Time; + double smallDgemmGflops, largeDgemm1Gflops, largeDgemm2Gflops; + + if(GRID->myrow == 0 && mycol == 0) { + printf("-------------------------------------------------------------------" + "-------------------------------------------------------------------" + "---------------------------------------------\n"); + printf(" %% | Column | Step Time (s) || DGEMM GFLOPS " + " || Panel Copy(s) | pdfact (s) | pmxswp (s) | Lbcast (s) | laswp " + "(s) | GPU Sync (s) | Step GFLOPS | Overall GFLOPS\n"); + printf(" | | | Small | First | Second " + " | | | | | " + " | | | \n"); + printf("-------------------------------------------------------------------" + "-------------------------------------------------------------------" + "---------------------------------------------\n"); + } +#else + if(GRID->myrow == 0 && mycol == 0) { + printf("---------------------------------------------------\n"); + printf(" %% | Column | 
Step Time (s) | Overall GFLOPS\n"); + printf(" | | | \n"); + printf("---------------------------------------------------\n"); + } +#endif +#endif + + /* + * Main loop over the remaining columns of A + */ + for(j = jstart; j < N; j += nb) { + HPL_ptimer_stepReset(HPL_TIMING_N, HPL_TIMING_BEG); + + stepStart = MPI_Wtime(); + n = N - j; + jb = Mmin(n, nb); + /* + * Initialize current panel - Finish latest update, Factor and broadcast + * current panel + */ + (void)HPL_pdpanel_free(panel[1]); + HPL_pdpanel_init(GRID, ALGO, n, n + 1, jb, A, j, j, tag, panel[1]); + + if(mycol == icurcol) { + /* update look ahead */ + HPL_pdlaswp_end(panel[0], HPL_LOOK_AHEAD); + HPL_pdupdate(panel[0], HPL_LOOK_AHEAD); + + // when the look ahead update is finished, copy back the current panel + hipStreamWaitEvent(dataStream, update[HPL_LOOK_AHEAD], 0); + HPL_pdpanel_SendToHost(panel[1]); + + /* Queue up finishing the second section */ + HPL_pdlaswp_end(panel[0], HPL_UPD_2); + HPL_pdupdate(panel[0], HPL_UPD_2); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); + hipEventSynchronize(update[HPL_LOOK_AHEAD]); + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + + // wait for the panel to arrive + HPL_pdpanel_Wait(panel[0]); + +#ifdef HPL_PROGRESS_REPORT +#ifdef HPL_DETAILED_TIMING + const int curr = (panel[0]->grid->myrow == panel[0]->prow ? 1 : 0); + const int mp = panel[0]->mp - (curr != 0 ? jb : 0); + + // compute the GFLOPs of the look ahead update DGEMM + hipEventElapsedTime(&smallDgemmTime, + dgemmStart[HPL_LOOK_AHEAD], + dgemmStop[HPL_LOOK_AHEAD]); + smallDgemmGflops = + (2.0 * mp * jb * jb) / (1000.0 * 1000.0 * smallDgemmTime); +#endif +#endif + + /*Panel factorization FLOP count is (2/3)NB^3 - (1/2)NB^2 - (1/6)NB + + * (N-i*NB)(NB^2-NB)*/ + HPL_pdfact(panel[1]); /* factor current panel */ + + // send the panel back to device before bcast + HPL_pdpanel_SendToDevice(panel[1]); + HPL_pdpanel_Wait(panel[0]); + } else { + /* Queue up finishing the second section */ + HPL_pdlaswp_end(panel[0], HPL_UPD_2); + HPL_pdupdate(panel[0], HPL_UPD_2); + } + + /* broadcast current panel */ + HPL_pdpanel_bcast(panel[1]); + + // start Ubcast+row swapping for second part of A + HPL_pdlaswp_start(panel[1], HPL_UPD_2); + + // while the second section is updating, exchange the rows from the first + // section + HPL_pdlaswp_exchange(panel[0], HPL_UPD_1); + + /* Queue up finishing the first section */ + HPL_pdlaswp_end(panel[0], HPL_UPD_1); + HPL_pdupdate(panel[0], HPL_UPD_1); + + if(mycol == icurcol) { + jj += jb; + nq -= jb; + } + icurcol = MModAdd1(icurcol, npcol); + tag = MNxtMgid(tag, MSGID_BEGIN_FACT, MSGID_END_FACT); + + if(mycol == icurcol) { + // prep the row swaps for the next look ahead + // nn = HPL_numrocI(jb, j+nb, nb, nb, mycol, 0, npcol); + HPL_pdlaswp_start(panel[1], HPL_LOOK_AHEAD); + + // start Ubcast+row swapping for first part of A + HPL_pdlaswp_start(panel[1], HPL_UPD_1); + + HPL_pdlaswp_exchange(panel[1], HPL_UPD_2); + + HPL_pdlaswp_exchange(panel[1], HPL_LOOK_AHEAD); + } else { + // start Ubcast+row swapping for first part of A + HPL_pdlaswp_start(panel[1], HPL_UPD_1); + + HPL_pdlaswp_exchange(panel[1], HPL_UPD_2); + } + + // wait here for the updates to compete +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + hipDeviceSynchronize(); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + + stepEnd = MPI_Wtime(); + +#ifdef HPL_PROGRESS_REPORT +#ifdef HPL_DETAILED_TIMING + const int curr = (panel[0]->grid->myrow == panel[0]->prow ? 
1 : 0); + const int mp = panel[0]->mp - (curr != 0 ? jb : 0); + + largeDgemm1Time = 0.0; + largeDgemm2Time = 0.0; + if(panel[0]->nu1) { + hipEventElapsedTime( + &largeDgemm1Time, dgemmStart[HPL_UPD_1], dgemmStop[HPL_UPD_1]); + largeDgemm1Gflops = (2.0 * mp * jb * (panel[0]->nu1)) / + (1000.0 * 1000.0 * (largeDgemm1Time)); + } + if(panel[0]->nu2) { + hipEventElapsedTime( + &largeDgemm2Time, dgemmStart[HPL_UPD_2], dgemmStop[HPL_UPD_2]); + largeDgemm2Gflops = (2.0 * mp * jb * (panel[0]->nu2)) / + (1000.0 * 1000.0 * (largeDgemm2Time)); + } +#endif + /* if this is process 0,0 and not the first panel */ + if(GRID->myrow == 0 && mycol == 0 && j > 0) { + time = HPL_ptimer_walltime() - start_time; + step_time = stepEnd - stepStart; + /* + Step FLOP count is (2/3)NB^3 - (1/2)NB^2 - (1/6)NB + + 2*n*NB^2 - n*NB + 2*NB*n^2 + + Overall FLOP count is (2/3)(N^3-n^3) - (1/2)(N^2-n^2) - (1/6)(N-n) + */ + step_gflops = + ((2.0 / 3.0) * jb * jb * jb - (1.0 / 2.0) * jb * jb - + (1.0 / 6.0) * jb + 2.0 * n * jb * jb - jb * n + 2.0 * jb * n * n) / + (step_time > 0.0 ? step_time : 1.e-6) / 1.e9; + gflops = ((2.0 / 3.0) * (N * (double)N * N - n * (double)n * n) - + (1.0 / 2.0) * (N * (double)N - n * (double)n) - + (1.0 / 6.0) * ((double)N - (double)n)) / + (time > 0.0 ? time : 1.e-6) / 1.e9; + printf("%5.1f%% | %09d | ", j * 100.0 / N, j); + printf(" %9.7f |", stepEnd - stepStart); + +#ifdef HPL_DETAILED_TIMING + if(panel[0]->nu0) { + printf(" %9.3e|", smallDgemmGflops); + } else { + printf(" |"); + } + if(panel[0]->nu2) { + printf(" %9.3e|", largeDgemm2Gflops); + } else { + printf(" |"); + } + + if(panel[0]->nu1) { + printf(" %9.3e|", largeDgemm1Gflops); + } else { + printf(" |"); + } + + if(panel[0]->nu0) { + printf(" %9.3e | %9.3e | %9.3e |", + HPL_ptimer_getStep(HPL_TIMING_COPY), + HPL_ptimer_getStep(HPL_TIMING_RPFACT), + HPL_ptimer_getStep(HPL_TIMING_MXSWP)); + } else { + printf(" | | |"); + } + + printf(" %9.3e | %9.3e | %9.3e |", + HPL_ptimer_getStep(HPL_TIMING_LBCAST), + HPL_ptimer_getStep(HPL_TIMING_LASWP), + HPL_ptimer_getStep(HPL_TIMING_UPDATE)); + + printf(" %9.3e |", step_gflops); +#endif + + printf(" %9.3e \n", gflops); + } +#endif + + /* + * Circular of the panel pointers: + * xtmp = x[0]; for( k=0; k < 1; k++ ) x[k] = x[k+1]; x[d] = xtmp; + * + * Go to next process row and column - update the message ids for broadcast + */ + p = panel[0]; + panel[0] = panel[1]; + panel[1] = p; + } + /* + * Clean-up: Finish updates - release panels and panel list + */ + // nn = HPL_numrocI(1, N, nb, nb, mycol, 0, npcol); + HPL_pdlaswp_end(panel[0], HPL_LOOK_AHEAD); + HPL_pdupdate(panel[0], HPL_LOOK_AHEAD); + + HPL_pdlaswp_end(panel[0], HPL_UPD_2); + HPL_pdupdate(panel[0], HPL_UPD_2); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + hipDeviceSynchronize(); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + + HPL_pdpanel_disp(&panel[0]); + HPL_pdpanel_disp(&panel[1]); + if(panel) free(panel); + + /* + * Solve upper triangular system + */ + if(A->info == 0) HPL_pdtrsv(GRID, A); +} diff --git a/src/pgesv/HPL_pdlaswp.cpp b/src/pgesv/HPL_pdlaswp.cpp new file mode 100644 index 0000000..f35113b --- /dev/null +++ b/src/pgesv/HPL_pdlaswp.cpp @@ -0,0 +1,532 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdlaswp_start(HPL_T_panel* PANEL, const HPL_T_UPD UPD) { + /* + * Purpose + * ======= + * + * HPL_pdlaswp_start begins the NB row interchanges to NN columns of the + * trailing submatrix and broadcast a column panel. The rows needed for + * the row interchanges are packed into U (in the current row) and W + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + double *U, *W; + double *dA, *dU, *dW; + int * ipID, *iplen, *ipcounts, *ipoffsets, *iwork, + *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU; + int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex; + int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; + + /* .. + * .. Executable Statements .. + */ + n = PANEL->n; + jb = PANEL->jb; + + /* + * Retrieve parameters from the PANEL data structure + */ + nprow = PANEL->grid->nprow; + myrow = PANEL->grid->myrow; + iflag = PANEL->IWORK; + + MPI_Comm comm = PANEL->grid->col_comm; + + // quick return if we're 1xQ + if(nprow == 1) return; + + dA = PANEL->dA; + lda = PANEL->dlda; + icurrow = PANEL->prow; + + if(UPD == HPL_LOOK_AHEAD) { + U = PANEL->U; + W = PANEL->W; + dU = PANEL->dU; + dW = PANEL->dW; + LDU = PANEL->ldu0; + LDW = PANEL->ldu0; + n = PANEL->nu0; + + } else if(UPD == HPL_UPD_1) { + U = PANEL->U1; + W = PANEL->W1; + dU = PANEL->dU1; + dW = PANEL->dW1; + LDU = PANEL->ldu1; + LDW = PANEL->ldu1; + n = PANEL->nu1; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0, lda); + + } else if(UPD == HPL_UPD_2) { + U = PANEL->U2; + W = PANEL->W2; + dU = PANEL->dU2; + dW = PANEL->dW2; + LDU = PANEL->ldu2; + LDW = PANEL->ldu2; + n = PANEL->nu2; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, lda); + } + + /* + * Quick return if there is nothing to do + */ + if((n <= 0) || (jb <= 0)) return; + + /* + * Compute ipID (if not already done for this panel). lindxA and lindxAU + * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 + * are of size nprow, permU is of length jb, and this function needs a + * workspace of size max( 2 * jb (plindx1), nprow+1(equil)): + * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1) + * i.e. 
4 + 9*jb + 3*nprow + max(2*jb, nprow+1); + */ + k = (int)((unsigned int)(jb) << 1); + ipl = iflag + 1; + ipID = ipl + 1; + ipA = ipID + ((unsigned int)(k) << 1); + iplen = ipA + 1; + ipcounts = iplen + nprow + 1; + ipoffsets = ipcounts + nprow; + iwork = ipoffsets + nprow; + + lindxU = PANEL->lindxU; + lindxA = PANEL->lindxA; + lindxAU = PANEL->lindxAU; + permU = PANEL->permU; + + dlindxU = PANEL->dlindxU; + dlindxA = PANEL->dlindxA; + dlindxAU = PANEL->dlindxAU; + dpermU = PANEL->dpermU; + dpermU_ex = dpermU + jb; + + if(*iflag == -1) /* no index arrays have been computed so far */ + { + // get the ipivs on the host after the Bcast + if(PANEL->grid->mycol != PANEL->pcol) { + hipMemcpy2DAsync(PANEL->ipiv, + PANEL->jb * sizeof(int), + PANEL->dipiv, + PANEL->jb * sizeof(int), + PANEL->jb * sizeof(int), + 1, + hipMemcpyDeviceToHost, + dataStream); + } + hipStreamSynchronize(dataStream); + + // compute spreading info + HPL_pipid(PANEL, ipl, ipID); + HPL_plindx( + PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork); + *iflag = 1; + } + + /* + * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti- + * mately goes to U( :, lindxAU[i] ). In each rank, we directly pack + * into U, otherwise we pack into workspace. The first + * entry of each column packed in workspace is in fact the row or column + * offset in U where it should go to. + */ + if(myrow == icurrow) { + // copy needed rows of A into U + HPL_dlaswp01T(jb, n, dA, lda, dU, LDU, dlindxU); + } else { + // copy needed rows from A into U(:, iplen[myrow]) + HPL_dlaswp03T(iplen[myrow + 1] - iplen[myrow], + n, + dA, + lda, + Mptr(dU, 0, iplen[myrow], LDU), + LDU, + dlindxU); + } + + // record when packing completes + hipEventRecord(swapStartEvent[UPD], computeStream); + + /* + * End of HPL_pdlaswp_start + */ +} + +void HPL_pdlaswp_exchange(HPL_T_panel* PANEL, const HPL_T_UPD UPD) { + /* + * Purpose + * ======= + * + * HPL_pdlaswp_exchange applies the NB row interchanges to NN columns of + * the trailing submatrix and broadcast a column panel. + * + * A "Spread then roll" algorithm performs the swap :: broadcast of the + * row panel U at once, resulting in a minimal communication volume and + * a "very good" use of the connectivity if available. With P process + * rows and assuming bi-directional links, the running time of this + * function can be approximated by: + * + * (log_2(P)+(P-1)) * lat + K * NB * LocQ(N) / bdwth + * + * where NB is the number of rows of the row panel U, N is the global + * number of columns being updated, lat and bdwth are the latency and + * bandwidth of the network for double precision real words. K is + * a constant in (2,3] that depends on the achieved bandwidth during a + * simultaneous message exchange between two processes. An empirical + * optimistic value of K is typically 2.4. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + double *U, *W; + double *dA, *dU, *dW; + int * ipID, *iplen, *ipcounts, *ipoffsets, *iwork, + *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU; + int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex; + int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; + + /* .. + * .. Executable Statements .. 
+ */ + n = PANEL->n; + jb = PANEL->jb; + + /* + * Retrieve parameters from the PANEL data structure + */ + nprow = PANEL->grid->nprow; + myrow = PANEL->grid->myrow; + iflag = PANEL->IWORK; + + MPI_Comm comm = PANEL->grid->col_comm; + + // quick return if we're 1xQ + if(nprow == 1) return; + + dA = PANEL->dA; + lda = PANEL->dlda; + icurrow = PANEL->prow; + + if(UPD == HPL_LOOK_AHEAD) { + U = PANEL->U; + W = PANEL->W; + dU = PANEL->dU; + dW = PANEL->dW; + LDU = PANEL->ldu0; + LDW = PANEL->ldu0; + n = PANEL->nu0; + + } else if(UPD == HPL_UPD_1) { + U = PANEL->U1; + W = PANEL->W1; + dU = PANEL->dU1; + dW = PANEL->dW1; + LDU = PANEL->ldu1; + LDW = PANEL->ldu1; + n = PANEL->nu1; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0, lda); + + } else if(UPD == HPL_UPD_2) { + U = PANEL->U2; + W = PANEL->W2; + dU = PANEL->dU2; + dW = PANEL->dW2; + LDU = PANEL->ldu2; + LDW = PANEL->ldu2; + n = PANEL->nu2; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, lda); + } + + /* + * Quick return if there is nothing to do + */ + if((n <= 0) || (jb <= 0)) return; + + /* + * Compute ipID (if not already done for this panel). lindxA and lindxAU + * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 + * are of size nprow, permU is of length jb, and this function needs a + * workspace of size max( 2 * jb (plindx1), nprow+1(equil)): + * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1) + * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1); + */ + k = (int)((unsigned int)(jb) << 1); + ipl = iflag + 1; + ipID = ipl + 1; + ipA = ipID + ((unsigned int)(k) << 1); + iplen = ipA + 1; + ipcounts = iplen + nprow + 1; + ipoffsets = ipcounts + nprow; + iwork = ipoffsets + nprow; + + lindxA = PANEL->lindxA; + lindxAU = PANEL->lindxAU; + lindxU = PANEL->lindxU; + permU = PANEL->permU; + + dlindxA = PANEL->dlindxA; + dlindxAU = PANEL->dlindxAU; + dlindxU = PANEL->dlindxU; + dpermU = PANEL->dpermU; + dpermU_ex = dpermU + jb; + + /* Set MPI message counts and offsets */ + ipcounts[0] = (iplen[1] - iplen[0]) * LDU; + ipoffsets[0] = 0; + + for(int i = 1; i < nprow; ++i) { + ipcounts[i] = (iplen[i + 1] - iplen[i]) * LDU; + ipoffsets[i] = ipcounts[i - 1] + ipoffsets[i - 1]; + } + ipoffsets[nprow] = ipcounts[nprow - 1] + ipoffsets[nprow - 1]; + + /* + * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti- + * mately goes to U( :, lindxAU[i] ). In each rank, we directly pack + * into U, otherwise we pack into workspace. The first + * entry of each column packed in workspace is in fact the row or column + * offset in U where it should go to. 
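+ * + * (Editor's sketch of the exchange below, with hypothetical sizes: for + * nprow = 4 and iplen = { 0, 2, 5, 8, 12 }, the root row icurrow + * scatters ( iplen[i+1] - iplen[i] ) * LDU doubles to each process + * row i, and the subsequent all-gather rolls every row's piece back + * so that all four rows end up holding the complete row panel U.)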
+   */
+
+  if(myrow == icurrow) {
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+#endif
+
+    // hipStreamSynchronize(computeStream);
+    hipEventSynchronize(swapStartEvent[UPD]);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+
+    // send rows to other ranks
+    HPL_scatterv(dU, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
+
+    // All gather dU
+    HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+
+  } else {
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+#endif
+
+    // wait for dU to be ready
+    // hipStreamSynchronize(computeStream);
+    hipEventSynchronize(swapStartEvent[UPD]);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+
+    // receive rows from icurrow into dW
+    HPL_scatterv(dW, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
+
+    // All gather dU
+    HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+  }
+  /*
+   * End of HPL_pdlaswp_exchange
+   */
+}
+
+void HPL_pdlaswp_end(HPL_T_panel* PANEL, const HPL_T_UPD UPD) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdlaswp_end copies scattered rows of A into an array U. The
+   * row offsets in A of the source rows are specified by LINDXA. The
+   * destinations of those rows are specified by LINDXAU. A positive
+   * value of LINDXAU indicates that the destination array is U, and A
+   * otherwise. Rows of A are stored as columns in U.
+   *
+   * Arguments
+   * =========
+   *
+   * PANEL   (local input/output)          HPL_T_panel *
+   *         On entry, PANEL points to the data structure containing the
+   *         panel information.
+   *
+   * ---------------------------------------------------------------------
+   */
+  /*
+   * .. Local Variables ..
+   */
+  double *U, *W;
+  double *dA, *dU, *dW;
+  int *ipID, *iplen, *ipcounts, *ipoffsets, *iwork, *lindxA = NULL, *lindxAU,
+      *permU;
+  int *dlindxA = NULL, *dlindxAU, *dlindxU, *dpermU, *dpermU_ex;
+  int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW;
+
+  /* ..
+   * .. Executable Statements ..
+   */
+  n  = PANEL->n;
+  jb = PANEL->jb;
+
+  /*
+   * Retrieve parameters from the PANEL data structure
+   */
+  nprow = PANEL->grid->nprow;
+  myrow = PANEL->grid->myrow;
+  iflag = PANEL->IWORK;
+
+  MPI_Comm comm = PANEL->grid->col_comm;
+
+  dA      = PANEL->dA;
+  lda     = PANEL->dlda;
+  icurrow = PANEL->prow;
+
+  if(UPD == HPL_LOOK_AHEAD) {
+    U   = PANEL->U;
+    W   = PANEL->W;
+    dU  = PANEL->dU;
+    dW  = PANEL->dW;
+    LDU = PANEL->ldu0;
+    LDW = PANEL->ldu0;
+    n   = PANEL->nu0;
+
+  } else if(UPD == HPL_UPD_1) {
+    U   = PANEL->U1;
+    W   = PANEL->W1;
+    dU  = PANEL->dU1;
+    dW  = PANEL->dW1;
+    LDU = PANEL->ldu1;
+    LDW = PANEL->ldu1;
+    n   = PANEL->nu1;
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    dA = Mptr(dA, 0, PANEL->nu0, lda);
+
+  } else if(UPD == HPL_UPD_2) {
+    U   = PANEL->U2;
+    W   = PANEL->W2;
+    dU  = PANEL->dU2;
+    dW  = PANEL->dW2;
+    LDU = PANEL->ldu2;
+    LDW = PANEL->ldu2;
+    n   = PANEL->nu2;
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, lda);
+  }
+
+  /*
+   * Quick return if there is nothing to do
+   */
+  if((n <= 0) || (jb <= 0)) return;
+
+  // just local swaps if we're 1xQ
+  if(nprow == 1) {
+    HPL_dlaswp00N(jb, n, dA, lda, PANEL->dipiv);
+    return;
+  }
+
+  /*
+   * Compute ipID (if not already done for this panel). lindxA and lindxAU
+   * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1
+   * are of size nprow, permU is of length jb, and this function needs a
+   * workspace of size max( 2 * jb (plindx1), nprow+1(equil)):
+   * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1)
+   * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1);
+   */
+  k    = (int)((unsigned int)(jb) << 1);
+  ipl  = iflag + 1;
+  ipID = ipl + 1;
+  ipA  = ipID + ((unsigned int)(k) << 1);
+  iplen = ipA + 1;
+
+  lindxA  = PANEL->lindxA;
+  lindxAU = PANEL->lindxAU;
+  permU   = PANEL->permU;
+
+  dlindxA   = PANEL->dlindxA;
+  dlindxAU  = PANEL->dlindxAU;
+  dlindxU   = PANEL->dlindxU;
+  dpermU    = PANEL->dpermU;
+  dpermU_ex = dpermU + jb;
+
+  /*
+   * For i in [0..2*jb), lindxA[i] is the offset in A of a row that
+   * ultimately goes to U( :, lindxAU[i] ). On the process row owning the
+   * panel (icurrow) we pack directly into U, otherwise we pack into
+   * workspace. The first entry of each column packed in workspace is in
+   * fact the row or column offset in U where it should go.
+   */
+
+  if(myrow == icurrow) {
+    // swap rows local to A on device
+    HPL_dlaswp02T(*ipA, n, dA, lda, dlindxAU, dlindxA);
+  } else {
+    // Queue inserting received rows in W into A on device
+    HPL_dlaswp04T(
+        iplen[myrow + 1] - iplen[myrow], n, dA, lda, dW, LDW, dlindxU);
+  }
+
+  /*
+   * Permute U in every process row
+   */
+  HPL_dlaswp10N(n, jb, dU, LDU, dpermU);
+  /*
+   * End of HPL_pdlaswp_end
+   */
+}
diff --git a/src/pgesv/HPL_pdtrsv_device.cpp b/src/pgesv/HPL_pdtrsv_device.cpp
new file mode 100644
index 0000000..9a01362
--- /dev/null
+++ b/src/pgesv/HPL_pdtrsv_device.cpp
@@ -0,0 +1,352 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define BLOCK_SIZE 512
+__global__ void setZero(const int N, double* __restrict__ X) {
+  const int    t  = threadIdx.x;
+  const int    b  = blockIdx.x;
+  const size_t id = b * BLOCK_SIZE + t; // row id
+
+  if(id < N) { X[id] = 0.0; }
+}
+
+void HPL_pdtrsv(HPL_T_grid* GRID, HPL_T_pmat* AMAT) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdtrsv solves an upper triangular system of linear equations.
+   *
+   * The rhs is the last column of the N by N+1 matrix A. The solve starts
+   * in the process column owning the Nth column of A, so the rhs b may
+   * need to be moved one process column to the left at the beginning. The
+   * routine therefore needs a column vector in every process column but
+   * the one owning b. The result is replicated in all process rows, and
+   * returned in XR, i.e. XR is of size nq = LOCq( N ) in all processes.
+   *
+   * The algorithm uses decreasing one-ring broadcast in process rows and
+   * columns implemented in terms of synchronous communication point to
+   * point primitives. The lookahead of depth 1 is used to minimize the
+   * critical path.
This entire operation is essentially ``latency'' bound + * and an estimate of its running time is given by: + * + * (move rhs) lat + N / ( P bdwth ) + + * (solve) ((N / NB)-1) 2 (lat + NB / bdwth) + + * gam2 N^2 / ( P Q ), + * + * where gam2 is an estimate of the Level 2 BLAS rate of execution. + * There are N / NB diagonal blocks. One must exchange 2 messages of + * length NB to compute the next NB entries of the vector solution, as + * well as performing a total of N^2 floating point operations. + * + * Arguments + * ========= + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * AMAT (local input/output) HPL_T_pmat * + * On entry, AMAT points to the data structure containing the + * local array information. + * + * --------------------------------------------------------------------- + */ + + MPI_Comm Ccomm, Rcomm; + double * Aprev = NULL, *XC = NULL, *XR = NULL, *Xd = NULL, *Xdprev = NULL, + *W = NULL; + double *dA = NULL, *dAprev = NULL, *dAptr, *dXC = NULL, *dXR = NULL, + *dXd = NULL, *dXdprev = NULL, *dW = NULL; + int Alcol, Alrow, Anpprev, Anp, Anq, Bcol, Cmsgid, GridIsNotPx1, GridIsNot1xQ, + Rmsgid, colprev, kb, kbprev, lda, mycol, myrow, n, n1, n1p, + n1pprev = 0, nb, npcol, nprow, rowprev, tmp1, tmp2; +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_PTRSV); +#endif + if((n = AMAT->n) <= 0) return; + + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + Rcomm = GRID->row_comm; + Rmsgid = MSGID_BEGIN_PTRSV; + Ccomm = GRID->col_comm; + Cmsgid = MSGID_BEGIN_PTRSV + 1; + GridIsNot1xQ = (nprow > 1); + GridIsNotPx1 = (npcol > 1); + + nb = AMAT->nb; + lda = AMAT->ld; + + Mnumroc(Anp, n, nb, nb, myrow, 0, nprow); + Mnumroc(Anq, n, nb, nb, mycol, 0, npcol); + + dA = AMAT->dA; + dXR = AMAT->dX; + XR = AMAT->W + 2 * Anp; + + XC = AMAT->W; + dXC = AMAT->dW; + + W = AMAT->W + Anp; + dW = AMAT->dW + Anp; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + /* + * Move the rhs in the process column owning the last column of A. + */ + + tmp1 = (n - 1) / nb; + Alrow = tmp1 - (tmp1 / nprow) * nprow; + Alcol = tmp1 - (tmp1 / npcol) * npcol; + kb = n - tmp1 * nb; + + dAptr = (double*)(dA); + double* dB = Mptr(dAptr, 0, Anq, lda); + + Mindxg2p(n, nb, nb, Bcol, 0, npcol); + + if(Anp > 0) { + if(Alcol != Bcol) { + if(mycol == Bcol) { + hipMemcpy(dXC, dB, Anp * sizeof(double), hipMemcpyDeviceToDevice); + (void)HPL_send(dXC, Anp, Alcol, Rmsgid, Rcomm); + } else if(mycol == Alcol) { + (void)HPL_recv(dXC, Anp, Bcol, Rmsgid, Rcomm); + } + } else { + if(mycol == Bcol) { + hipMemcpy(dXC, dB, Anp * sizeof(double), hipMemcpyDeviceToDevice); + } + } + } + + Rmsgid = (Rmsgid + 2 > MSGID_END_PTRSV ? 
MSGID_BEGIN_PTRSV : Rmsgid + 2);
+  if(mycol != Alcol) {
+    if(Anp) {
+      size_t grid_size = (Anp + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      setZero<<<grid_size, BLOCK_SIZE, 0, stream>>>(Anp, dXC);
+    }
+  }
+  /*
+   * Set up lookahead
+   */
+  n1 = (npcol - 1) * nb;
+  n1 = Mmax(n1, nb);
+
+  Anpprev = Anp;
+  dAprev = dAptr = Mptr(dAptr, 0, Anq, lda);
+  Xdprev  = XR;
+  dXdprev = dXR;
+  tmp1    = n - kb;
+  tmp1 -= (tmp2 = Mmin(tmp1, n1));
+  MnumrocI(n1pprev, tmp2, Mmax(0, tmp1), nb, nb, myrow, 0, nprow);
+
+  if(myrow == Alrow) { Anpprev = (Anp -= kb); }
+  if(mycol == Alcol) {
+    dAprev = (dAptr -= lda * kb);
+    Anq -= kb;
+    Xdprev  = (Xd = XR + Anq);
+    dXdprev = (dXd = dXR + Anq);
+    if(myrow == Alrow) {
+      rocblas_dtrsv(handle,
+                    rocblas_fill_upper,
+                    rocblas_operation_none,
+                    rocblas_diagonal_non_unit,
+                    kb,
+                    dAptr + Anp,
+                    lda,
+                    dXC + Anp,
+                    1);
+      rocblas_dcopy(handle, kb, dXC + Anp, 1, dXd, 1);
+    }
+  }
+
+  rowprev = Alrow;
+  Alrow   = MModSub1(Alrow, nprow);
+  colprev = Alcol;
+  Alcol   = MModSub1(Alcol, npcol);
+  kbprev  = kb;
+  n -= kb;
+  tmp1 = n - (kb = nb);
+  tmp1 -= (tmp2 = Mmin(tmp1, n1));
+  MnumrocI(n1p, tmp2, Mmax(0, tmp1), nb, nb, myrow, 0, nprow);
+  /*
+   * Start the operations
+   */
+  while(n > 0) {
+    if(mycol == Alcol) {
+      dAptr -= lda * kb;
+      Anq -= kb;
+      Xd  = XR + Anq;
+      dXd = dXR + Anq;
+    }
+    if(myrow == Alrow) { Anp -= kb; }
+    /*
+     * Broadcast (decreasing-ring) of previous solution block in previous
+     * process column, compute partial update of current block and send it
+     * to current process column.
+     */
+    if(mycol == colprev) {
+      /*
+       * Send previous solution block in process row above
+       */
+      if(myrow == rowprev) {
+        if(GridIsNot1xQ) {
+          if(kbprev) {
+            hipDeviceSynchronize();
+            (void)HPL_send(
+                dXdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm);
+          }
+        }
+      } else {
+        if(kbprev) {
+          (void)HPL_recv(
+              dXdprev, kbprev, MModAdd1(myrow, nprow), Cmsgid, Ccomm);
+        }
+      }
+      /*
+       * Compute partial update of previous solution block and send it to
+       * current column
+       */
+      if(n1pprev > 0) {
+        tmp1 = Anpprev - n1pprev;
+        const double one  = 1.0;
+        const double mone = -1.0;
+        rocblas_dgemv(handle,
+                      rocblas_operation_none,
+                      n1pprev,
+                      kbprev,
+                      &mone,
+                      dAprev + tmp1,
+                      lda,
+                      dXdprev,
+                      1,
+                      &one,
+                      dXC + tmp1,
+                      1);
+        if(GridIsNotPx1) {
+          if(n1pprev) {
+            hipDeviceSynchronize();
+            (void)HPL_send(dXC + tmp1, n1pprev, Alcol, Rmsgid, Rcomm);
+          }
+        }
+      }
+      /*
+       * Finish the (decreasing-ring) broadcast of the solution block in
+       * previous process column
+       */
+      if((myrow != rowprev) && (myrow != MModAdd1(rowprev, nprow))) {
+        if(kbprev) {
+          hipDeviceSynchronize();
+          (void)HPL_send(
+              dXdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm);
+        }
+      }
+    } else if(mycol == Alcol) {
+      /*
+       * Current column receives and accumulates partial update of previous
+       * solution block
+       */
+      if(n1pprev > 0) {
+        if(n1pprev) {
+          (void)HPL_recv(dW, n1pprev, colprev, Rmsgid, Rcomm);
+          const double one = 1.0;
+          rocblas_daxpy(
+              handle, n1pprev, &one, dW, 1, dXC + Anpprev - n1pprev, 1);
+        }
+      }
+    }
+    /*
+     * Solve current diagonal block
+     */
+    if((mycol == Alcol) && (myrow == Alrow)) {
+      rocblas_dtrsv(handle,
+                    rocblas_fill_upper,
+                    rocblas_operation_none,
+                    rocblas_diagonal_non_unit,
+                    kb,
+                    dAptr + Anp,
+                    lda,
+                    dXC + Anp,
+                    1);
+      rocblas_dcopy(handle, kb, dXC + Anp, 1, dXR + Anq, 1);
+    }
+    /*
+     * Finish previous update
+     */
+    if((mycol == colprev) && ((tmp1 = Anpprev - n1pprev) > 0)) {
+      const double one  = 1.0;
+      const double mone = -1.0;
+      rocblas_dgemv(handle,
+                    rocblas_operation_none,
+                    tmp1,
+                    kbprev,
+                    &mone,
+                    dAprev,
+                    lda,
+                    dXdprev,
+                    1,
+                    &one,
+                    dXC,
+ 1); + } + /* + * Save info of current step and update info for the next step + */ + if(mycol == Alcol) { + dAprev = dAptr; + Xdprev = Xd; + dXdprev = dXd; + } + if(myrow == Alrow) { Anpprev -= kb; } + + rowprev = Alrow; + colprev = Alcol; + n1pprev = n1p; + kbprev = kb; + n -= kb; + Alrow = MModSub1(Alrow, nprow); + Alcol = MModSub1(Alcol, npcol); + tmp1 = n - (kb = nb); + tmp1 -= (tmp2 = Mmin(tmp1, n1)); + MnumrocI(n1p, tmp2, Mmax(0, tmp1), nb, nb, myrow, 0, nprow); + + Rmsgid = (Rmsgid + 2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid + 2); + Cmsgid = + (Cmsgid + 2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV + 1 : Cmsgid + 2); + } + /* + * Replicate last solution block + */ + if(mycol == colprev) { + if(kbprev) { + hipDeviceSynchronize(); + (void)HPL_broadcast((void*)(dXR), kbprev, HPL_DOUBLE, rowprev, Ccomm); + } + } + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_PTRSV); +#endif +} diff --git a/src/pgesv/HPL_pdupdateNT.cpp b/src/pgesv/HPL_pdupdateNT.cpp new file mode 100644 index 0000000..be2e98b --- /dev/null +++ b/src/pgesv/HPL_pdupdateNT.cpp @@ -0,0 +1,169 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdupdateNT(HPL_T_panel* PANEL, const HPL_T_UPD UPD) { + /* + * Purpose + * ======= + * + * HPL_pdupdateNT applies the row interchanges and updates part of the + * trailing (using the panel PANEL) submatrix. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel (to be updated) information. + * + * --------------------------------------------------------------------- + */ + + double *Aptr, *L1ptr, *L2ptr, *Uptr, *dpiv; + int* dipiv; + + int curr, i, jb, lda, ldl2, LDU, mp, n, nb; + + /* .. + * .. Executable Statements .. + */ + nb = PANEL->nb; + jb = PANEL->jb; + n = PANEL->nq; + lda = PANEL->dlda; + Aptr = PANEL->dA; + + if(UPD == HPL_LOOK_AHEAD) { + Uptr = PANEL->dU; + LDU = PANEL->ldu0; + n = Mmin(PANEL->nu0, n); + } else if(UPD == HPL_UPD_1) { + Uptr = PANEL->dU1; + LDU = PANEL->ldu1; + n = Mmin(PANEL->nu1, n); + // we call the row swap start before the first section is updated + // so shift the pointers + Aptr = Mptr(Aptr, 0, PANEL->nu0, lda); + } else if(UPD == HPL_UPD_2) { + Uptr = PANEL->dU2; + LDU = PANEL->ldu2; + n = Mmin(PANEL->nu2, n); + // we call the row swap start before the first section is updated + // so shift the pointers + Aptr = Mptr(Aptr, 0, PANEL->nu0 + PANEL->nu1, lda); + } + + /* + * There is nothing to update, enforce the panel broadcast. + */ + if((n <= 0) || (jb <= 0)) { return; } + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + curr = (PANEL->grid->myrow == PANEL->prow ? 1 : 0); + L2ptr = PANEL->dL2; + L1ptr = PANEL->dL1; + ldl2 = PANEL->dldl2; + mp = PANEL->mp - (curr != 0 ? 
jb : 0);
+
+  const double one  = 1.0;
+  const double mone = -1.0;
+
+  /*
+   * Update
+   */
+  if(PANEL->grid->nprow == 1) {
+    /*
+     * 1 x Q case
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_left,
+                  rocblas_fill_lower,
+                  rocblas_operation_none,
+                  rocblas_diagonal_unit,
+                  jb,
+                  n,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Aptr,
+                  lda);
+
+    HPL_dlatcpy_gpu(n, jb, Aptr, lda, Uptr, LDU);
+  } else {
+    /*
+     * Compute redundantly row block of U and update trailing submatrix
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_right,
+                  rocblas_fill_lower,
+                  rocblas_operation_transpose,
+                  rocblas_diagonal_unit,
+                  n,
+                  jb,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Uptr,
+                  LDU);
+  }
+
+  /*
+   * Queue finishing the update
+   */
+  if(curr != 0) {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Mptr(Aptr, jb, 0, lda),
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+
+    if(PANEL->grid->nprow > 1) HPL_dlatcpy_gpu(jb, n, Uptr, LDU, Aptr, lda);
+  } else {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Aptr,
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+  }
+
+  hipEventRecord(update[UPD], stream);
+}
diff --git a/src/pgesv/HPL_pdupdateTT.cpp b/src/pgesv/HPL_pdupdateTT.cpp
new file mode 100644
index 0000000..bac3101
--- /dev/null
+++ b/src/pgesv/HPL_pdupdateTT.cpp
@@ -0,0 +1,168 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_pdupdateTT(HPL_T_panel* PANEL, const HPL_T_UPD UPD) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdupdateTT applies the row interchanges and updates part of the
+   * trailing (using the panel PANEL) submatrix.
+   *
+   * Arguments
+   * =========
+   *
+   * PANEL   (local input/output)          HPL_T_panel *
+   *         On entry, PANEL points to the data structure containing the
+   *         panel (to be updated) information.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  double *Aptr, *L1ptr, *L2ptr, *Uptr, *dpiv;
+  int*    dipiv;
+
+  int curr, i, iroff, jb, lda, ldl2, LDU, mp, n, nb;
+
+  /* ..
+   * .. Executable Statements ..
+   */
+  nb   = PANEL->nb;
+  jb   = PANEL->jb;
+  n    = PANEL->nq;
+  lda  = PANEL->dlda;
+  Aptr = PANEL->dA;
+
+  if(UPD == HPL_LOOK_AHEAD) {
+    Uptr = PANEL->dU;
+    LDU  = PANEL->ldu0;
+    n    = Mmin(PANEL->nu0, n);
+  } else if(UPD == HPL_UPD_1) {
+    Uptr = PANEL->dU1;
+    LDU  = PANEL->ldu1;
+    n    = Mmin(PANEL->nu1, n);
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    Aptr = Mptr(Aptr, 0, PANEL->nu0, lda);
+  } else if(UPD == HPL_UPD_2) {
+    Uptr = PANEL->dU2;
+    LDU  = PANEL->ldu2;
+    n    = Mmin(PANEL->nu2, n);
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    Aptr = Mptr(Aptr, 0, PANEL->nu0 + PANEL->nu1, lda);
+  }
+
+  /*
+   * There is nothing to update, enforce the panel broadcast.
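+   *
+   * (For orientation: the local columns are processed in up to three
+   * sections of widths nu0 (look-ahead), nu1, and nu2, so a hypothetical
+   * local panel of nq = 512 columns could be split as nu0 = 128,
+   * nu1 = 256, nu2 = 128; the Aptr shifts above skip the sections covered
+   * by the other calls. The numbers are illustrative only.)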
+   */
+  if((n <= 0) || (jb <= 0)) { return; }
+
+  hipStream_t stream;
+  rocblas_get_stream(handle, &stream);
+
+  curr  = (PANEL->grid->myrow == PANEL->prow ? 1 : 0);
+  L2ptr = PANEL->dL2;
+  L1ptr = PANEL->dL1;
+  ldl2  = PANEL->dldl2;
+  mp    = PANEL->mp - (curr != 0 ? jb : 0);
+
+  const double one  = 1.0;
+  const double mone = -1.0;
+
+  /*
+   * Update
+   */
+  if(PANEL->grid->nprow == 1) {
+    /*
+     * 1 x Q case
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_left,
+                  rocblas_fill_upper,
+                  rocblas_operation_transpose,
+                  rocblas_diagonal_unit,
+                  jb,
+                  n,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Aptr,
+                  lda);
+    HPL_dlatcpy_gpu(n, jb, Aptr, lda, Uptr, LDU);
+  } else {
+    /*
+     * Compute redundantly row block of U and update trailing submatrix
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_right,
+                  rocblas_fill_upper,
+                  rocblas_operation_none,
+                  rocblas_diagonal_unit,
+                  n,
+                  jb,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Uptr,
+                  LDU);
+  }
+
+  /*
+   * Queue finishing the update
+   */
+  if(curr != 0) {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Mptr(Aptr, jb, 0, lda),
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+
+    if(PANEL->grid->nprow > 1) HPL_dlatcpy_gpu(jb, n, Uptr, LDU, Aptr, lda);
+  } else {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Aptr,
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+  }
+
+  hipEventRecord(update[UPD], stream);
+}
diff --git a/src/pgesv/HPL_perm.cpp b/src/pgesv/HPL_perm.cpp
new file mode 100644
index 0000000..3d2fab2
--- /dev/null
+++ b/src/pgesv/HPL_perm.cpp
@@ -0,0 +1,89 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_perm(const int N, int* LINDXA, int* LINDXAU, int* IWORK) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_perm combines two index arrays and generates the corresponding
+   * permutation. First, this function computes the inverse of LINDXA, and
+   * then combines it with LINDXAU. Second, in order to be able to perform
+   * the permutation in place, LINDXAU is overwritten by the sequence of
+   * permutations producing the same result. What we ultimately want to
+   * achieve is: U[LINDXAU[i]] := U[LINDXA[i]] for i in [0..N). After the
+   * call to this function, this in-place permutation can be performed by
+   * swapping U[i] with U[LINDXAU[i]] for i in [0..N).
+   *
+   * Arguments
+   * =========
+   *
+   * N       (global input)                const int
+   *         On entry, N specifies the length of the arrays LINDXA and
+   *         LINDXAU. N should be at least zero.
+   *
+   * LINDXA  (global input/output)         int *
+   *         On entry, LINDXA is an array of dimension N containing the
+   *         source indexes. On exit, LINDXA contains the combined index
+   *         array.
+   *
+   * LINDXAU (global input/output)         int *
+   *         On entry, LINDXAU is an array of dimension N containing the
+   *         target indexes. On exit, LINDXAU contains the sequence of
+   *         permutations that should be applied in increasing order to
+   *         permute the underlying array U in place.
+   *
+   * IWORK   (workspace)                   int *
+   *         On entry, IWORK is a workarray of dimension N.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  int i, j, k, fndd;
+
+  /*
+   * Inverse LINDXA - combine LINDXA and LINDXAU - Initialize IWORK
+   */
+  for(i = 0; i < N; i++) { IWORK[LINDXA[i]] = i; }
+  for(i = 0; i < N; i++) {
+    LINDXA[i] = LINDXAU[IWORK[i]];
+    IWORK[i]  = i;
+  }
+
+  for(i = 0; i < N; i++) {
+    /* search LINDXA such that LINDXA[j] == i */
+    j = 0;
+    do {
+      fndd = (LINDXA[j] == i);
+      j++;
+    } while(!fndd);
+    j--;
+    /* search IWORK such that IWORK[k] == j */
+    k = 0;
+    do {
+      fndd = (IWORK[k] == j);
+      k++;
+    } while(!fndd);
+    k--;
+    /* swap IWORK[i] and IWORK[k]; LINDXAU[i] = k */
+    j          = IWORK[i];
+    IWORK[i]   = IWORK[k];
+    IWORK[k]   = j;
+    LINDXAU[i] = k;
+  }
}
diff --git a/src/pgesv/HPL_pipid.cpp b/src/pgesv/HPL_pipid.cpp
new file mode 100644
index 0000000..1d1cfff
--- /dev/null
+++ b/src/pgesv/HPL_pipid.cpp
@@ -0,0 +1,164 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_pipid(HPL_T_panel* PANEL, int* K, int* IPID) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pipid computes an array IPID that contains the source and final
+   * destination of matrix rows resulting from the application of N
+   * interchanges as computed by the LU factorization with row partial
+   * pivoting. The array IPID is such that the row of global index IPID(i)
+   * should be mapped onto the row of global index IPID(i+1). Note that we
+   * cannot really know the length of IPID a priori. However, we know that
+   * this array is at least 2*N long, since there are N rows to swap and
+   * broadcast. The length of this array must be smaller than or equal to
+   * 4*N, since every row is swapped with at most a single distinct remote
+   * row. The algorithm constructing IPID goes as follows: Let IA be the
+   * global index of the first row to be swapped.
+   *
+   * For every row src IA + i with i in [0..N) to be swapped with row dst
+   * such that dst is given by DPIV[i]:
+   *
+   * Is row src the destination of a previous row of the current block,
+   * that is, is there k odd such that IPID(k) is equal to src ?
+   *    Yes: update this destination with dst. For example, if the
+   * pivot array is (0,2)(1,1)(2,5) ... , then when we swap rows 2 and 5,
+   * we swap in fact row 0 and 5, i.e., row 0 goes to 5 and not 2 as it
+   * was thought so far ...
+   *    No : add the pair (src,dst) at the end of IPID; row src has not
+   * been moved yet.
+   *
+   * Is row dst different from src the destination of a previous row of
+   * the current block, i.e., is there k odd such that IPID(k) is equal to
+   * dst ?
+   *    Yes: update IPID(k) with src. For example, if the pivot array
+   * is (0,5)(1,1)(2,5) ... , then when we swap rows 2 and 5, we swap in
+   * fact row 2 and 0, i.e., row 0 goes to 2 and not 5 as it was thought
+   * so far ...
+ * No : add the pair (dst,src) at the end of IPID; row dst has not + * been moved yet. + * + * Note that when src is equal to dst, the pair (dst,src) should not be + * added to IPID in order to avoid duplicated entries in this array. + * During the construction of the array IPID, we make sure that the + * first N entries are such that IPID(k) with k odd is equal to IA+k/2. + * For k in [0..K/2), the row of global index IPID(2*k) should be + * mapped onto the row of global index IPID(2*k+1). + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * K (global output) int * + * On exit, K specifies the number of entries in IPID. K is at + * least 2*N, and at most 4*N. + * + * IPID (global output) int * + * On entry, IPID is an array of length 4*N. On exit, the first + * K entries of that array contain the src and final destination + * resulting from the application of the N interchanges as + * specified by DPIV. The pairs (src,dst) are contiguously + * stored and sorted so that IPID(2*i+1) is equal to IA+i with i + * in [0..N) + * + * --------------------------------------------------------------------- + */ + + int dst, fndd, fnds, ia, i, j, jb, lst, off, src; + int* ipiv; + + ipiv = PANEL->ipiv; + jb = PANEL->jb; + src = ia = PANEL->ia; + dst = (int)(ipiv[0]); + IPID[0] = dst; + IPID[1] = src; + *K = 2; + if(src != dst) { + IPID[2] = src; + IPID[3] = dst; + *K += 2; + } + + for(i = 1; i < jb; i++) { + fnds = 0; + j = 1; + + if((src = ia + i) == (dst = (int)(ipiv[i]))) { + do { + if(src == IPID[j]) { + fnds = j; + } else { + j += 2; + } + } while(!(fnds) && (j < *K)); + if(!fnds) { + lst = *K; + off = 2; + IPID[lst] = src; + } else { + lst = fnds - 1; + off = 0; + } + IPID[lst + 1] = dst; + } else { + fndd = 0; + do { + if(src == IPID[j]) { + fnds = j; + } else if(dst == IPID[j]) { + fndd = j; + } + j += 2; + } while((!(fnds) || !(fndd)) && (j < *K)); + if(!fnds) { + IPID[*K] = src; + IPID[*K + 1] = dst; + off = 2; + } else { + IPID[fnds] = dst; + off = 0; + } + if(!fndd) { + lst = *K + off; + IPID[lst] = dst; + off += 2; + } else { + lst = fndd - 1; + } + IPID[lst + 1] = src; + } + /* + * Enforce IPID(1,i) equal to src = ia + i + */ + if(lst != (j = (i << 1))) { + src = IPID[j]; + IPID[j] = IPID[lst]; + IPID[lst] = src; + dst = IPID[j + 1]; + IPID[j + 1] = IPID[lst + 1]; + IPID[lst + 1] = dst; + } + *K += off; + } +} diff --git a/src/pgesv/HPL_piplen.cpp b/src/pgesv/HPL_piplen.cpp new file mode 100644 index 0000000..d11ea9e --- /dev/null +++ b/src/pgesv/HPL_piplen.cpp @@ -0,0 +1,58 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_piplen(HPL_T_panel* PANEL, + const int K, + const int* IPID, + int* IPLEN, + int* IWORK) { + + const int nprow = PANEL->grid->nprow; + const int jb = PANEL->jb; + const int nb = PANEL->nb; + const int ia = PANEL->ia; + const int icurrow = PANEL->prow; + + int* iwork = IWORK + jb; + + /* + * Compute IPLEN + */ + for(int i = 0; i <= nprow; i++) IPLEN[i] = 0; + + /* + * IPLEN[i] is the number of rows of A in the processes before + * process i, with the convention that IPLEN[nprow] is the total + * number of rows. + * In other words, IPLEN[i+1] - IPLEN[i] is the local number of + * rows of A that should be moved for each process. + */ + for(int i = 0; i < K; i += 2) { + const int src = IPID[i]; + int srcrow; + Mindxg2p(src, nb, nb, srcrow, 0, nprow); + if(srcrow == icurrow) { + const int dst = IPID[i + 1]; + int dstrow; + Mindxg2p(dst, nb, nb, dstrow, 0, nprow); + if((dstrow != srcrow) || (dst - ia < jb)) IPLEN[dstrow + 1]++; + } + } + + for(int i = 1; i <= nprow; i++) { IPLEN[i] += IPLEN[i - 1]; } +} diff --git a/src/pgesv/HPL_plindx.cpp b/src/pgesv/HPL_plindx.cpp new file mode 100644 index 0000000..8344008 --- /dev/null +++ b/src/pgesv/HPL_plindx.cpp @@ -0,0 +1,238 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_plindx(HPL_T_panel* PANEL, + const int K, + const int* IPID, + int* IPA, + int* LINDXU, + int* LINDXAU, + int* LINDXA, + int* IPLEN, + int* PERMU, + int* IWORK) { + /* + * Purpose + * ======= + * + * HPL_plindx computes three local arrays LINDXU, LINDXA, and LINDXAU + * containing the local source and final destination position resulting + * from the application of row interchanges. In addition, this function + * computes the array IPLEN that contains the mapping information for the + * spreading phase. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * K (global input) const int + * On entry, K specifies the number of entries in IPID. K is at + * least 2*N, and at most 4*N. + * + * IPID (global input) const int * + * On entry, IPID is an array of length K. The first K entries + * of that array contain the src and final destination resulting + * from the application of the interchanges. + * + * IPA (global output) int * + * On exit, IPA specifies the number of rows that the current + * process row has that should be swapped with local rows of A. + * + * LINDXU (global output) int * + * On entry, LINDXU is an array of dimension N. On exit, this + * array contains the local indexes of the rows of A I have that + * should be copied into U. + * + * LINDXAU (global output) int * + * On entry, LINDXAU is an array of dimension N. On exit, this + * array contains the local source indexes of the rows of A I + * have that should be swapped locally. 
+ *
+ * LINDXA  (global output)               int *
+ *         On entry, LINDXA is an array of dimension N. On exit, this
+ *         array contains the local destination indexes of the rows
+ *         of A I have that should be swapped locally.
+ *
+ * IPLEN   (global output)               int *
+ *         On entry, IPLEN is an array of dimension NPROW + 1. On exit,
+ *         this array is such that IPLEN[i] is the number of rows of A
+ *         in the processes before process i, with the convention that
+ *         IPLEN[nprow] is the total number of rows of the panel. In
+ *         other words, IPLEN[i+1] - IPLEN[i] is the local number of
+ *         rows of A that should be moved to process i. In particular,
+ *         the number of rows of the source process row is given by
+ *         IPLEN[1] - IPLEN[0].
+ *
+ * PERMU   (global output)               int *
+ *         On entry, PERMU is an array of dimension JB. On exit, PERMU
+ *         contains a sequence of permutations that should be applied
+ *         in increasing order to permute in place the row panel U.
+ *
+ * IWORK   (workspace)                   int *
+ *         On entry, IWORK is a workarray of dimension 2*JB.
+ *
+ * ---------------------------------------------------------------------
+ */
+  const int myrow   = PANEL->grid->myrow;
+  const int nprow   = PANEL->grid->nprow;
+  const int jb      = PANEL->jb;
+  const int nb      = PANEL->nb;
+  const int ia      = PANEL->ia;
+  const int iroff   = PANEL->ii;
+  const int icurrow = PANEL->prow;
+
+  int* iwork = IWORK + jb;
+
+  /*
+   * Compute IPLEN
+   */
+  HPL_piplen(PANEL, K, IPID, IPLEN, IWORK);
+
+  /*
+   * Compute the local arrays LINDXU, LINDXA, and LINDXAU containing the
+   * local source and final destination positions resulting from the
+   * application of N interchanges. Compute LINDXU, LINDXA, and LINDXAU
+   * in icurrow, LINDXU elsewhere, and PERMU in every process.
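+   *
+   * As a purely illustrative example: if global row dst = ia + 3 of the
+   * panel ends up in U and its local row index in this process is il,
+   * the loops below record PERMU[ipU] = 3 (its final column inside U),
+   * iwork[ipU] = IPLEN[dstrow] (its current position in the allgathered
+   * U), and LINDXU[IPLEN[dstrow]] = il - iroff (where to find it in A).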
+   */
+  if(myrow == icurrow) {
+    // for all rows to be swapped
+    int ip = 0, ipU = 0;
+    for(int i = 0; i < K; i += 2) {
+      const int src = IPID[i];
+      int       srcrow;
+      Mindxg2p(src, nb, nb, srcrow, 0, nprow);
+
+      if(srcrow == icurrow) { // if I own the src row
+        const int dst = IPID[i + 1];
+        int       dstrow;
+        Mindxg2p(dst, nb, nb, dstrow, 0, nprow);
+
+        int il;
+        Mindxg2l(il, src, nb, nb, myrow, 0, nprow);
+
+        if((dstrow == icurrow) && (dst - ia < jb)) {
+          // if I own the dst and it's in U
+
+          PERMU[ipU] = dst - ia;      // row index in U
+          iwork[ipU] = IPLEN[dstrow]; // Index in AllGathered U
+          ipU++;
+
+          LINDXU[IPLEN[dstrow]] = il - iroff; // Index in AllGathered U
+          IPLEN[dstrow]++;
+        } else if(dstrow != icurrow) {
+          // else if I don't own the dst
+
+          // Find the IPID pair with dst as the source
+          int j = 0;
+          int fndd;
+          do {
+            fndd = (dst == IPID[j]);
+            j += 2;
+          } while(!fndd && (j < K));
+          // This pair must have dst being sent to a position in U
+
+          PERMU[ipU] = IPID[j - 1] - ia; // row index in U
+          iwork[ipU] = IPLEN[dstrow];    // Index in AllGathered U
+          ipU++;
+
+          LINDXU[IPLEN[dstrow]] = il - iroff; // Index in AllGathered U
+          IPLEN[dstrow]++;
+        } else if((dstrow == icurrow) && (dst - ia >= jb)) {
+          // else I own the dst, but it's not in U
+
+          LINDXAU[ip] = il - iroff; // the src row must be in the first jb rows
+
+          int il;
+          Mindxg2l(il, dst, nb, nb, myrow, 0, nprow);
+          LINDXA[ip] = il - iroff; // the dst is somewhere below
+          ip++;
+        }
+      }
+    }
+    *IPA = ip;
+  } else {
+    // for all rows to be swapped
+    int ip = 0, ipU = 0;
+    for(int i = 0; i < K; i += 2) {
+      const int src = IPID[i];
+      int       srcrow;
+      Mindxg2p(src, nb, nb, srcrow, 0, nprow);
+      const int dst = IPID[i + 1];
+      int       dstrow;
+      Mindxg2p(dst, nb, nb, dstrow, 0, nprow);
+      /*
+       * LINDXU[i] is the local index of the row of A that belongs in U
+       */
+      if(myrow == dstrow) { // if I own the dst row
+        int il;
+        Mindxg2l(il, dst, nb, nb, myrow, 0, nprow);
+        LINDXU[ip] = il - iroff; // Local A index of incoming row
+        ip++;
+      }
+      /*
+       * iwork[i] is the local (current) position index in U
+       * PERMU[i] is the local (final) destination index in U
+       */
+
+      // if the src row is coming from the current row rank
+      if(srcrow == icurrow) {
+
+        if((dstrow == icurrow) && (dst - ia < jb)) {
+          // If the row is going into U
+          PERMU[ipU] = dst - ia;      // row index in U
+          iwork[ipU] = IPLEN[dstrow]; // Index in AllGathered U
+          IPLEN[dstrow]++;
+          ipU++;
+        } else if(dstrow != icurrow) {
+          // If the row is going to another rank
+          // (So src must be in U)
+
+          // Find the IPID pair with dst as the source
+          int j = 0;
+          int fndd;
+          do {
+            fndd = (dst == IPID[j]);
+            j += 2;
+          } while(!fndd && (j < K));
+          // This pair must have dst being sent to a position in U
+
+          PERMU[ipU] = IPID[j - 1] - ia; // row index in U
+          iwork[ipU] = IPLEN[dstrow];    // Index in AllGathered U
+          IPLEN[dstrow]++;
+          ipU++;
+        }
+      }
+    }
+    *IPA = 0;
+  }
+  /*
+   * Simplify iwork and PERMU, and return in PERMU the sequence of
+   * permutations that needs to be applied to U after it has been
+   * broadcast.
+   */
+  HPL_perm(jb, iwork, PERMU, IWORK);
+  /*
+   * Reset IPLEN to its correct value
+   */
+  for(int i = nprow; i > 0; i--) IPLEN[i] = IPLEN[i - 1];
+  IPLEN[0] = 0;
+}
diff --git a/src/timer/HPL_ptimer.cpp b/src/timer/HPL_ptimer.cpp
new file mode 100644
index 0000000..53b82ec
--- /dev/null
+++ b/src/timer/HPL_ptimer.cpp
@@ -0,0 +1,262 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+/*
+ * ---------------------------------------------------------------------
+ * Static variables
+ * ---------------------------------------------------------------------
+ */
+static int    HPL_ptimer_disabled;
+static double HPL_ptimer_cpusec[HPL_NPTIMER], HPL_ptimer_cpustart[HPL_NPTIMER];
+static double HPL_ptimer_wallsec[HPL_NPTIMER],
+    HPL_ptimer_wallstart[HPL_NPTIMER];
+static double HPL_ptimer_wallstep[HPL_NPTIMER];
+/*
+ * ---------------------------------------------------------------------
+ * User callable functions
+ * ---------------------------------------------------------------------
+ */
+void HPL_ptimer_boot() {
+  /*
+   * HPL_ptimer_boot (re)sets all timers to 0, and enables HPL_ptimer.
+   */
+
+  int i;
+
+  HPL_ptimer_disabled = 0;
+
+  for(i = 0; i < HPL_NPTIMER; i++) {
+    HPL_ptimer_cpusec[i] = HPL_ptimer_wallsec[i] = HPL_rzero;
+    HPL_ptimer_wallstep[i] = HPL_rzero;
+    HPL_ptimer_cpustart[i] = HPL_ptimer_wallstart[i] = HPL_PTIMER_STARTFLAG;
+  }
+}
+
+void HPL_ptimer(const int I) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_ptimer provides "stopwatch" functionality: a cpu/wall timer in
+   * seconds. Up to 64 separate timers can be functioning at once. The
+   * first call starts the timer, and the second stops it. This routine
+   * can be disabled by calling HPL_ptimer_disable(), so that calls to
+   * the timer are ignored. This feature can be used to make sure certain
+   * sections of code do not affect timings, even if they call routines
+   * which have HPL_ptimer calls in them. HPL_ptimer_enable() will enable
+   * the timer functionality. One can retrieve the current value of a
+   * timer by calling
+   *
+   *    t0 = HPL_ptimer_inquire( HPL_WALL_PTIME | HPL_CPU_PTIME, I )
+   *
+   * where I is the timer index in [0..64). To initialize the timer
+   * functionality, one must have called HPL_ptimer_boot() prior to any of
+   * the functions mentioned above.
+   *
+   * Arguments
+   * =========
+   *
+   * I       (global input)                const int
+   *         On entry, I specifies the timer to stop/start.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  if(HPL_ptimer_disabled) return;
+  /*
+   * If timer has not been started, start it. Otherwise, stop it and add
+   * interval to count
+   */
+  if(HPL_ptimer_wallstart[I] == HPL_PTIMER_STARTFLAG) {
+    HPL_ptimer_wallstart[I] = HPL_ptimer_walltime();
+    HPL_ptimer_cpustart[I]  = HPL_ptimer_cputime();
+  } else {
+    HPL_ptimer_cpusec[I] += HPL_ptimer_cputime() - HPL_ptimer_cpustart[I];
+    const double walltime = HPL_ptimer_walltime() - HPL_ptimer_wallstart[I];
+    HPL_ptimer_wallstep[I] += walltime;
+    HPL_ptimer_wallsec[I] += walltime;
+    HPL_ptimer_wallstart[I] = HPL_PTIMER_STARTFLAG;
+  }
+}
+
+void HPL_ptimer_enable(void) {
+  /*
+   * HPL_ptimer_enable sets it so calls to HPL_ptimer are not ignored.
+   */
+
+  HPL_ptimer_disabled = 0;
+  return;
+}
+
+void HPL_ptimer_disable(void) {
+  /*
+   * HPL_ptimer_disable sets it so calls to HPL_ptimer are ignored.
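+   *
+   * A typical (illustrative) use is to exclude a section of code from
+   * the timings, e.g.:
+   *
+   *    HPL_ptimer_disable();
+   *    untimed_setup();     (a hypothetical, untimed routine)
+   *    HPL_ptimer_enable();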
+   */
+
+  HPL_ptimer_disabled = 1;
+  return;
+}
+
+void HPL_ptimer_stepReset(const int N, const int IBEG) {
+  for(int i = 0; i < N; i++) { HPL_ptimer_wallstep[IBEG + i] = HPL_rzero; }
+}
+
+double HPL_ptimer_getStep(const int I) {
+
+  double time;
+
+  /*
+   * If wall-clock time is not available on this machine, return
+   * HPL_PTIMER_ERROR
+   */
+  if(HPL_ptimer_walltime() == HPL_PTIMER_ERROR)
+    time = HPL_PTIMER_ERROR;
+  else
+    time = HPL_ptimer_wallstep[I];
+
+  return (time);
+}
+
+double HPL_ptimer_inquire(const HPL_T_PTIME TMTYPE, const int I) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_ptimer_inquire returns wall- or cpu- time that has accumulated in
+   * timer I.
+   *
+   * Arguments
+   * =========
+   *
+   * TMTYPE  (global input)                const HPL_T_PTIME
+   *         On entry, TMTYPE specifies what time will be returned as fol-
+   *         lows
+   *            = HPL_WALL_PTIME : wall clock time is returned,
+   *            = HPL_CPU_PTIME  : CPU time is returned (default).
+   *
+   * I       (global input)                const int
+   *         On entry, I specifies the timer to return.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  double time;
+
+  /*
+   * If wall- or cpu-time are not available on this machine, return
+   * HPL_PTIMER_ERROR
+   */
+  if(TMTYPE == HPL_WALL_PTIME) {
+    if(HPL_ptimer_walltime() == HPL_PTIMER_ERROR)
+      time = HPL_PTIMER_ERROR;
+    else
+      time = HPL_ptimer_wallsec[I];
+  } else {
+    if(HPL_ptimer_cputime() == HPL_PTIMER_ERROR)
+      time = HPL_PTIMER_ERROR;
+    else
+      time = HPL_ptimer_cpusec[I];
+  }
+  return (time);
+}
+
+void HPL_ptimer_combine(MPI_Comm COMM,
+                        const HPL_T_PTIME_OP OPE,
+                        const HPL_T_PTIME TMTYPE,
+                        const int N,
+                        const int IBEG,
+                        double* TIMES) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_ptimer_combine combines the timing information stored on a scope
+   * of processes into the user TIMES array.
+   *
+   * Arguments
+   * =========
+   *
+   * COMM    (global/local input)          MPI_Comm
+   *         The MPI communicator identifying the process collection on
+   *         which the timings are taken.
+   *
+   * OPE     (global input)                const HPL_T_PTIME_OP
+   *         On entry, OPE specifies what combine operation should be done
+   *         as follows:
+   *            = HPL_AMAX_PTIME get max. time on any process (default),
+   *            = HPL_AMIN_PTIME get min. time on any process,
+   *            = HPL_SUM_PTIME  get sum of times across processes.
+   *
+   * TMTYPE  (global input)                const HPL_T_PTIME
+   *         On entry, TMTYPE specifies what time will be returned as fol-
+   *         lows
+   *            = HPL_WALL_PTIME : wall clock time is returned,
+   *            = HPL_CPU_PTIME  : CPU time is returned (default).
+   *
+   * N       (global input)                const int
+   *         On entry, N specifies the number of timers to combine.
+   *
+   * IBEG    (global input)                const int
+   *         On entry, IBEG specifies the first timer to be combined.
+   *
+   * TIMES   (global output)               double *
+   *         On entry, TIMES is an array of dimension at least N. On exit,
+   *         this array contains the requested timing information.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  int i, tmpdis;
+
+  tmpdis = HPL_ptimer_disabled;
+  HPL_ptimer_disabled = 1;
+  /*
+   * Timer has been disabled for combine operation - copy timing
+   * information into user times array. If wall- or cpu-time are not
+   * available on this machine, fill in times with HPL_PTIMER_ERROR flag
+   * and return.
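+   *
+   * For example (values illustrative): with OPE = HPL_AMAX_PTIME, N = 2
+   * and IBEG = 0, TIMES[0] and TIMES[1] receive the maximum accumulated
+   * wall (or cpu) time of timers 0 and 1 over all processes of COMM.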
+ */ + if(TMTYPE == HPL_WALL_PTIME) { + if(HPL_ptimer_walltime() == HPL_PTIMER_ERROR) { + for(i = 0; i < N; i++) TIMES[i] = HPL_PTIMER_ERROR; + return; + } else { + for(i = 0; i < N; i++) TIMES[i] = HPL_ptimer_wallsec[IBEG + i]; + } + } else { + if(HPL_ptimer_cputime() == HPL_PTIMER_ERROR) { + for(i = 0; i < N; i++) TIMES[i] = HPL_PTIMER_ERROR; + return; + } else { + for(i = 0; i < N; i++) TIMES[i] = HPL_ptimer_cpusec[IBEG + i]; + } + } + /* + * Combine all nodes information, restore HPL_ptimer_disabled, and return + */ + for(i = 0; i < N; i++) TIMES[i] = Mmax(HPL_rzero, TIMES[i]); + + if(OPE == HPL_AMAX_PTIME) + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_MAX, COMM); + else if(OPE == HPL_AMIN_PTIME) + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_MIN, COMM); + else if(OPE == HPL_SUM_PTIME) + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_SUM, COMM); + else + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_MAX, COMM); + + HPL_ptimer_disabled = tmpdis; +} diff --git a/src/timer/HPL_ptimer_cputime.cpp b/src/timer/HPL_ptimer_cputime.cpp new file mode 100644 index 0000000..a3f1577 --- /dev/null +++ b/src/timer/HPL_ptimer_cputime.cpp @@ -0,0 +1,45 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +/* + * Purpose + * ======= + * + * HPL_ptimer_cputime returns the cpu time. + * The clock() function is used to return an approximation of processor + * time used by the program. The value returned is the CPU time used so + * far as a clock_t; to get the number of seconds used, the result is + * divided by CLOCKS_PER_SEC. This function is part of the ANSI/ISO C + * standard library. + * + * --------------------------------------------------------------------- + */ + +#include + +double HPL_ptimer_cputime(void) { + static double cps = CLOCKS_PER_SEC; + double d; + clock_t t1; + static clock_t t0 = 0; + + if(t0 == 0) t0 = clock(); + t1 = clock() - t0; + d = (double)(t1) / cps; + return (d); +} diff --git a/src/timer/HPL_ptimer_walltime.cpp b/src/timer/HPL_ptimer_walltime.cpp new file mode 100644 index 0000000..de35681 --- /dev/null +++ b/src/timer/HPL_ptimer_walltime.cpp @@ -0,0 +1,29 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +/* + * Purpose + * ======= + * + * HPL_ptimer_walltime returns the elapsed (wall-clock) time. + * + * + * --------------------------------------------------------------------- + */ + +double HPL_ptimer_walltime(void) { return (MPI_Wtime()); }
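
For reference: the toggle-and-accumulate convention that HPL_ptimer implements above (a first call on index I starts the stopwatch, the second call stops it and adds the elapsed interval to the running total) can be illustrated with the following minimal standalone sketch. It is not part of the patch, and the names toy_timer, wallsec, and START_FLAG are illustrative only.

    #include <mpi.h>
    #include <cstdio>

    // One timer in the HPL_ptimer style: a sentinel start value marks a
    // stopped timer; the first call records the start time, the second
    // call accumulates the elapsed interval and re-arms the sentinel.
    static const double START_FLAG = -1.0;
    static double wallsec   = 0.0;
    static double wallstart = START_FLAG;

    void toy_timer(void) {
      if(wallstart == START_FLAG) {
        wallstart = MPI_Wtime();            // first call: start
      } else {
        wallsec += MPI_Wtime() - wallstart; // second call: stop and accumulate
        wallstart = START_FLAG;
      }
    }

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      toy_timer(); // start
      /* ... timed section ... */
      toy_timer(); // stop
      std::printf("accumulated wall time: %f s\n", wallsec);
      MPI_Finalize();
      return 0;
    }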